In [1]:
#USING XGBOOST MODEL

import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# USING FUNCTION TO HANDLE_OUTLIERS
def handle_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5*IQR
    upper_bound = Q3 + 1.5*IQR
    df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return df

# DATASET
df = pd.read_csv(r"C:\Users\nh013\Desktop\TELECOM CHURN DATASET\telco.csv")

# REPLACE EMTY STRING WITH NUN AND DROP ROWS WITH MISSING VALUES
df = df.replace(' ', pd.NA)
df.dropna(inplace=True)

#DROP UNNEED COLUMN
df = df.drop(['customerID'], axis=1)

# SELECT NUMERICAL COLUMN
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

# # IMPUTE MISSING VALUES USING KNN IMPUTATION
imputer = KNNImputer(n_neighbors=5)
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

# CONVERT CATEGORICAL VALUE TO NUMERICAL
categorical_cols = ['Churn', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
                    'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
                    'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']

label_encoder = LabelEncoder()
for col in categorical_cols:
    df[col] = label_encoder.fit_transform(df[col])

# HANDLE OUTLIER
df = handle_outliers(df, 'Churn')

# REMOVE DUPLICATES ROWS
df.drop_duplicates(inplace=True)

# NORMALIZE AND SCALE ALL NUMERICAL COLUMN
scaler = MinMaxScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# SPLIT DATA 
X = df.drop('Churn', axis=1)
y = df['Churn']

# SPLIT DATA INTO TRAINING AND TEST SET
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# XGBOOST MODEL
model = XGBClassifier()
model.fit(X_train, y_train)

# PRDEICTION
y_pred = model.predict(X_test)

# MODEL ACCURACY
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.7988587731811697


In [2]:
# XGBOOST MODEL PREDICT AND COMPUTE PERCENTAGE NON CHURN AND CHURN  FOR EACH SERVICE CATEGORY..

import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# USING FUNCTION TO HANDLE_OUTLIERS
def handle_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5*IQR
    upper_bound = Q3 + 1.5*IQR
    df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return df

# DATASET
df = pd.read_csv(r"C:\Users\nh013\Desktop\TELECOM CHURN DATASET\telco.csv")

# REPLACE EMTY STRING WITH NUN AND DROP ROWS WITH MISSING VALUES
df = df.replace(' ', pd.NA)
df.dropna(inplace=True)

#DROP UNNEED COLUMN
df = df.drop(['customerID'], axis=1)

# SELECT NUMERICAL COLUMN
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

# # IMPUTE MISSING VALUES USING KNN IMPUTATION
imputer = KNNImputer(n_neighbors=5)
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

# CONVERT CATEGORICAL VALUE TO NUMERICAL
categorical_cols = ['Churn', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
                    'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
                    'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']

label_encoder = LabelEncoder()
for col in categorical_cols:
    df[col] = label_encoder.fit_transform(df[col])

# HANDLE OUTLIER
df = handle_outliers(df, 'Churn')

# REMOVE DUPLICATES ROWS
df.drop_duplicates(inplace=True)

# NORMALIZE AND SCALE ALL NUMERICAL COLUMN
scaler = MinMaxScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# SPLIT DATA 
X = df.drop('Churn', axis=1)
y = df['Churn']

# SPLIT DATA INTO TRAINING AND TEST SET
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# XGBOOST MODEL
model = XGBClassifier()
model.fit(X_train, y_train)

# PRDEICTION
y_pred = model.predict(X_test)

# MODEL ACCURACY
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


# COMPUTE PERCENTAGE NON CHURN AND CHURN  FOR EACH SERVICE CATEGORY
service_categories = ['PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
                      'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']

for category in service_categories:
    churn_percent = df[df['Churn'] == 1][category].sum() / df[category].sum() * 100
    non_churn_percent = df[df['Churn'] == 0][category].sum() / df[category].sum() * 100
    print(f"{category}:")
    print("Churn Percentage:", churn_percent)
    print("Non-Churn Percentage:", non_churn_percent)
    print()

Accuracy: 0.7988587731811697
PhoneService:
Churn Percentage: 26.65086887835703
Non-Churn Percentage: 73.34913112164297

MultipleLines:
Churn Percentage: 28.273359540368915
Non-Churn Percentage: 71.72664045963108

InternetService:
Churn Percentage: 24.72959685349066
Non-Churn Percentage: 75.27040314650935

OnlineSecurity:
Churn Percentage: 12.626445086705202
Non-Churn Percentage: 87.3735549132948

OnlineBackup:
Churn Percentage: 18.17180616740088
Non-Churn Percentage: 81.82819383259911

DeviceProtection:
Churn Percentage: 18.90570797855566
Non-Churn Percentage: 81.09429202144433

TechSupport:
Churn Percentage: 13.050483351235231
Non-Churn Percentage: 86.94951664876477

StreamingTV:
Churn Percentage: 25.130208333333332
Non-Churn Percentage: 74.86979166666666

StreamingMovies:
Churn Percentage: 25.043053960964407
Non-Churn Percentage: 74.95694603903559



In [3]:
#USING RNNs MODEL

import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, LSTM
from sklearn.metrics import accuracy_score

# USING FUNCTION TO HANDLE OUTLIERS
def handle_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5*IQR
    upper_bound = Q3 + 1.5*IQR
    df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return df

# DATASET
df = pd.read_csv(r"C:\Users\nh013\Desktop\TELECOM CHURN DATASET\telco.csv")

# REPLACE EMPTY STRING WITH NaN AND DROP ROWS WITH MISSING VALUES
df = df.replace(' ', pd.NA)
df.dropna(inplace=True)

# DROP COLUMN
df = df.drop('customerID', axis=1)

# SELECT NUMERICAL COLUMN
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

# IMPUTE MISSING VALUES USING KNN IMPUTATION
imputer = KNNImputer(n_neighbors=5)
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

# CONVERT CATEGORICAL VALUES TO NUMERICAL
categorical_cols = ['Churn', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
                    'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
                    'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']

label_encoder = LabelEncoder()
for col in categorical_cols:
    df[col] = label_encoder.fit_transform(df[col])

# HANDLE OUTLIER
df = handle_outliers(df, 'Churn')

# REMOVE DUPLICATES ROWS
df.drop_duplicates(inplace=True)

# NORMALIZE AND SCALE ALL NUMERICAL COLUMNS
scaler = MinMaxScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# FEATURE AND TAGET VARIABLE
X = df.drop('Churn', axis=1)
y = df['Churn']

# SPLIT DATA INTO TRAINING AND TESTING SET
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Reshape input data for LSTM
X_train = X_train.values.reshape(X_train.shape[0], 1, X_train.shape[1])
X_test = X_test.values.reshape(X_test.shape[0], 1, X_test.shape[1])

#RNNs MODEL
model = Sequential()
model.add(LSTM(64, input_shape=(1, X_train.shape[2])))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=100, batch_size=32)

# PREDICTION
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

#MODEL ACCURACY
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [6]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, LSTM
from sklearn.metrics import accuracy_score

# USING FUNCTION TO HANDLE OUTLIERS
def handle_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5*IQR
    upper_bound = Q3 + 1.5*IQR
    df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return df

# DATASET
df = pd.read_csv(r"C:\Users\nh013\Desktop\TELECOM CHURN DATASET\telco.csv")

# REPLACE EMPTY STRING WITH NaN AND DROP ROWS WITH MISSING VALUES
df = df.replace(' ', pd.NA)
df.dropna(inplace=True)

# DROP COLUMN
df = df.drop('customerID', axis=1)

# SELECT NUMERICAL COLUMN
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

# IMPUTE MISSING VALUES USING KNN IMPUTATION
imputer = KNNImputer(n_neighbors=5)
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

# CONVERT CATEGORICAL VALUES TO NUMERICAL
categorical_cols = ['Churn', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
                    'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
                    'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']

label_encoder = LabelEncoder()
for col in categorical_cols:
    df[col] = label_encoder.fit_transform(df[col])

# HANDLE OUTLIER
df = handle_outliers(df, 'Churn')

# REMOVE DUPLICATES ROWS
df.drop_duplicates(inplace=True)

# NORMALIZE AND SCALE ALL NUMERICAL COLUMNS
scaler = MinMaxScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# FEATURE AND TAGET VARIABLE
X = df.drop('Churn', axis=1)
y = df['Churn']

# SPLIT DATA INTO TRAINING AND TESTING SET
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Reshape input data for LSTM
X_train = X_train.values.reshape(X_train.shape[0], 1, X_train.shape[1])
X_test = X_test.values.reshape(X_test.shape[0], 1, X_test.shape[1])

#RNNs MODEL
model = Sequential()
model.add(LSTM(64, input_shape=(1, X_train.shape[2])))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=100, batch_size=32)

# PREDICTION
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

#MODEL ACCURACY
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# COMPUTE PERCENTAGE NON CHURN AND CHURN  FOR EACH SERVICE CATEGORY
categorical_cols.remove('Churn')
for col in categorical_cols:
    churn_percent = df[df['Churn'] == 1][col].value_counts(normalize=True) * 100
    non_churn_percent = df[df['Churn'] == 0][col].value_counts(normalize=True) * 100
    
    print("Category:", col)
    print("Churn Percentage:")
    print(churn_percent)
    print("Non-Churn Percentage:")
    print(non_churn_percent)
    print()


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78