In [434]:
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
import seaborn as sns
from sklearn import preprocessing 
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from matplotlib.image import imread
from sklearn.metrics import confusion_matrix,accuracy_score, f1_score, auc, plot_roc_curve,ConfusionMatrixDisplay
from sklearn.metrics import roc_curve,roc_auc_score, balanced_accuracy_score,classification_report
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder
from sklearn.metrics import plot_confusion_matrix
import pickle

# Let pandas render up to 500 columns so wide frames are not truncated.
pd.set_option('display.max_columns', 500)
# Apply seaborn's default theme first ...
sns.set()

# ... then select the 'darkgrid' axes style for every later plot.
sns.set_style('darkgrid')

In [435]:
df = pd.read_csv('Customer-Churn.csv')

In [436]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [437]:
train,test = train_test_split(df,test_size=0.2,random_state=123)

In [438]:
#A1
# Discard the shuffled row labels left by the split and renumber from 0.
train = train.reset_index(drop=True)

In [439]:
train.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,6457-GIRWB,Male,0,No,No,1,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,69.35,69.35,Yes
1,5115-GZDEL,Male,0,No,Yes,72,No,No phone service,DSL,Yes,Yes,Yes,Yes,Yes,Yes,Two year,No,Credit card (automatic),67.2,4671.7,No
2,3398-ZOUAA,Male,1,Yes,No,21,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,69.1,1474.75,Yes
3,7011-CVEUC,Male,0,Yes,No,25,Yes,No,Fiber optic,No,No,No,Yes,Yes,Yes,Month-to-month,Yes,Credit card (automatic),95.7,2338.35,No
4,3836-FZSDJ,Male,1,Yes,No,71,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Bank transfer (automatic),24.85,1901.0,No


In [440]:
print(train.shape)
print(test.shape)
print(df.shape)

(5634, 21)
(1409, 21)
(7043, 21)


In [441]:
train.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [442]:
#A2
# customerID is an identifier, not a predictor, so it is removed from the training frame.
train = train.drop(columns=['customerID'])

In [443]:
train.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [444]:
train.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Male,0,No,No,1,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,69.35,69.35,Yes
1,Male,0,No,Yes,72,No,No phone service,DSL,Yes,Yes,Yes,Yes,Yes,Yes,Two year,No,Credit card (automatic),67.2,4671.7,No
2,Male,1,Yes,No,21,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,69.1,1474.75,Yes
3,Male,0,Yes,No,25,Yes,No,Fiber optic,No,No,No,Yes,Yes,Yes,Month-to-month,Yes,Credit card (automatic),95.7,2338.35,No
4,Male,1,Yes,No,71,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Bank transfer (automatic),24.85,1901.0,No


In [445]:
train['TotalCharges'].unique()

array(['69.35', '4671.7', '1474.75', ..., '3409.1', '1258.6', '232.35'],
      dtype=object)

In [446]:
train[train['TotalCharges'] == ' ']

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
1343,Female,0,Yes,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.0,,No
2371,Male,0,Yes,Yes,0,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.75,,No
3880,Female,0,Yes,Yes,0,No,No phone service,DSL,Yes,No,Yes,Yes,Yes,No,Two year,Yes,Bank transfer (automatic),52.55,,No
4054,Male,0,No,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.25,,No
4189,Male,0,No,Yes,0,Yes,Yes,DSL,Yes,Yes,No,Yes,No,No,Two year,Yes,Bank transfer (automatic),61.9,,No
5039,Female,0,Yes,Yes,0,Yes,No,DSL,Yes,Yes,Yes,No,Yes,Yes,Two year,No,Mailed check,80.85,,No
5528,Male,0,Yes,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,19.85,,No
5629,Male,0,Yes,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,One year,Yes,Mailed check,19.7,,No


In [447]:
# Count the TotalCharges entries that are a single blank string; these are the
# values that have to become NaN before the column can be cast to float.
# (The previous loop re-assigned the column through an identity lambda and
# therefore changed nothing.)
(train['TotalCharges'] == ' ').sum()

In [448]:
#A3
# Turn the single-space placeholder in TotalCharges into a real missing value.
train = train.assign(TotalCharges=train['TotalCharges'].replace(' ', np.nan))

In [449]:
train.isnull().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        8
Churn               0
dtype: int64

In [450]:
#A4
# With the blanks replaced by NaN the column can be stored as float.
train = train.astype({'TotalCharges': float})

In [451]:
from sklearn.impute import SimpleImputer,KNNImputer

In [452]:
# Names of all object-dtype columns of the training frame, in column order.
cat = [name for name in train.columns if train[name].dtype == 'O']

In [453]:
print(cat)

['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn']


In [454]:
train.duplicated().sum()

14

In [455]:
#A5
# Keep the first occurrence of each fully duplicated row and renumber the index.
train = train.drop_duplicates(keep='first', ignore_index=True)

In [456]:
train

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Male,0,No,No,1,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,69.35,69.35,Yes
1,Male,0,No,Yes,72,No,No phone service,DSL,Yes,Yes,Yes,Yes,Yes,Yes,Two year,No,Credit card (automatic),67.20,4671.70,No
2,Male,1,Yes,No,21,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,69.10,1474.75,Yes
3,Male,0,Yes,No,25,Yes,No,Fiber optic,No,No,No,Yes,Yes,Yes,Month-to-month,Yes,Credit card (automatic),95.70,2338.35,No
4,Male,1,Yes,No,71,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Bank transfer (automatic),24.85,1901.00,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5615,Male,0,Yes,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,One year,Yes,Mailed check,19.70,,No
5616,Male,0,Yes,Yes,54,Yes,Yes,DSL,No,No,Yes,No,No,Yes,Two year,No,Credit card (automatic),63.35,3409.10,No
5617,Female,0,Yes,Yes,14,Yes,Yes,Fiber optic,No,No,Yes,No,No,Yes,Month-to-month,Yes,Electronic check,87.25,1258.60,Yes
5618,Female,1,Yes,Yes,29,No,No phone service,DSL,No,Yes,Yes,No,No,No,Month-to-month,Yes,Mailed check,35.65,1025.15,No


In [457]:
#A6
# Drop every row that still contains a missing value, then renumber the index from 0.
train = train.dropna().reset_index(drop=True)

In [458]:
# Print the number of distinct values, and the values themselves, for each
# column listed in `cat` that is still stored as object dtype.
for name in cat:
    column = train[name]
    if column.dtype != 'O':
        continue
    print(f'{name} has {column.nunique()} categories\nThey are: \n{column.unique()}\n')

gender has 2 categories
They are: 
['Male' 'Female']

Partner has 2 categories
They are: 
['No' 'Yes']

Dependents has 2 categories
They are: 
['No' 'Yes']

PhoneService has 2 categories
They are: 
['Yes' 'No']

MultipleLines has 3 categories
They are: 
['No' 'No phone service' 'Yes']

InternetService has 3 categories
They are: 
['Fiber optic' 'DSL' 'No']

OnlineSecurity has 3 categories
They are: 
['No' 'Yes' 'No internet service']

OnlineBackup has 3 categories
They are: 
['No' 'Yes' 'No internet service']

DeviceProtection has 3 categories
They are: 
['No' 'Yes' 'No internet service']

TechSupport has 3 categories
They are: 
['No' 'Yes' 'No internet service']

StreamingTV has 3 categories
They are: 
['No' 'Yes' 'No internet service']

StreamingMovies has 3 categories
They are: 
['No' 'Yes' 'No internet service']

Contract has 3 categories
They are: 
['Month-to-month' 'Two year' 'One year']

PaperlessBilling has 2 categories
They are: 
['Yes' 'No']

PaymentMethod has 4 categories
The

In [459]:
#for i in cat:
    #if train[i].dtype == 'O':
        #train[i] = pd.Categorical(train[i]).codes

In [460]:
#imp = KNNImputer(n_neighbors=3)

#train_imp = pd.DataFrame(imp.fit_transform(train),columns=train.columns)

In [461]:
#for i in cat:
#        train[i] = pd.Categorical.from_codes(train_imp[i],train[i].cat.categories)

In [462]:
col = train.columns

In [463]:
#A7
# Separate the Churn target from the predictor columns.
y_train = train['Churn']
X_train = train.drop(columns='Churn')

In [464]:
# Object-dtype predictor columns, in column order; these are the ones that get one-hot encoded.
cat1 = [name for name in X_train.columns if X_train[name].dtype == 'O']

In [465]:
#A8
# One-hot encode the object-dtype predictors. The fitted `encoder` is reused
# later to transform the test split and new data, so it is fitted here only.
encoder = OneHotEncoder(drop='first', dtype=int)
enc_data = pd.DataFrame(
    encoder.fit_transform(X_train[cat1]).toarray(),
    # get_feature_names() is deprecated; get_feature_names_out() returns the
    # same "<column>_<category>" labels from the fitted input columns.
    columns=encoder.get_feature_names_out(),
    # Share X_train's index so the join below aligns row for row.
    index=X_train.index,
)

# Replace the raw categorical columns with their encoded counterparts
# (numeric columns first, encoded columns after, as before).
X_train = X_train.drop(columns=cat1).join(enc_data)



In [466]:
col1 = X_train.columns

In [467]:
#A9
# Fit the min-max scaler on the training features and rebuild a DataFrame
# with the column labels captured in `col1`.
scaler = MinMaxScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=col1)

In [None]:
# Pairwise correlations between the scaled training features, drawn as an annotated heatmap.
cors = X_train.corr()
plt.figure(figsize=(24, 24))
sns.heatmap(cors, annot=True, cmap='YlGnBu')

<AxesSubplot:>

In [None]:
# Keep only correlations with |r| >= 0.85; every other cell becomes NaN.
mask = cors.where(cors.abs() >= 0.85)

plt.figure(figsize=(24, 24))
sns.heatmap(mask, vmin=-1, vmax=1, annot=True, fmt='0.2f')

Chi-square test for categorical variables


In [None]:
# Rank the predictors by the importance a depth-limited decision tree assigns them.
# random_state matches the seed used for the train/test split so the ranking
# is the same on every run.
dt = DecisionTreeClassifier(max_depth=10, random_state=123)
dt.fit(X_train, y_train)
predictors = col1

coef = pd.Series(dt.feature_importances_, index=predictors).sort_values(ascending=False)
coef.to_frame('importance')

In [None]:
def prepare(df):
    """Clean a labelled churn frame and transform it with the fitted encoder and scaler.

    Repeats the cleaning applied to the training split (drop ``customerID``,
    blank ``TotalCharges`` -> NaN -> float, drop duplicates, drop rows with
    missing values) and then applies the notebook-level ``encoder`` and
    ``scaler``. Both are only used via ``transform``; nothing is re-fitted.

    The input frame is not modified, so the function can be called repeatedly
    on the same object.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``customerID``, ``TotalCharges``, ``Churn`` and the
        predictor columns.

    Returns
    -------
    tuple
        ``(X_test, y_test)``: the scaled feature frame and the ``Churn`` column.
    """
    # drop() returns a new frame, so every later step works on a private copy
    # and the caller's frame keeps its columns and index.
    data = df.drop(columns=['customerID']).reset_index(drop=True)
    data['TotalCharges'] = data['TotalCharges'].replace(' ', np.nan).astype(float)
    data = data.drop_duplicates(keep='first', ignore_index=True)
    data = data.dropna().reset_index(drop=True)

    X_test = data.drop(columns=['Churn'])
    y_test = data['Churn']

    # Object-dtype columns are the ones the encoder transforms.
    cat1 = [name for name in X_test.columns if X_test[name].dtype == 'O']
    enc_data = pd.DataFrame(
        encoder.transform(X_test[cat1]).toarray(),
        # get_feature_names() is deprecated; use the replacement API.
        columns=encoder.get_feature_names_out(),
        index=X_test.index,
    )
    X_test = X_test.drop(columns=cat1).join(enc_data)

    col1 = X_test.columns
    X_test = pd.DataFrame(scaler.transform(X_test), columns=col1)
    return X_test, y_test
    

In [None]:
X_test, y_test = prepare(test)

In [None]:
def model_to_use(input_ml_algo):
    """Return a new, unfitted classifier for the given short key.

    Parameters
    ----------
    input_ml_algo : str
        One of ``'DT'``, ``'RF'``, ``'XGBC'``, ``'LGBMC'``, ``'LR'`` or ``'KNN'``.

    Returns
    -------
    A classifier instance created with its default hyper-parameters.

    Raises
    ------
    ValueError
        If ``input_ml_algo`` is not one of the supported keys.
    """
    if input_ml_algo == 'DT':
        model = DecisionTreeClassifier()
    elif input_ml_algo == 'RF':
        model = RandomForestClassifier()
    elif input_ml_algo == 'XGBC':
        model = XGBClassifier()
    elif input_ml_algo == 'LGBMC':
        model = LGBMClassifier()
    elif input_ml_algo == 'LR':
        model = LogisticRegression()
    elif input_ml_algo == 'KNN':
        model = KNeighborsClassifier()
    else:
        # Previously an unknown key reached `return model` with `model` unbound.
        raise ValueError(
            f"Unknown model key {input_ml_algo!r}; "
            "expected one of 'DT', 'RF', 'XGBC', 'LGBMC', 'LR', 'KNN'"
        )
    return model

In [None]:
def performance(model,X_train,y_train,X_test, y_test):
    """Print and plot evaluation results of a fitted classifier on the test data.

    Shows a confusion-matrix plot, prints accuracy, ROC AUC, the confusion
    matrix and the classification report, then draws the ROC curve.

    Parameters
    ----------
    model : fitted classifier exposing ``predict`` and ``predict_proba``.
    X_train, y_train : accepted for call compatibility; not used in the body.
    X_test, y_test : features and true labels the model is evaluated on.

    Returns
    -------
    None
    """
    # plot_roc_curve was removed from scikit-learn; RocCurveDisplay.from_estimator
    # is its replacement and matches the ConfusionMatrixDisplay.from_estimator
    # call already used below.
    from sklearn.metrics import RocCurveDisplay

    y_pred = model.predict(X_test)

    # Probability of the second class (column index 1) for every test row.
    y_pred_prob = model.predict_proba(X_test)[:, 1]

    ConfusionMatrixDisplay.from_estimator(
        model, X_test, y_test,
        cmap='Blues', values_format='d',
        display_labels=['No','Yes'])

    print("\n Accuracy Score : \n ",accuracy_score(y_test,y_pred))
    print("\n AUC Score : \n", roc_auc_score(y_test, y_pred_prob))
    print("\n Confusion Matrix : \n ",confusion_matrix(y_test, y_pred))
    print("\n Classification Report : \n",classification_report(y_test, y_pred))

    print("\n ROC curve : \n")
    # Note: this changes the seaborn style for every plot drawn afterwards.
    sns.set_style("white")
    RocCurveDisplay.from_estimator(model, X_test, y_test)
    plt.show()

In [None]:
# Baseline: logistic regression fitted on the original training split and
# evaluated on the prepared test split.
model1 = model_to_use('LR')
model1.fit(X_train,y_train)
performance(model1,X_train,y_train,X_test,y_test)
#plot_confusion_matrix(model1, X_test, y_test)

In [None]:
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Randomly oversample the training data to balance the Churn classes.
# Seeded (same value as the train/test split) so the resampled set, and every
# model fitted on it, is reproducible.
ros = RandomOverSampler(random_state=123)
X_ros, y_ros = ros.fit_resample(X_train, y_train)

In [None]:
# Logistic regression on the oversampled training data. The model was
# previously re-fitted on X_train/y_train, which only repeated the baseline
# and left X_ros/y_ros unused.
model1 = model_to_use('LR')
model1.fit(X_ros, y_ros)
performance(model1, X_ros, y_ros, X_test, y_test)
#plot_confusion_matrix(model1, X_test, y_test)

In [None]:
# Decision tree on the oversampled training data (was fitted on
# X_train/y_train, leaving X_ros/y_ros unused).
model3 = model_to_use('DT')
model3.fit(X_ros, y_ros)
performance(model3, X_ros, y_ros, X_test, y_test)
#plot_confusion_matrix(model1, X_test, y_test)

In [None]:
# Random forest on the oversampled training data (was fitted on
# X_train/y_train, leaving X_ros/y_ros unused).
model4 = model_to_use('RF')
model4.fit(X_ros, y_ros)
performance(model4, X_ros, y_ros, X_test, y_test)
#plot_confusion_matrix(model1, X_test, y_test)

In [None]:
# k-nearest neighbours on the oversampled training data (was fitted on
# X_train/y_train, leaving X_ros/y_ros unused).
model5 = model_to_use('KNN')
model5.fit(X_ros, y_ros)
performance(model5, X_ros, y_ros, X_test, y_test)
#plot_confusion_matrix(model1, X_test, y_test)

In [None]:
test

In [None]:
new_data = df.drop(['Churn'],axis= 1)

In [None]:
new_data.to_csv('new_data.csv',index=False)

In [None]:
new_data1 = new_data.copy()

In [None]:
def prep(df):
    """Clean unlabelled customer data and transform it for prediction.

    Applies the same cleaning as the training pipeline (blank
    ``TotalCharges`` -> NaN -> float, drop duplicates, drop rows with missing
    values), sets ``customerID`` aside, and then applies the notebook-level
    fitted ``encoder`` and ``scaler`` via ``transform`` only.

    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``customerID``, ``TotalCharges`` and the predictor
        columns.

    Returns
    -------
    tuple
        ``(cust, features)``: the ``customerID`` column of the rows that
        survived cleaning, and the scaled feature frame in the same row order.
    """
    # Work on a copy so the caller's frame keeps its rows and columns.
    data = df.copy()
    data['TotalCharges'] = data['TotalCharges'].replace(' ', np.nan).astype(float)
    data = data.drop_duplicates(keep='first', ignore_index=True)
    data = data.dropna().reset_index(drop=True)

    # Keep the IDs before dropping them so predictions can be matched back to customers.
    cust = data['customerID']
    data = data.drop(columns=['customerID'])

    # Object-dtype columns are the ones the encoder transforms.
    cat1 = [name for name in data.columns if data[name].dtype == 'O']
    enc_data = pd.DataFrame(
        encoder.transform(data[cat1]).toarray(),
        columns=encoder.get_feature_names_out(),
        index=data.index,
    )
    data = data.drop(columns=cat1).join(enc_data)

    col1 = data.columns
    features = pd.DataFrame(scaler.transform(data), columns=col1)
    return cust, features
    

In [None]:
cust , c_data = prep(new_data1)

In [None]:
c_data.head()

In [None]:
new_data

In [None]:
pred = model4.predict(c_data)
pred

In [None]:
results = pd.DataFrame({'Cust_ID':cust,'Churn_pred':pred})

In [None]:
results

In [None]:
targ_cust = results[results['Churn_pred'] == 'Yes'].reset_index(drop=True)['Cust_ID']

In [None]:
targ_cust.to_frame()

In [None]:
# Persist the fitted artefacts that churn.py loads.
from pickle import dump

# Save the model. model4 is the random forest (the model used for the
# predictions above); model3, saved here previously, is the decision tree and
# did not match the 'RF' file name.
with open('RF_class_model.pkl', 'wb') as model_file:
    dump(model4, model_file)

# Save the encoder.
with open('enc_class.pkl', 'wb') as encoder_file:
    dump(encoder, encoder_file)

# Save the scaler.
with open('scal_class.pkl', 'wb') as scaler_file:
    dump(scaler, scaler_file)

In [None]:
%%writefile churn.py

import streamlit as st
import pandas as pd
import numpy as np
import pickle

# Load the artefacts written by the training notebook. The files are opened
# in `with` blocks so the handles are closed once unpickling is done.
# NOTE: pickle.load executes code embedded in the file; only load files
# produced by the training notebook.
with open('RF_class_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
with open('scal_class.pkl', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)
with open('enc_class.pkl', 'rb') as encoder_file:
    encoder = pickle.load(encoder_file)

# Ask for a CSV file; until one has been uploaded, halt the script here.
uploaded = st.file_uploader('upload a csv',type='csv')

if uploaded is None:
    st.stop()

# Read the uploaded CSV file into a DataFrame.
df = pd.read_csv(uploaded)

def prep(df):
    """Clean uploaded customer data and transform it for prediction.

    Converts blank ``TotalCharges`` values to NaN and casts the column to
    float, drops duplicate rows and rows with missing values, sets
    ``customerID`` aside, and applies the module-level ``encoder`` and
    ``scaler`` loaded from the pickle files (``transform`` only).

    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``customerID``, ``TotalCharges`` and the predictor
        columns.

    Returns
    -------
    tuple
        ``(cust, features)``: the ``customerID`` column of the rows that
        survived cleaning, and the scaled feature frame in the same row order.
    """
    # Work on a copy so the uploaded frame itself is left untouched.
    data = df.copy()
    data['TotalCharges'] = data['TotalCharges'].replace(' ', np.nan).astype(float)
    data = data.drop_duplicates(keep='first', ignore_index=True)
    data = data.dropna().reset_index(drop=True)

    # Keep the IDs before dropping them so predictions can be matched back to customers.
    cust = data['customerID']
    data = data.drop(columns=['customerID'])

    # Object-dtype columns are the ones the encoder transforms.
    cat1 = [name for name in data.columns if data[name].dtype == 'O']
    enc_data = pd.DataFrame(
        encoder.transform(data[cat1]).toarray(),
        # get_feature_names() is deprecated; use the replacement API.
        columns=encoder.get_feature_names_out(),
        index=data.index,
    )
    data = data.drop(columns=cat1).join(enc_data)

    col1 = data.columns
    features = pd.DataFrame(scaler.transform(data), columns=col1)
    return cust, features

# Clean the uploaded data, predict churn for every remaining customer and
# pair each prediction with its customer ID.
cust , c_data = prep(df)
pred = model.predict(c_data)
results = pd.DataFrame({'Cust_ID':cust,'Churn_pred':pred})
# IDs of the customers whose predicted label is 'Yes'.
targ_cust = results[results['Churn_pred'] == 'Yes'].reset_index(drop=True)['Cust_ID']

# Two columns: all predictions in the first, churn-only customer list in the second.
c1,c2 = st.columns(2)

with c1:
    # Show the full prediction table and offer it as a CSV download.
    if st.button('Prediction'):
        st.dataframe(results)
        csv1 = results.to_csv(index=False)
        st.download_button('Download Predictions', csv1,file_name='predictions.csv')
        
with c2:
    # Show only the customers predicted to churn and offer that list as a CSV download.
    if st.button('Churn Customers'):
        st.dataframe(targ_cust)
        csv2 = targ_cust.to_csv(index=False)
        st.download_button('Download Target customer list',csv2, file_name = 'churn_cust.csv')
    