In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import re
import seaborn as sns
from sklearn.linear_model import Lasso,Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
#https://www.kaggle.com/code/bandiatindra/telecom-churn-prediction/notebook
#https://www.kaggle.com/code/khaledeladawy/prediction-analysis-telco-customer-churn

In [3]:
# Load the raw dataset. The path is relative, so TCC.csv must sit in the
# notebook's working directory.
df = pd.read_csv("TCC.csv")

In [4]:
df.shape

(7043, 21)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [8]:
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [9]:
# Discard the customer identifier and the features left out of this model.
# `df` is rebound from itself, so without errors="ignore" a second run of this
# cell raises KeyError (the columns are already gone). Ignoring missing labels
# keeps the cell idempotent.
df = df.drop(
    columns=["customerID", "SeniorCitizen", "PhoneService", "MonthlyCharges", "TotalCharges"],
    errors="ignore",
)

In [9]:
df.head()

Unnamed: 0,gender,Partner,Dependents,tenure,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,Churn
0,Female,Yes,No,1,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,No
1,Male,No,No,34,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,No
2,Male,No,No,2,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,Yes
3,Male,No,No,45,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),No
4,Female,No,No,2,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,Yes


In [10]:
df.shape

(7043, 16)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   gender            7043 non-null   object
 1   Partner           7043 non-null   object
 2   Dependents        7043 non-null   object
 3   tenure            7043 non-null   int64 
 4   MultipleLines     7043 non-null   object
 5   InternetService   7043 non-null   object
 6   OnlineSecurity    7043 non-null   object
 7   OnlineBackup      7043 non-null   object
 8   DeviceProtection  7043 non-null   object
 9   TechSupport       7043 non-null   object
 10  StreamingTV       7043 non-null   object
 11  StreamingMovies   7043 non-null   object
 12  Contract          7043 non-null   object
 13  PaperlessBilling  7043 non-null   object
 14  PaymentMethod     7043 non-null   object
 15  Churn             7043 non-null   object
dtypes: int64(1), object(15)
memory usage: 880.5+ KB


In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import pickle
import json
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_transformer

In [528]:
# trans = make_column_transformer([
#     (OneHotEncoder(sparse=False,drop="first"),[[0,1,2,4,5,6,7,8,9,10,11,12,13,14]]),
#     (MinMaxScaler(),[3])
# ])

In [13]:
# Categorical feature columns: every object-dtype column except the target.
# The target is excluded by name instead of slicing off the last match, so the
# result no longer depends on "Churn" being the final object column.
obj_cols = [col for col in df.columns if df[col].dtype == "O" and col != "Churn"]

In [14]:
obj_cols

['gender',
 'Partner',
 'Dependents',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod']

In [15]:
# Numeric feature columns: every column whose dtype is not object, in frame order.
num_cols = df.dtypes[df.dtypes != "O"].index.tolist()

In [16]:
num_cols

['tenure']

In [15]:
# Preprocessing: one-hot encode the categorical features and standardise the
# numeric one. The column lists come from `obj_cols` / `num_cols` (computed in
# the cells above) rather than being retyped by hand, so the two cannot drift
# apart. Step names are unchanged so existing parameter paths stay valid.
# Per the scikit-learn docs, handle_unknown="ignore" combined with drop="first"
# encodes a category unseen during fit as all zeros, i.e. the same as the
# dropped baseline category.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`;
# confirm against the installed version before upgrading.
trans = ColumnTransformer(
    transformers=[
        ("ord", OneHotEncoder(sparse=False, handle_unknown="ignore", drop="first"), obj_cols),
        ("scaler", StandardScaler(), num_cols),
    ],
    remainder="passthrough",
)

In [16]:
# Classifier built with no arguments, so every hyperparameter is left at the
# library default.
lr = LogisticRegression()

In [17]:
# Chain the column transformer and the classifier so both are fitted and
# applied together as one estimator.
pipe = Pipeline(steps=[("clm_tra", trans), ("model", lr)])

In [18]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle so the
# split is reproducible. "Churn" is the target, everything else is a feature.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns="Churn"),
    df["Churn"],
    test_size=0.2,
    random_state=2,
)

In [56]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5634 entries, 4169 to 2575
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   gender            5634 non-null   object
 1   Partner           5634 non-null   object
 2   Dependents        5634 non-null   object
 3   tenure            5634 non-null   int64 
 4   MultipleLines     5634 non-null   object
 5   InternetService   5634 non-null   object
 6   OnlineSecurity    5634 non-null   object
 7   OnlineBackup      5634 non-null   object
 8   DeviceProtection  5634 non-null   object
 9   TechSupport       5634 non-null   object
 10  StreamingTV       5634 non-null   object
 11  StreamingMovies   5634 non-null   object
 12  Contract          5634 non-null   object
 13  PaperlessBilling  5634 non-null   object
 14  PaymentMethod     5634 non-null   object
dtypes: int64(1), object(14)
memory usage: 704.2+ KB


In [20]:
# Fit the preprocessing steps and the classifier on the training split only.
pipe.fit(X_train,y_train)



In [21]:
# Training-set accuracy: scored on the same rows the pipeline was fitted on,
# so it is an optimistic estimate of real-world performance.
y_pred = pipe.predict(X_train)
acc = accuracy_score(y_train, y_pred)

# Held-out accuracy on the test split, which the pipeline never saw during fit.
# This is the number to use when judging generalisation.
y_test_pred = pipe.predict(X_test)
test_acc = accuracy_score(y_test, y_test_pred)
test_acc

In [61]:
acc

0.8035143769968051

In [22]:
# Serialise the fitted pipeline to disk. The context manager guarantees the file
# is flushed and closed before anything tries to read it back.
# pickle.dump returns None, so its result is deliberately not bound to a name
# (the previous `model = ...` only ever held None).
with open("model.pkl", "wb") as model_file:
    pickle.dump(pipe, model_file)

In [23]:
# Read the serialised pipeline back from disk, closing the file handle afterwards.
# pickle.load can execute arbitrary code, so only load files you created or
# otherwise trust.
with open("model.pkl", "rb") as model_file:
    mod = pickle.load(model_file)

In [24]:
# A single hand-written customer record used to smoke-test the reloaded model.
# Values are listed in the same order as X_train.columns.
test_data = pd.DataFrame(
    data=[[
        'Male',                       # gender
        'Yes',                        # Partner
        'Yes',                        # Dependents
        34,                           # tenure
        'No phone service',           # MultipleLines
        'DSL',                        # InternetService
        'Yes',                        # OnlineSecurity
        'No',                         # OnlineBackup
        'Yes',                        # DeviceProtection
        'Yes',                        # TechSupport
        'Yes',                        # StreamingTV
        'Yes',                        # StreamingMovies
        'Two year',                   # Contract
        'No',                         # PaperlessBilling
        'Bank transfer (automatic)',  # PaymentMethod
    ]],
    columns=X_train.columns,
)

In [78]:
# arr = np.array([['Male', 'Yes', 'Yes', 72, 'No phone service', 'DSL', 'Yes', 'No',
#        'Yes', 'Yes', 'Yes', 'Yes', 'Two year', 'No',
#        'Bank transfer (automatic)']])

In [25]:
mod.predict(test_data)

array(['No'], dtype=object)

In [27]:
# Feature names in training order, wrapped under a single "Columns" key.
columns = {"Columns": X_train.columns.tolist()}

In [28]:
columns

{'Columns': ['gender',
  'Partner',
  'Dependents',
  'tenure',
  'MultipleLines',
  'InternetService',
  'OnlineSecurity',
  'OnlineBackup',
  'DeviceProtection',
  'TechSupport',
  'StreamingTV',
  'StreamingMovies',
  'Contract',
  'PaperlessBilling',
  'PaymentMethod']}

In [89]:
# Write the feature-name mapping to columns.json. The context manager closes the
# file so the JSON is fully flushed to disk instead of relying on garbage
# collection of an anonymous file handle.
with open("columns.json", "w") as columns_file:
    json.dump(columns, columns_file)