In [1]:
import numpy as np
import pandas as pd
import plotly.express as px

In [2]:
#to view all cols of the df
pd.set_option('display.max_columns', None)

In [3]:
#read the data
df = pd.read_csv('churn_data.csv')

In [4]:
df.head(20)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Unnamed: 13,tenure,PhoneService,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,No phone service,DSL,No,Yes,No,No,No,No,,1,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,No,DSL,Yes,No,Yes,No,No,No,,34,Yes,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,No,DSL,Yes,Yes,No,No,No,No,,2,Yes,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,,45,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,No,Fiber optic,No,No,No,No,No,No,,2,Yes,Month-to-month,Yes,Electronic check,70.7,151.65,Yes
5,9305-CDSKC,Female,0,No,No,Yes,Fiber optic,No,No,Yes,No,Yes,Yes,,8,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,Yes
6,1452-KIOVK,Male,0,No,Yes,Yes,Fiber optic,No,Yes,No,No,Yes,No,,22,Yes,Month-to-month,Yes,Credit card (automatic),89.1,1949.4,No
7,6713-OKOMC,Female,0,No,No,No phone service,DSL,Yes,No,No,No,No,No,,10,No,Month-to-month,No,Mailed check,29.75,301.9,No
8,7892-POOKP,Female,0,Yes,No,Yes,Fiber optic,No,No,Yes,Yes,Yes,Yes,,28,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes
9,6388-TABGU,Male,0,No,Yes,No,DSL,Yes,Yes,No,No,No,No,,62,Yes,One year,No,Bank transfer (automatic),56.15,3487.95,No


In [5]:
# i found that there is  unimportant column (unnamed 13 ) its a noise column because all its values (nan)
#the customer id is like his name so i cant remove this column
# so i will drop  (unnamed 13 )
# i will change all columns to lowercase

In [6]:
df['Unnamed: 13'].unique()

array([nan])

In [7]:

df.drop('Unnamed: 13',axis=1,inplace= True)

In [8]:
df.columns = df.columns.str.lower()

## first step is data under standing 

##### SeniorCitizen: Flags whether the customer is a senior citizen (1) or not (0).

##### Partner: Shows if the customer has a partner (Yes/No).

##### Dependents: Shows if the customer has dependents such as children or parents (Yes/No).

##### MultipleLines: Indicates whether the customer has multiple phone lines or not.

##### InternetService: Type of internet service the customer subscribes to (DSL/Fiber optic/No).

##### OnlineSecurity: Indicates if the customer has an additional online security service.

##### OnlineBackup: Indicates if the customer uses an online backup service.

##### DeviceProtection: Indicates if the customer has device/equipment protection service.

##### TechSupport: Indicates if the customer has subscribed to extra technical support.

##### StreamingTV: Indicates if the customer has a streaming TV service from the provider.

##### StreamingMovies: Indicates if the customer has a streaming movies service.

##### Unnamed: 13: Empty/unused column likely created accidentally .

##### tenure: Number of months the customer has stayed with the company.

##### PhoneService: Indicates whether the customer has a phone service.

##### Contract: Type of contract the customer has (month-to-month, one year, two year).

##### PaperlessBilling: Indicates whether the customer receives electronic (paperless) bills.

##### PaymentMethod: Method used by the customer to pay their bill (e.g. electronic check, bank transfer).

##### MonthlyCharges: Current monthly amount charged to the customer.

##### TotalCharges: Total amount billed to the customer over their entire tenure.

 ##### Churn: Indicates whether the customer has left (churned) or is still active.

# data exploration 


In [9]:
# i found that the total charges must be float and i have to convert the col of seniorcitizen to be object so i can check the summary statistics
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7042 entries, 0 to 7041
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerid        7042 non-null   object 
 1   gender            7042 non-null   object 
 2   seniorcitizen     7042 non-null   int64  
 3   partner           7042 non-null   object 
 4   dependents        7042 non-null   object 
 5   multiplelines     7042 non-null   object 
 6   internetservice   7042 non-null   object 
 7   onlinesecurity    7042 non-null   object 
 8   onlinebackup      7042 non-null   object 
 9   deviceprotection  7042 non-null   object 
 10  techsupport       7042 non-null   object 
 11  streamingtv       7042 non-null   object 
 12  streamingmovies   7042 non-null   object 
 13  tenure            7042 non-null   int64  
 14  phoneservice      7042 non-null   object 
 15  contract          7042 non-null   object 
 16  paperlessbilling  7042 non-null   object 


In [10]:
# this function to fix the data type of monthly charges
def convert_float(x):
    if x == ' ':
        return 0
    else:
        return float(x)


In [11]:
df['totalcharges'] = df['totalcharges'].apply(convert_float)

In [12]:
#to check first if there is any nans
df['seniorcitizen'].unique()

array([0, 1], dtype=int64)

In [13]:
def convert_object(x):
    if x == 0:
        return 'old'
    elif x == 1:
        return 'youth'
    else:
        return 'unkmown'


In [14]:
df['seniorcitizen']=df['seniorcitizen'].apply(convert_object)

In [15]:
# summary statistics check for numirical
df.describe(include='number')

Unnamed: 0,tenure,monthlycharges,totalcharges
count,7042.0,7042.0,7042.0
mean,32.366373,64.755886,2279.086083
std,24.557955,30.088238,2266.302524
min,0.0,18.25,0.0
25%,9.0,35.5,398.55
50%,29.0,70.35,1394.075
75%,55.0,89.85,3783.6
max,72.0,118.75,8684.8


In [16]:
#summary statistics  for categorical
df.describe(include='object')

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,phoneservice,contract,paperlessbilling,paymentmethod,churn
count,7042,7042,7042,7042,7042,7042,7042,7042,7042,7042,7042,7042,7042,7042,7042,7042,7042,7042
unique,7042,2,2,2,2,3,3,3,3,3,3,3,3,2,3,2,4,2
top,7590-VHVEG,Male,old,No,No,No,Fiber optic,No,No,No,No,No,No,Yes,Month-to-month,Yes,Electronic check,No
freq,1,3554,5900,3640,4932,3389,3095,3498,3087,3095,3473,2810,2785,6360,3875,4170,2365,5173


In [17]:
#check the duplicates
df.duplicated().sum()

0

In [18]:
# there is no missing values
df.isna().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
tenure              0
phoneservice        0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

# data cleaning

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7042 entries, 0 to 7041
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerid        7042 non-null   object 
 1   gender            7042 non-null   object 
 2   seniorcitizen     7042 non-null   object 
 3   partner           7042 non-null   object 
 4   dependents        7042 non-null   object 
 5   multiplelines     7042 non-null   object 
 6   internetservice   7042 non-null   object 
 7   onlinesecurity    7042 non-null   object 
 8   onlinebackup      7042 non-null   object 
 9   deviceprotection  7042 non-null   object 
 10  techsupport       7042 non-null   object 
 11  streamingtv       7042 non-null   object 
 12  streamingmovies   7042 non-null   object 
 13  tenure            7042 non-null   int64  
 14  phoneservice      7042 non-null   object 
 15  contract          7042 non-null   object 
 16  paperlessbilling  7042 non-null   object 


In [20]:
#i dont need customer id so i will remove
df.drop('customerid',axis=1,inplace=True)

In [21]:
#in depth checking for categorical cols
cat_col = df.select_dtypes(include='object').columns

In [22]:
for col in cat_col:
    print(col)
    print(df[col].nunique())
    print(df[col].unique())

gender
2
['Female' 'Male']
seniorcitizen
2
['old' 'youth']
partner
2
['Yes' 'No']
dependents
2
['No' 'Yes']
multiplelines
3
['No phone service' 'No' 'Yes']
internetservice
3
['DSL' 'Fiber optic' 'No']
onlinesecurity
3
['No' 'Yes' 'No internet service']
onlinebackup
3
['Yes' 'No' 'No internet service']
deviceprotection
3
['No' 'Yes' 'No internet service']
techsupport
3
['No' 'Yes' 'No internet service']
streamingtv
3
['No' 'Yes' 'No internet service']
streamingmovies
3
['No' 'Yes' 'No internet service']
phoneservice
2
['No' 'Yes']
contract
3
['Month-to-month' 'One year' 'Two year']
paperlessbilling
2
['Yes' 'No']
paymentmethod
4
['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)']
churn
2
['No' 'Yes']


In [23]:
# after i checked deeply i found that the col of multiple lines i could deal with the people with no phone service the are same 
#like they dont have so i will convert them to no multiple lines
# &
#(multiple lines - online security - online backup -device protection - tech support - stream tv - streaming movies )
# theese col there is some no internet service values and that mean they dont have internet service so they are not subscriping the services
# so i WILL cahnge them to be no  

In [24]:
#this fucnc for the multiplelines col
def covert_unknown1(x):
    if x == 'No phone service':
        return 'unknown'
    else:
        return x

In [25]:
df['multiplelines']= df['multiplelines'].apply(covert_unknown1)

In [26]:
#this fucnc for the the other 6 col
def covert_unknown2(x):
    if x == 'No internet service':
        return 'No'
    else:
        return x

In [27]:
columnn = df[['onlinesecurity', 'onlinebackup','deviceprotection','techsupport','streamingtv','streamingmovies']].columns

In [28]:
# to apply to the other 6 colmuns
for c in columnn:
    df[c] = df[c].apply(covert_unknown2)

In [29]:
#now i will make a indepth checking for the num cols
df

Unnamed: 0,gender,seniorcitizen,partner,dependents,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,tenure,phoneservice,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,Female,old,Yes,No,unknown,DSL,No,Yes,No,No,No,No,1,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,old,No,No,No,DSL,Yes,No,Yes,No,No,No,34,Yes,One year,No,Mailed check,56.95,1889.50,No
2,Male,old,No,No,No,DSL,Yes,Yes,No,No,No,No,2,Yes,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,old,No,No,unknown,DSL,Yes,No,Yes,Yes,No,No,45,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,Female,old,No,No,No,Fiber optic,No,No,No,No,No,No,2,Yes,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7037,Female,old,No,No,No,No,No,No,No,No,No,No,72,Yes,Two year,Yes,Bank transfer (automatic),21.15,1419.40,No
7038,Male,old,Yes,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,24,Yes,One year,Yes,Mailed check,84.80,1990.50,No
7039,Female,old,Yes,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,72,Yes,One year,Yes,Credit card (automatic),103.20,7362.90,No
7040,Female,old,Yes,Yes,unknown,DSL,Yes,No,No,No,No,No,11,No,Month-to-month,Yes,Electronic check,29.60,346.45,No


In [30]:
num_columns = df.select_dtypes(include='number').columns

In [31]:
for num_c in num_columns:
    px.histogram(data_frame=df[num_c],x=num_c,title=num_c).show()

In [32]:
df[df['onlinesecurity'] == 'unknowm']['churn'].value_counts()

Series([], Name: count, dtype: int64)

# feature engneering


In [33]:
# firstly i wil extract a col from tenure a column that seperate the clients to categories of of terms so i can measure the customer loyality
df

Unnamed: 0,gender,seniorcitizen,partner,dependents,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,tenure,phoneservice,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,Female,old,Yes,No,unknown,DSL,No,Yes,No,No,No,No,1,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,old,No,No,No,DSL,Yes,No,Yes,No,No,No,34,Yes,One year,No,Mailed check,56.95,1889.50,No
2,Male,old,No,No,No,DSL,Yes,Yes,No,No,No,No,2,Yes,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,old,No,No,unknown,DSL,Yes,No,Yes,Yes,No,No,45,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,Female,old,No,No,No,Fiber optic,No,No,No,No,No,No,2,Yes,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7037,Female,old,No,No,No,No,No,No,No,No,No,No,72,Yes,Two year,Yes,Bank transfer (automatic),21.15,1419.40,No
7038,Male,old,Yes,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,24,Yes,One year,Yes,Mailed check,84.80,1990.50,No
7039,Female,old,Yes,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,72,Yes,One year,Yes,Credit card (automatic),103.20,7362.90,No
7040,Female,old,Yes,Yes,unknown,DSL,Yes,No,No,No,No,No,11,No,Month-to-month,Yes,Electronic check,29.60,346.45,No


In [34]:
def cat_tenure(x):
    if x < 12:
        return 'New'
    elif 12 <= x < 24:
        return 'Somewhat Loyal'
    elif 24 <= x <= 48:
        return 'Loyal'
    else:
        return 'Very Loyal'


In [35]:
df['cust-loyality'] = df['tenure'].apply(cat_tenure)

In [36]:
#secondly i will create a col to present the family from partner and dependents col

In [37]:
def family_size(x):
    if x['partner'] == 'No' and x['dependents'] == 'No':
        return 'Single'
    elif x['partner'] == 'Yes' and x['dependents'] == 'Yes':
        return 'Married With Dependents '
    elif x['partner'] == 'No' and x['dependents'] == 'Yes':
        return 'Singe With Dependents'
    elif x['partner'] == 'Yes' and x['dependents'] == 'No':
        return 'Married'


In [38]:
df['family_member'] = df.apply(family_size,axis=1)

In [39]:
df.head(20)

Unnamed: 0,gender,seniorcitizen,partner,dependents,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,tenure,phoneservice,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn,cust-loyality,family_member
0,Female,old,Yes,No,unknown,DSL,No,Yes,No,No,No,No,1,No,Month-to-month,Yes,Electronic check,29.85,29.85,No,New,Married
1,Male,old,No,No,No,DSL,Yes,No,Yes,No,No,No,34,Yes,One year,No,Mailed check,56.95,1889.5,No,Loyal,Single
2,Male,old,No,No,No,DSL,Yes,Yes,No,No,No,No,2,Yes,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,New,Single
3,Male,old,No,No,unknown,DSL,Yes,No,Yes,Yes,No,No,45,No,One year,No,Bank transfer (automatic),42.3,1840.75,No,Loyal,Single
4,Female,old,No,No,No,Fiber optic,No,No,No,No,No,No,2,Yes,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,New,Single
5,Female,old,No,No,Yes,Fiber optic,No,No,Yes,No,Yes,Yes,8,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,Yes,New,Single
6,Male,old,No,Yes,Yes,Fiber optic,No,Yes,No,No,Yes,No,22,Yes,Month-to-month,Yes,Credit card (automatic),89.1,1949.4,No,Somewhat Loyal,Singe With Dependents
7,Female,old,No,No,unknown,DSL,Yes,No,No,No,No,No,10,No,Month-to-month,No,Mailed check,29.75,301.9,No,New,Single
8,Female,old,Yes,No,Yes,Fiber optic,No,No,Yes,Yes,Yes,Yes,28,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes,Loyal,Married
9,Male,old,No,Yes,No,DSL,Yes,Yes,No,No,No,No,62,Yes,One year,No,Bank transfer (automatic),56.15,3487.95,No,Very Loyal,Singe With Dependents


In [40]:
# i will create a new col that will represent the the subscriptions summation 
# that will help me to kmow the customer subscriptions 

In [41]:
def subscription_count(x):
    if x['onlinesecurity'] == 'unknown':
        return 0
    
    cols = ['onlinesecurity', 'onlinebackup', 'deviceprotection',
            'techsupport', 'streamingtv', 'streamingmovies']
    
    count = 0
    for col in cols:
        if x[col] == 'Yes':
            count += 1
    
    if count == 0:
        return 0
    
    return count

df['subscription_count'] = df.apply(subscription_count, axis=1)



In [42]:
df['subscription_count'].isna().mean() * 100

0.0

# machine learning

In [43]:
df

Unnamed: 0,gender,seniorcitizen,partner,dependents,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,tenure,phoneservice,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn,cust-loyality,family_member,subscription_count
0,Female,old,Yes,No,unknown,DSL,No,Yes,No,No,No,No,1,No,Month-to-month,Yes,Electronic check,29.85,29.85,No,New,Married,1
1,Male,old,No,No,No,DSL,Yes,No,Yes,No,No,No,34,Yes,One year,No,Mailed check,56.95,1889.50,No,Loyal,Single,2
2,Male,old,No,No,No,DSL,Yes,Yes,No,No,No,No,2,Yes,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,New,Single,2
3,Male,old,No,No,unknown,DSL,Yes,No,Yes,Yes,No,No,45,No,One year,No,Bank transfer (automatic),42.30,1840.75,No,Loyal,Single,3
4,Female,old,No,No,No,Fiber optic,No,No,No,No,No,No,2,Yes,Month-to-month,Yes,Electronic check,70.70,151.65,Yes,New,Single,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7037,Female,old,No,No,No,No,No,No,No,No,No,No,72,Yes,Two year,Yes,Bank transfer (automatic),21.15,1419.40,No,Very Loyal,Single,0
7038,Male,old,Yes,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,24,Yes,One year,Yes,Mailed check,84.80,1990.50,No,Loyal,Married With Dependents,5
7039,Female,old,Yes,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,72,Yes,One year,Yes,Credit card (automatic),103.20,7362.90,No,Very Loyal,Married With Dependents,4
7040,Female,old,Yes,Yes,unknown,DSL,Yes,No,No,No,No,No,11,No,Month-to-month,Yes,Electronic check,29.60,346.45,No,New,Married With Dependents,1


In [44]:
# firstly i have to drop the unnecessary columns before i start ml 
# i found that the family member column gives me the same data of partner and dependents so i will drop both
df = df.drop(['partner','dependents'],axis=1)


In [45]:
#second i will split the data into train and test
x = df.drop('churn', axis=1)
y = df['churn']

In [46]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, random_state= 42)

In [47]:
# second o found that the data have no missing values so will not impute for numerical
df.isna().sum()

gender                0
seniorcitizen         0
multiplelines         0
internetservice       0
onlinesecurity        0
onlinebackup          0
deviceprotection      0
techsupport           0
streamingtv           0
streamingmovies       0
tenure                0
phoneservice          0
contract              0
paperlessbilling      0
paymentmethod         0
monthlycharges        0
totalcharges          0
churn                 0
cust-loyality         0
family_member         0
subscription_count    0
dtype: int64

# handeling numerical 

In [48]:
#  found that the data have no missing values so will not impute for numerical
# i will scale with robust scaler because there is a lot of outliers in the data 
# first i wil take the num cols in var
num_columns = x_train.select_dtypes(include=['number','float']).columns


In [49]:
num_columns

Index(['tenure', 'monthlycharges', 'totalcharges', 'subscription_count'], dtype='object')

In [50]:
# i will do feature scalling using Robust scaller 
from sklearn.preprocessing import RobustScaler

robust_scaler = RobustScaler()

x_train[num_columns] = robust_scaler.fit_transform(x_train[num_columns])
x_test[num_columns] = robust_scaler.transform(x_test[num_columns])

# handeling categorical 

In [51]:
# firstly i have to divide the categorical columns into nominal columns and ordinal columns
# the ordinal columns is (seniorcitizen	,contract,cust-loyality)
# the nominal columns is (gender,multiplelines,
# internetservice,onlinesecurity,onlinebackup,deviceprotection,
# techsupport,streamingtv,streamingmovies,phoneservice,paperlessbilling,
# paymentmethod,family_member)


In [52]:
# firstly i will handle ordinal coloumns with ordinal encoder

In [53]:
# ordinal encoder for seniorcitizen
cat_columns = x_train.select_dtypes(include=['object']).columns
from sklearn.preprocessing import OrdinalEncoder

ord_encoder = OrdinalEncoder(categories= [['youth','old']])

x_train[['seniorcitizen']] = ord_encoder.fit_transform(x_train[['seniorcitizen']])
x_test[['seniorcitizen']] = ord_encoder.transform(x_test[['seniorcitizen']])


In [54]:
# ordinal encoder for contract
from sklearn.preprocessing import OrdinalEncoder

ord_encoder = OrdinalEncoder(categories= [['Month-to-month','One year','Two year']])

x_train[['contract']] = ord_encoder.fit_transform(x_train[['contract']])
x_test[['contract']] = ord_encoder.transform(x_test[['contract']])



In [55]:
# ordinal encoder for cust-loyality
from sklearn.preprocessing import OrdinalEncoder

ord_encoder = OrdinalEncoder(categories= [['New','Somewhat Loyal','Loyal','Very Loyal']])

x_train[['cust-loyality']] = ord_encoder.fit_transform(x_train[['cust-loyality']])
x_test[['cust-loyality']] = ord_encoder.transform(x_test[['cust-loyality']])


In [None]:
# firstly we will look into the count of unique values of the nominal cols to Decide  i will use one hot encoder or binary 
nominal_cols = x_train.select_dtypes(include='object').columns
nominal_cols

Index(['gender', 'multiplelines', 'internetservice', 'onlinesecurity',
       'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv',
       'streamingmovies', 'phoneservice', 'paperlessbilling', 'paymentmethod',
       'family_member'],
      dtype='object')

In [None]:
# i will use one hot encoder for all nominal cols because they all have less than 7 unique values
for col in nominal_cols:
    
    print(col)
    print(x_train[col].nunique())
    print('-' * 50)

gender
2
--------------------------------------------------
multiplelines
3
--------------------------------------------------
internetservice
3
--------------------------------------------------
onlinesecurity
2
--------------------------------------------------
onlinebackup
2
--------------------------------------------------
deviceprotection
2
--------------------------------------------------
techsupport
2
--------------------------------------------------
streamingtv
2
--------------------------------------------------
streamingmovies
2
--------------------------------------------------
phoneservice
2
--------------------------------------------------
paperlessbilling
2
--------------------------------------------------
paymentmethod
4
--------------------------------------------------
family_member
4
--------------------------------------------------


In [None]:
# the cols that i will apply ohe on them
ohe_cols = x_train.select_dtypes(include='object').columns
ohe_cols

Index(['gender', 'multiplelines', 'internetservice', 'onlinesecurity',
       'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv',
       'streamingmovies', 'phoneservice', 'paperlessbilling', 'paymentmethod',
       'family_member'],
      dtype='object')

In [62]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(drop= 'first', sparse_output= False)

ohe_arr_train = ohe.fit_transform(x_train[ohe_cols])
ohe_arr_test = ohe.transform(x_test[ohe_cols])

In [63]:
ohe_df_train = pd.DataFrame(ohe_arr_train, columns = ohe.get_feature_names_out())
ohe_df_test = pd.DataFrame(ohe_arr_test, columns = ohe.get_feature_names_out())

In [64]:
x_train.reset_index(drop= True, inplace= True)
x_test.reset_index(drop= True, inplace= True)

y_train.reset_index(drop= True, inplace= True)
y_test.reset_index(drop= True, inplace= True)

In [65]:
x_train = pd.concat([x_train, ohe_df_train], axis= 1).drop(columns= ohe_cols, axis= 1)
x_test = pd.concat([x_test, ohe_df_test], axis= 1).drop(columns= ohe_cols, axis= 1)

In [None]:
# then we have to handle impalnce because my data is classification because the target col is cat
#  first we have to see the count values for the target col

In [None]:
# so i have impalance because there is a big diffrent in the percentage 
df['churn'].value_counts(normalize=True) * 100

churn
No     73.459245
Yes    26.540755
Name: proportion, dtype: float64

In [68]:
# i will use smote to handle the impalance
from imblearn.over_sampling import SMOTE

smote = SMOTE()

x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)
y_train_smote.value_counts()

churn
No     4137
Yes    4137
Name: count, dtype: int64