In [70]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [71]:
# leggo il dataset
dati = pd.read_csv('churn.csv')

In [72]:
# prima vista generale sui dati
dati.head()

Unnamed: 0,Churn,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,...,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,No,4223-BKEOR,Female,0,No,Yes,21,Yes,No,DSL,...,No,Yes,No,No,Yes,One year,No,Mailed check,64.85,1336.8
1,No,6035-RIIOM,Female,0,No,No,54,Yes,Yes,Fiber optic,...,Yes,No,No,Yes,Yes,Two year,Yes,Bank transfer (automatic),97.2,5129.45
2,Yes,3797-VTIDR,Male,0,Yes,No,1,No,No phone service,DSL,...,No,No,No,No,No,Month-to-month,Yes,Electronic check,23.45,23.45
3,Yes,2568-BRGYX,Male,0,No,No,4,Yes,No,Fiber optic,...,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.2,237.95
4,No,2775-SEFEE,Male,0,No,Yes,0,Yes,Yes,DSL,...,Yes,No,Yes,No,No,Two year,Yes,Bank transfer (automatic),61.9,


In [73]:
# controllo alcune info sulla struttura dei dati
dati.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5634 entries, 0 to 5633
Data columns (total 21 columns):
Churn               5634 non-null object
customerID          5634 non-null object
gender              5634 non-null object
SeniorCitizen       5634 non-null int64
Partner             5634 non-null object
Dependents          5634 non-null object
tenure              5634 non-null int64
PhoneService        5634 non-null object
MultipleLines       5634 non-null object
InternetService     5634 non-null object
OnlineSecurity      5634 non-null object
OnlineBackup        5634 non-null object
DeviceProtection    5634 non-null object
TechSupport         5634 non-null object
StreamingTV         5634 non-null object
StreamingMovies     5634 non-null object
Contract            5634 non-null object
PaperlessBilling    5634 non-null object
PaymentMethod       5634 non-null object
MonthlyCharges      5634 non-null float64
TotalCharges        5634 non-null object
dtypes: float64(1), int64(2), obj

In [74]:
# si può notare che il dataset contiene 5634 righe e 21 colonne,
# non ci sono valori mancanti e
# ci sono 18 colonne di tipo object saranno quindi necessarie opportune codifiche

In [75]:
# Dal dataset si può inoltre notare la feature TotalCharges ha valori di tipo float, 
# converto quindi il suo tipo in numerico
dati['TotalCharges'] = dati['TotalCharges'].convert_objects(convert_numeric=True)
dati['TotalCharges'].dtype

For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  This is separate from the ipykernel package so we can avoid doing imports until


dtype('float64')

In [76]:
# Osservo le statistiche delle features numeriche
dati.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
count,5634.0,5634.0,5634.0,5624.0
mean,0.160809,32.373092,64.864253,2291.154605
std,0.367388,24.424539,30.089324,2263.151534
min,0.0,0.0,18.25,18.8
25%,0.0,9.0,35.75,411.5125
50%,0.0,29.0,70.525,1410.8
75%,0.0,55.0,89.9375,3808.85
max,1.0,72.0,118.6,8684.8


In [77]:
# osservo le statistiche delle features di tipo object
dati.describe(include=['object'])

Unnamed: 0,Churn,customerID,gender,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod
count,5634,5634,5634,5634,5634,5634,5634,5634,5634,5634,5634,5634,5634,5634,5634,5634,5634
unique,2,5634,2,2,2,2,3,3,3,3,3,3,3,3,3,2,4
top,No,5614-DNZCE,Male,No,No,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check
freq,4138,1,2887,2904,3951,5082,2685,2481,2810,2448,2489,2791,2239,2223,3083,3325,1887


In [78]:
print ("Number of unique values in each column\n")
for col_name in dati.columns:
 print(col_name,": " ,dati[col_name].nunique())

Number of unique values in each column

Churn :  2
customerID :  5634
gender :  2
SeniorCitizen :  2
Partner :  2
Dependents :  2
tenure :  73
PhoneService :  2
MultipleLines :  3
InternetService :  3
OnlineSecurity :  3
OnlineBackup :  3
DeviceProtection :  3
TechSupport :  3
StreamingTV :  3
StreamingMovies :  3
Contract :  3
PaperlessBilling :  2
PaymentMethod :  4
MonthlyCharges :  1495
TotalCharges :  5299


In [79]:
# rimuovo i dati inutili
df = dati.copy()
df = df.drop('customerID', axis=1)

In [80]:
# converto le variabili categoriche di queste tre features in variabili fittizie
df = pd.get_dummies(data = df,columns = ['InternetService', 'Contract', 'PaymentMethod'], drop_first=True)

In [81]:
# codifico le variabili categoriche
df = df.apply(LabelEncoder().fit_transform)
df.head()

Unnamed: 0,Churn,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,...,PaperlessBilling,MonthlyCharges,TotalCharges,InternetService_Fiber optic,InternetService_No,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,0,0,0,1,21,1,0,2,0,...,0,586,2429,0,0,1,0,0,0,1
1,0,0,0,0,0,54,1,2,0,2,...,1,1170,4445,1,0,0,1,0,0,0
2,1,1,0,1,0,1,0,1,0,0,...,1,59,41,0,0,0,0,0,1,0
3,1,1,0,0,0,4,1,0,0,0,...,1,674,803,1,0,0,0,0,1,0
4,0,1,0,0,1,0,1,2,2,2,...,1,546,5304,0,0,0,1,0,0,0


In [82]:
# test e train

In [83]:
y = df['Churn']
# analizzo tutte le features tranne churn
x = df.drop('Churn', axis=1)

In [84]:
# uso metà file come dimensione del test 
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=0)

In [85]:
# utilizzo la logistic regression 
lr = LogisticRegression()
lr.fit(x_train, y_train)
y_pred = lr.predict(x_test)
accuracy = lr.score(x_test, y_test)*100
print("{:.2f}".format(accuracy))

81.33


