In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
# ALGORITHMS:
from xgboost import XGBRegressor
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomTreesEmbedding
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVR
from sklearn.svm import SVC
from sklearn.svm import LinearSVR
from sklearn.svm import LinearSVC
from sklearn.svm import NuSVR
from sklearn.svm import NuSVC
from sklearn.neural_network import MLPRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB 
from sklearn.linear_model import LinearRegression 
from sklearn.linear_model import LogisticRegression


In [2]:
# Load the Telco customer-churn table and count missing values per column.
DATA_PATH = "Telco-Customer-Churn.csv"
dataX = pd.read_csv(DATA_PATH)
dataX.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [3]:
# Cells holding a single space are really missing values; turn them into NaN
# so that isna() can see them, then recount the missing values per column.
dataX = dataX.replace(" ", np.nan)
dataX.isna().sum()

customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [4]:
# Encode the target as integers: 'No' -> 0, 'Yes' -> 1.
# The explicit cast keeps the column int64 even on pandas versions where
# replace() no longer downcasts an object column after substitution, and it
# fails loudly if any value other than 'No'/'Yes' (or 0/1 on a re-run) is present.
dataX["Churn"] = dataX["Churn"].replace({'No': 0, 'Yes': 1}).astype("int64")

In [5]:
# TotalCharges was read as text; convert the column to float.
dataX["TotalCharges"] = dataX["TotalCharges"].astype(float)

In [6]:
# Fill the missing TotalCharges entries with the column mean.
# NOTE(review): the mean is computed over the whole table, i.e. before the
# train/validation split performed later in the notebook.
total_charges_mean = dataX["TotalCharges"].mean()
dataX["TotalCharges"] = dataX["TotalCharges"].fillna(total_charges_mean)

In [7]:
# Check dtypes and non-null counts after cleaning
# (TotalCharges should now be float64 and Churn int64, with no nulls left).
dataX.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
customerID          7043 non-null object
gender              7043 non-null object
SeniorCitizen       7043 non-null int64
Partner             7043 non-null object
Dependents          7043 non-null object
tenure              7043 non-null int64
PhoneService        7043 non-null object
MultipleLines       7043 non-null object
InternetService     7043 non-null object
OnlineSecurity      7043 non-null object
OnlineBackup        7043 non-null object
DeviceProtection    7043 non-null object
TechSupport         7043 non-null object
StreamingTV         7043 non-null object
StreamingMovies     7043 non-null object
Contract            7043 non-null object
PaperlessBilling    7043 non-null object
PaymentMethod       7043 non-null object
MonthlyCharges      7043 non-null float64
TotalCharges        7043 non-null float64
Churn               7043 non-null int64
dtypes: float64(2), int64(3), obj

In [8]:
# Separate target from predictors
y = dataX.Churn
X = dataX.drop(['Churn'], axis=1)

# Divide data into training and validation subsets (80 % / 20 %, fixed seed)
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0)

# Drop columns with missing values (simplest approach).
# The frames are rebound rather than modified with inplace=True: an in-place
# drop on the frames returned by train_test_split triggers pandas'
# SettingWithCopyWarning.
cols_with_missing = [col for col in X_train_full.columns if X_train_full[col].isnull().any()]
X_train_full = X_train_full.drop(cols_with_missing, axis=1)
X_valid_full = X_valid_full.drop(cols_with_missing, axis=1)

# "Cardinality" means the number of unique values in a column.
# Select categorical columns with relatively low cardinality (convenient but arbitrary);
# object columns with 10 or more distinct values are left out of the feature set.
low_cardinality_cols = [cname for cname in X_train_full.columns
                        if X_train_full[cname].nunique() < 10
                        and X_train_full[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns
                  if X_train_full[cname].dtype in ['int64', 'float64']]

# Keep selected columns only; .copy() gives independent frames that later
# cells can modify without touching X_train_full / X_valid_full.
my_cols = low_cardinality_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [9]:
# Collect the names of the categorical (object-dtype) feature columns.
object_cols = [col for col in X_train.columns if X_train[col].dtype == 'object']

print("Categorical variables:", object_cols, sep="\n")

Categorical variables:
['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']


In [10]:
# Helper used to compare the different encoding approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    """Fit an XGBClassifier on the training split and score it on the validation split.

    Parameters
    ----------
    X_train, y_train : training features and target passed to ``fit``.
    X_valid, y_valid : validation features and target used for scoring.

    Returns
    -------
    The mean absolute error between ``y_valid`` and the classifier's
    predictions for ``X_valid``.
    """
    classifier = XGBClassifier(n_estimators=500)
    classifier.fit(X_train, y_train)
    return mean_absolute_error(y_valid, classifier.predict(X_valid))

### ONE-HOT ENCODER

In [11]:
# Apply one-hot encoder to each column with categorical data.
# The keyword for dense output was renamed from `sparse` to `sparse_output`
# in newer scikit-learn releases; try the new spelling first and fall back.
try:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

OH_train_values = OH_encoder.fit_transform(X_train[object_cols])
OH_valid_values = OH_encoder.transform(X_valid[object_cols])

# Give every encoded column a readable string name ("<column>_<category>").
# Without this the encoded columns are named 0, 1, 2, ... and the final frame
# mixes integer and string column names, which recent scikit-learn rejects.
OH_feature_names = [
    "{}_{}".format(col, category)
    for col, categories in zip(object_cols, OH_encoder.categories_)
    for category in categories
]

# One-hot encoding returns plain arrays; restore the original row index
OH_cols_train = pd.DataFrame(OH_train_values, columns=OH_feature_names, index=X_train.index)
OH_cols_valid = pd.DataFrame(OH_valid_values, columns=OH_feature_names, index=X_valid.index)

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

print("Mean Absolute Error from One-Hot Encoding:")
print(score_dataset(OH_X_train, OH_X_valid, y_train, y_valid))

Mean Absolute Error from One-Hot Encoding:
0.21646557842441447


### LABEL ENCODER

In [12]:
# Work on copies so the un-encoded frames stay available to other cells
label_X_train, label_X_valid = X_train.copy(), X_valid.copy()

# Integer-encode every categorical column: the encoder is re-fitted on the
# training values of each column and then applied to the validation values.
label_encoder = LabelEncoder()
for col in object_cols:
    label_encoder.fit(X_train[col])
    label_X_train[col] = label_encoder.transform(X_train[col])
    label_X_valid[col] = label_encoder.transform(X_valid[col])

print("Mean Absolute Error from Label Encoding:")
label_encoding_mae = score_dataset(label_X_train, label_X_valid, y_train, y_valid)
print(label_encoding_mae)


Mean Absolute Error from Label Encoding:
0.21788502484031227


###  Model evaluation using LABEL ENCODER

In [13]:
# (report label, estimator class) pairs, in the order the results are printed.
# Every estimator is built with its default parameters, fit on the
# label-encoded training data and scored on the label-encoded validation data.
# NOTE(review): regressors and classifiers are scored with the same metric on
# the 0/1 target; regressors presumably return continuous values, so compare
# the two families with care.
candidate_models = [
    ("XGBRegressor", XGBRegressor),
    ("XGBClassifier", XGBClassifier),
    ("RandomForestRegressor", RandomForestRegressor),
    ("RandomForestClassifier", RandomForestClassifier),
    ("GradientBoostingRegressor", GradientBoostingRegressor),
    ("GradientBoostingClassifier", GradientBoostingClassifier),
    ("AdaBoostRegressor", AdaBoostRegressor),
    ("AdaBoostClassifier", AdaBoostClassifier),
    ("BaggingRegressor", BaggingRegressor),
    ("BaggingClassifier", BaggingClassifier),
    ("ExtraTreesRegressor", ExtraTreesRegressor),
    ("ExtraTreesClassifier", ExtraTreesClassifier),
    ("Support Vector Regression", SVR),
    ("Support Vector Classification", SVC),
    ("Linear Support Vector Regression", LinearSVR),
    ("Linear Support Vector Classification", LinearSVC),
    ("Nu-Support Vector Regression", NuSVR),
    ("Nu-Support Vector Classification", NuSVC),
    ("(Neural Net):Multi-layer Perceptron Regressor", MLPRegressor),
    ("(Neural Net):Multi-layer Perceptron Classifier", MLPClassifier),
    ("KNeighborsRegressor", KNeighborsRegressor),
    ("KNeighborsClassifier", KNeighborsClassifier),
    ("GaussianNB", GaussianNB),
    ("MultinomialNB", MultinomialNB),
    ("LinearRegression", LinearRegression),
    ("LogisticRegression", LogisticRegression),
]

# scikit-learn estimators created without random_state draw from NumPy's
# global RNG; seeding it makes this cell give the same numbers on every run.
np.random.seed(0)

label_models = {}        # report label -> fitted estimator
label_predictions = {}   # report label -> predictions for label_X_valid
label_scores = []        # (report label, mean absolute error), in report order

for position, (model_label, model_cls) in enumerate(candidate_models, start=1):
    model = model_cls()
    model.fit(label_X_train, y_train)
    predictions = model.predict(label_X_valid)

    label_models[model_label] = model
    label_predictions[model_label] = predictions
    label_scores.append((model_label, mean_absolute_error(y_valid, predictions)))

    # Keep the numbered names (my_model1 ... my_model26, predictions1 ...
    # predictions26) that this cell used to define, for any later cell that
    # still refers to them.
    globals()["my_model" + str(position)] = model
    globals()["predictions" + str(position)] = predictions


print("RESULTS OF MODEL EVALUATION BASED ON LABEL ENCODER!")
for model_label, score in label_scores:
    print("Mean Absolute Error for " + model_label + ": " + str(score))


  if getattr(data, 'base', None) is not None and \






RESULTS OF MODEL EVALUATION BASED ON LABEL ENCODER!
Mean Absolute Error for XGBRegressor: 0.2741667470075798
Mean Absolute Error for XGBClassifier: 0.2036905606813343
Mean Absolute Error for RandomForestRegressor: 0.2735344215755855
Mean Absolute Error for RandomForestClassifier: 0.23562810503903478
Mean Absolute Error for GradientBoostingRegressor: 0.27421928626229963
Mean Absolute Error for GradientBoostingClassifier: 0.20723917672107878
Mean Absolute Error for AdaBoostRegressor: 0.31213342723355486
Mean Absolute Error for AdaBoostClassifier: 0.21433640880056778
Mean Absolute Error for BaggingRegressor: 0.2749940856399337
Mean Absolute Error for BaggingClassifier: 0.23562810503903478
Mean Absolute Error for ExtraTreesRegressor: 0.2831795599716111
Mean Absolute Error for ExtraTreesClassifier: 0.2313697657913414
Mean Absolute Error for Support Vector Regression: 0.34992531989118286
Mean Absolute Error for Support Vector Classification: 0.23491838183108588
Mean Absolute Error for Linear

### Model evaluation using ONE-HOT ENCODER

In [14]:
# (report label, estimator class) pairs, in the order the results are printed.
# Every estimator is built with its default parameters, fit on the
# one-hot-encoded training data and scored on the one-hot-encoded validation data.
# NOTE(review): regressors and classifiers are scored with the same metric on
# the 0/1 target; regressors presumably return continuous values, so compare
# the two families with care.
candidate_models = [
    ("XGBRegressor", XGBRegressor),
    ("XGBClassifier", XGBClassifier),
    ("RandomForestRegressor", RandomForestRegressor),
    ("RandomForestClassifier", RandomForestClassifier),
    ("GradientBoostingRegressor", GradientBoostingRegressor),
    ("GradientBoostingClassifier", GradientBoostingClassifier),
    ("AdaBoostRegressor", AdaBoostRegressor),
    ("AdaBoostClassifier", AdaBoostClassifier),
    ("BaggingRegressor", BaggingRegressor),
    ("BaggingClassifier", BaggingClassifier),
    ("ExtraTreesRegressor", ExtraTreesRegressor),
    ("ExtraTreesClassifier", ExtraTreesClassifier),
    ("Support Vector Regression", SVR),
    ("Support Vector Classification", SVC),
    ("Linear Support Vector Regression", LinearSVR),
    ("Linear Support Vector Classification", LinearSVC),
    ("Nu-Support Vector Regression", NuSVR),
    ("Nu-Support Vector Classification", NuSVC),
    ("(Neural Net):Multi-layer Perceptron Regressor", MLPRegressor),
    ("(Neural Net):Multi-layer Perceptron Classifier", MLPClassifier),
    ("KNeighborsRegressor", KNeighborsRegressor),
    ("KNeighborsClassifier", KNeighborsClassifier),
    ("GaussianNB", GaussianNB),
    ("MultinomialNB", MultinomialNB),
    ("LinearRegression", LinearRegression),
    ("LogisticRegression", LogisticRegression),
]

# scikit-learn estimators created without random_state draw from NumPy's
# global RNG; seeding it makes this cell give the same numbers on every run.
np.random.seed(0)

OH_models = {}        # report label -> fitted estimator
OH_predictions = {}   # report label -> predictions for OH_X_valid
OH_scores = []        # (report label, mean absolute error), in report order

for position, (model_label, model_cls) in enumerate(candidate_models, start=1):
    model = model_cls()
    model.fit(OH_X_train, y_train)
    predictions = model.predict(OH_X_valid)

    OH_models[model_label] = model
    OH_predictions[model_label] = predictions
    OH_scores.append((model_label, mean_absolute_error(y_valid, predictions)))

    # Keep the numbered names (my_model1 ... my_model26, predictions1 ...
    # predictions26) that this cell used to define, for any later cell that
    # still refers to them.
    globals()["my_model" + str(position)] = model
    globals()["predictions" + str(position)] = predictions


print("RESULTS OF MODEL EVALUATION BASED ON ONE-HOT ENCODER!")
for model_label, score in OH_scores:
    print("Mean Absolute Error for " + model_label + ": " + str(score))




  if getattr(data, 'base', None) is not None and \


RESULTS OF MODEL EVALUATION BASED ON ONE-HOT ENCODER!
Mean Absolute Error for XGBRegressor: 0.2746184348129397
Mean Absolute Error for XGBClassifier: 0.21291696238466998
Mean Absolute Error for RandomForestRegressor: 0.266721923687857
Mean Absolute Error for RandomForestClassifier: 0.2207239176721079
Mean Absolute Error for GradientBoostingRegressor: 0.2740561621075471
Mean Absolute Error for GradientBoostingClassifier: 0.2150461320085167
Mean Absolute Error for AdaBoostRegressor: 0.3254201732855699
Mean Absolute Error for AdaBoostClassifier: 0.20865862313697658
Mean Absolute Error for BaggingRegressor: 0.27622917300348104
Mean Absolute Error for BaggingClassifier: 0.22569198012775019
Mean Absolute Error for ExtraTreesRegressor: 0.2727111426543648
Mean Absolute Error for ExtraTreesClassifier: 0.2512420156139106
Mean Absolute Error for Support Vector Regression: 0.3321192616471008
Mean Absolute Error for Support Vector Classification: 0.23562810503903478
Mean Absolute Error for Linear S