In [1]:
import os
import pandas as pd

In [2]:
# Cambiar directorio
os.chdir("..")

In [3]:
# Cargar los datos que tenemos disponibles
data = pd.read_csv("data/raw/churn.csv")

In [4]:
data.head(5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [5]:
data.tail(5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.0,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.0,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1
9999,10000,15628319,Walker,792,France,Female,28,4,130142.79,1,1,0,38190.78,0


In [6]:
# Nos deshacemos de las columnas que no contribuyen en mucho
data = data.drop(data.columns[0:3], axis=1)

In [7]:
# Convertimos los datos en formato categorico, para más info: shorturl.at/y0269
column_equivalence = {}
features = list(data.columns)
for i, column in enumerate(list([str(d) for d in data.dtypes])):
    if column == "object":
        data[data.columns[i]] = data[data.columns[i]].fillna(data[data.columns[i]].mode())
        categorical_column = data[data.columns[i]].astype("category")
        current_column_equivalence = dict(enumerate(categorical_column.cat.categories))
        column_equivalence[i] = dict((v,k) for k,v in current_column_equivalence.items())
        data[data.columns[i]] = categorical_column.cat.codes
    else:
        data[data.columns[i]] = data[data.columns[i]].fillna(data[data.columns[i]].median())

In [8]:
column_equivalence

{1: {'France': 0, 'Germany': 1, 'Spain': 2}, 2: {'Female': 0, 'Male': 1}}

In [9]:
data['Exited'].value_counts()

Exited
0    7963
1    2037
Name: count, dtype: int64

In [10]:
data['Geography'].value_counts()

Geography
0    5014
1    2509
2    2477
Name: count, dtype: int64

In [11]:
data['Gender'].value_counts()

Gender
1    5457
0    4543
Name: count, dtype: int64

In [12]:
# Save data prepared 
data.to_csv("data/processed/churn-prepared.csv", index=False)

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Generar los datos para poder separar la variable de respuesta de los datos que tenemos disponibles
X = data.copy()
y = X.pop(data.columns[-1])

In [15]:
from tpot import TPOTClassifier



In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    train_size=0.75, test_size=0.25)

In [17]:
pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5,
                                    random_state=42, verbosity=2)

In [18]:
pipeline_optimizer.fit(X_train, y_train)

                                                                             
Generation 1 - Current best internal CV score: 0.8686666666666667
                                                                             
Generation 2 - Current best internal CV score: 0.8686666666666667
                                                                             
Generation 3 - Current best internal CV score: 0.8688
                                                                              
Generation 4 - Current best internal CV score: 0.8688
                                                                              
Generation 5 - Current best internal CV score: 0.8688
                                                                              
Best pipeline: GradientBoostingClassifier(input_matrix, learning_rate=0.1, max_depth=6, max_features=0.5, min_samples_leaf=13, min_samples_split=17, n_estimators=100, subsample=0.5)


In [19]:
print(pipeline_optimizer.score(X_test, y_test))

0.8484


In [25]:
pipeline_optimizer.export('src/models/train_model.py')

In [26]:
# Medir los resultados obtenidos
from sklearn.metrics import confusion_matrix

In [27]:
confusion_matrix(y_test, pipeline_optimizer.predict(X_test))

array([[1896,   89],
       [ 290,  225]])

In [28]:
# Generar el binario del modelo para reutilizarlo, equivalencia de variables categoricas y caracteristicas del modelo
import pickle
pickle.dump(column_equivalence, open("models/column_equivalence.pk", "wb"))
pickle.dump(features, open("models/features.pk", "wb"))