In [1]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
from category_encoders.target_encoder import TargetEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import (GridSearchCV,KFold, train_test_split)
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline

In [2]:
# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Read 'df_pre_tratado.csv' (path relative to the working directory) into df.
df = pd.read_csv('df_pre_tratado.csv')

In [5]:
# Display the (rows, columns) shape of the loaded frame.
df.shape

(7043, 41)

In [6]:
# Per-column count of missing values in the loaded frame.
df.isna().sum()

Customer_ID                                 0
Gender                                      0
Age                                         0
Married                                     0
Number_Dependents                           0
City                                        0
Zip_Code                                    0
Latitude                                    0
Longtitude                                  0
Number_Referrals                            0
Tenure                                      0
Offer                                    3877
Phone_service                               0
Average_Monthly_Long_Distance_Charges     682
Multiple_Lines                            682
Internet_Service                            0
Internet_Type                            1526
Average_Monthly_GB_Download              1526
Online_Security                          1526
Online_Backup                            1526
Device_Protection_Plan                   1526
Premium_Tech_Support              

In [7]:
# Keep only the rows that have a value in Premium_Tech_Support.
df = df.dropna(subset=['Premium_Tech_Support'])

In [8]:
# Re-check the per-column missing-value counts after the row filter above.
df.isna().sum()

Customer_ID                                 0
Gender                                      0
Age                                         0
Married                                     0
Number_Dependents                           0
City                                        0
Zip_Code                                    0
Latitude                                    0
Longtitude                                  0
Number_Referrals                            0
Tenure                                      0
Offer                                    3024
Phone_service                               0
Average_Monthly_Long_Distance_Charges     682
Multiple_Lines                            682
Internet_Service                            0
Internet_Type                               0
Average_Monthly_GB_Download                 0
Online_Security                             0
Online_Backup                               0
Device_Protection_Plan                      0
Premium_Tech_Support              

In [10]:
# Keep only the rows that have a value in Average_Monthly_Long_Distance_Charges.
has_long_distance_charges = df['Average_Monthly_Long_Distance_Charges'].notna()
df = df.loc[has_long_distance_charges]

In [11]:
# Display the column labels of the filtered frame.
df.columns

Index(['Customer_ID', 'Gender', 'Age', 'Married', 'Number_Dependents', 'City',
       'Zip_Code', 'Latitude', 'Longtitude', 'Number_Referrals', 'Tenure',
       'Offer', 'Phone_service', 'Average_Monthly_Long_Distance_Charges',
       'Multiple_Lines', 'Internet_Service', 'Internet_Type',
       'Average_Monthly_GB_Download', 'Online_Security', 'Online_Backup',
       'Device_Protection_Plan', 'Premium_Tech_Support', 'Streaming_TV',
       'Streaming_Movies', 'Streaming_Music', 'Unlimited_Data', 'Contract',
       'Paperless_Billing', 'Payment_Method', 'Monthly_Charge',
       'Total_Charges', 'Total_Refunds', 'Total_Extra_Data_Charges',
       'Total_Long_Distance_Charges', 'Total_Revenue', 'Customer_Status',
       'Churn_Category', 'Churn_Reason', 'Tenure_bins', 'Monthly_Charge_bins',
       'Age_bins'],
      dtype='object')

In [12]:
# Feature columns handed to the train/test split (order matters: it becomes the
# column order of the feature frames).
columns_to_model = [
    # customer profile
    'Age',
    'Number_Dependents',
    'Number_Referrals',
    'Tenure',
    # phone
    'Phone_service',
    'Average_Monthly_Long_Distance_Charges',
    'Multiple_Lines',
    # internet and add-ons
    'Internet_Service',
    'Internet_Type',
    'Average_Monthly_GB_Download',
    'Online_Security',
    'Online_Backup',
    'Device_Protection_Plan',
    'Premium_Tech_Support',
    'Streaming_TV',
    'Streaming_Movies',
    'Streaming_Music',
    'Unlimited_Data',
    # contract and billing
    'Contract',
    'Paperless_Billing',
    'Payment_Method',
    'Monthly_Charge',
    # accumulated amounts
    'Total_Charges',
    'Total_Refunds',
    'Total_Extra_Data_Charges',
    'Total_Long_Distance_Charges',
    'Total_Revenue',
]

In [13]:
def split(data, target_variable):
    """Split ``data`` into stratified train and test parts.

    The features are the ``columns_to_model`` columns of ``data`` (a name read
    from the notebook's global scope); the target is ``data[target_variable]``,
    which is also the stratification key. No ``test_size`` is passed, so
    ``train_test_split`` uses its default, and the seed is fixed at 42.

    Args:
        data: DataFrame holding both the feature columns and the target column.
        target_variable: Label of the target column in ``data``.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``.
    """
    features = data[columns_to_model]
    target = data[target_variable]
    parts = train_test_split(features, target, stratify=target, random_state=42)
    # train_test_split returns a list; callers of the original got a tuple.
    return tuple(parts)

In [14]:
# Hold out 20% of the rows as the test set, using a fixed seed.
# NOTE(review): unlike the split() helper defined earlier in this notebook, this
# call does not pass stratify=..., so the Customer_Status class proportions may
# differ between the two parts. Confirm whether that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    df[columns_to_model],
    df['Customer_Status'],
    test_size=0.2,
    random_state=42,
)

In [15]:
# Renumber the rows of both feature frames 0..n-1 and discard the old index.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

In [16]:
from sklearn.preprocessing import MinMaxScaler
selected_columns = ['Age', 'Number_Dependents', 'Number_Referrals', 'Tenure',
        'Average_Monthly_Long_Distance_Charges',
       'Average_Monthly_GB_Download', 'Monthly_Charge',
       'Total_Charges', 'Total_Refunds', 'Total_Extra_Data_Charges',
       'Total_Long_Distance_Charges', 'Total_Revenue']
X_train_subset = X_train[selected_columns]
scaler = MinMaxScaler()
X_train[['Age_s', 'Number_Dependents_s', 'Number_Referrals_s', 'Tenure_s',
        'Average_Monthly_Long_Distance_Charges_s',
       'Average_Monthly_GB_Download_s', 'Monthly_Charge_s',
       'Total_Charges_s', 'Total_Refunds_s', 'Total_Extra_Data_Charges_s',
       'Total_Long_Distance_Charges_s', 'Total_Revenue']] = pd.DataFrame(scaler.fit_transform(X_train_subset), columns=selected_columns)
X_train = X_train.drop(['Age', 'Number_Dependents', 'Number_Referrals', 'Tenure',
        'Average_Monthly_Long_Distance_Charges',
       'Average_Monthly_GB_Download', 'Monthly_Charge',
       'Total_Charges', 'Total_Refunds', 'Total_Extra_Data_Charges',
       'Total_Long_Distance_Charges', 'Total_Revenue'], axis=1)

In [17]:
min_max_params = pd.DataFrame({
    'min': scaler.data_min_,
    'max': scaler.data_max_
}, index=selected_columns)
min_max_params.to_csv('min_max_params.csv', index=False)

In [18]:
from sklearn.preprocessing import MinMaxScaler
X_test_subset = X_test[selected_columns]
scaler = MinMaxScaler()
X_test[['Age_s', 'Number_Dependents_s', 'Number_Referrals_s', 'Tenure_s',
        'Average_Monthly_Long_Distance_Charges_s',
       'Average_Monthly_GB_Download_s', 'Monthly_Charge_s',
       'Total_Charges_s', 'Total_Refunds_s', 'Total_Extra_Data_Charges_s',
       'Total_Long_Distance_Charges_s', 'Total_Revenue']] = pd.DataFrame(scaler.fit_transform(X_test_subset), columns=selected_columns)
X_test = X_test.drop(['Age', 'Number_Dependents', 'Number_Referrals', 'Tenure',
        'Average_Monthly_Long_Distance_Charges',
       'Average_Monthly_GB_Download', 'Monthly_Charge',
       'Total_Charges', 'Total_Refunds', 'Total_Extra_Data_Charges',
       'Total_Long_Distance_Charges', 'Total_Revenue'], axis=1)

In [19]:
# Append the training labels as one more column. The labels are wrapped in a
# new, unnamed Series with a default 0..n-1 index, so they are matched to the
# rows of X_train by position; the unnamed Series becomes a column labelled 0.
y_train_column = pd.Series(list(y_train.values))
treino = pd.concat([X_train, y_train_column], axis=1)

In [20]:
# Give the label column (currently labelled 0) the name 'Churn'.
treino = treino.rename({0: 'Churn'}, axis='columns')

In [21]:
# Write the training dataset to disk without the row index.
treino.to_csv(path_or_buf='dataset_treinamento.csv', index=False)

In [22]:
# Append the test labels as one more column. The labels are wrapped in a new,
# unnamed Series with a default 0..n-1 index, so they are matched to the rows
# of X_test by position; the unnamed Series becomes a column labelled 0.
y_test_column = pd.Series(list(y_test.values))
teste = pd.concat([X_test, y_test_column], axis=1)

In [23]:
# Give the label column (currently labelled 0) the name 'Churn'.
teste = teste.rename({0: 'Churn'}, axis='columns')

In [24]:
# Write the test dataset to disk without the row index.
teste.to_csv(path_or_buf='dataset_teste.csv', index=False)