* ROC-AUC Scope
  > LogReg with scaler = 0.83222

  > NN with RElu and 2 Linear layer and scaler = 0.83159

In [98]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn 
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

In [82]:
df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test.csv')
df_sub = pd.read_csv('./data/submission.csv')
df_train['TotalSpent'] = pd.to_numeric(df_train['TotalSpent'], errors='coerce')
df_train['TotalSpent'] = df_train['TotalSpent'].fillna(df_train['TotalSpent'].mean())

#### Preparation df_train

In [83]:
df_train['TotalSpent'] = pd.to_numeric(df_train['TotalSpent'], errors='coerce')
df_train['TotalSpent'] = df_train['TotalSpent'].fillna(df_train['TotalSpent'].mean())

encoded_Sex = pd.get_dummies(df_train[['Sex']], drop_first=True)
encoded_HasPartner = pd.get_dummies(df_train[['HasPartner']], drop_first=True)
encoded_HasChild = pd.get_dummies(df_train[['HasChild']], drop_first=True)
encoded_HasPhoneService = pd.get_dummies(df_train[['HasPhoneService']], drop_first=True)

encoded_HasMultiplePhoneNumbers = pd.get_dummies(df_train[['HasMultiplePhoneNumbers']], drop_first=True)
encoded_HasMultiplePhoneNumbers = encoded_HasMultiplePhoneNumbers.drop('HasMultiplePhoneNumbers_No phone service', axis=1)

encoded_HasInternetService = pd.get_dummies(df_train[['HasInternetService']], drop_first=True)
encoded_HasInternetService = encoded_HasInternetService.drop('HasInternetService_No', axis=1)

encoded_HasOnlineBackup = pd.get_dummies(df_train[['HasOnlineBackup']], drop_first=True)
encoded_HasOnlineBackup = encoded_HasOnlineBackup.drop('HasOnlineBackup_No internet service', axis=1)

encoded_HasDeviceProtection = pd.get_dummies(df_train[['HasDeviceProtection']], drop_first=True)
encoded_HasDeviceProtection = encoded_HasDeviceProtection.drop('HasDeviceProtection_No internet service', axis=1)

encoded_HasTechSupportAccess = pd.get_dummies(df_train[['HasTechSupportAccess']], drop_first=True)
encoded_HasTechSupportAccess = encoded_HasTechSupportAccess.drop('HasTechSupportAccess_No internet service', axis=1)

encoded_HasOnlineTV = pd.get_dummies(df_train[['HasOnlineTV']], drop_first=True)
encoded_HasOnlineTV = encoded_HasOnlineTV.drop('HasOnlineTV_No internet service', axis=1)

encoded_HasMovieSubscription = pd.get_dummies(df_train[['HasMovieSubscription']], drop_first=True)
encoded_HasMovieSubscription = encoded_HasMovieSubscription.drop('HasMovieSubscription_No internet service', axis=1)

encoded_HasContractPhone = pd.get_dummies(df_train[['HasContractPhone']], drop_first=True)
encoded_IsBillingPaperless = pd.get_dummies(df_train[['IsBillingPaperless']], drop_first=True)
encoded_PaymentMethod = pd.get_dummies(df_train[['PaymentMethod']], drop_first=True)

scaler = StandardScaler()
df_train['ClientPeriod'] = scaler.fit_transform(df_train['ClientPeriod'].values.reshape(-1, 1))
df_train['MonthlySpending'] = scaler.fit_transform(df_train['MonthlySpending'].values.reshape(-1, 1))
df_train['TotalSpent'] = scaler.fit_transform(df_train['TotalSpent'].values.reshape(-1, 1))

X = df_train[['ClientPeriod',	'MonthlySpending',	'TotalSpent']]
y = df_train['Churn']
X = pd.concat([X, encoded_Sex, encoded_HasPartner, encoded_HasChild, encoded_HasPhoneService, encoded_HasMultiplePhoneNumbers, encoded_HasInternetService, encoded_HasOnlineBackup,
              encoded_HasDeviceProtection, encoded_HasTechSupportAccess, encoded_HasOnlineTV, encoded_HasMovieSubscription, encoded_HasContractPhone, encoded_IsBillingPaperless, encoded_PaymentMethod], axis=1)
X[['Sex_Male',
       'HasPartner_Yes', 'HasChild_Yes', 'HasPhoneService_Yes',
       'HasMultiplePhoneNumbers_Yes', 'HasInternetService_Fiber optic',
       'HasOnlineBackup_Yes', 'HasDeviceProtection_Yes',
       'HasTechSupportAccess_Yes', 'HasOnlineTV_Yes',
       'HasMovieSubscription_Yes', 'HasContractPhone_One year',
       'HasContractPhone_Two year', 'IsBillingPaperless_Yes',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check']] = X[['Sex_Male',
       'HasPartner_Yes', 'HasChild_Yes', 'HasPhoneService_Yes',
       'HasMultiplePhoneNumbers_Yes', 'HasInternetService_Fiber optic',
       'HasOnlineBackup_Yes', 'HasDeviceProtection_Yes',
       'HasTechSupportAccess_Yes', 'HasOnlineTV_Yes',
       'HasMovieSubscription_Yes', 'HasContractPhone_One year',
       'HasContractPhone_Two year', 'IsBillingPaperless_Yes',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check']].astype(int)

In [84]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

#### LogReg

In [86]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred_proba = clf.predict_proba(X_test)[:, 1]
print(f'ROC-AUC = {roc_auc_score(y_test, y_pred_proba)}')

ROC-AUC = 0.8322246858832224


#### Neural Network

In [89]:
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

In [172]:
dataset = TensorDataset(X_train_tensor, y_train_tensor)
batch_size = 50
train_loader = DataLoader(dataset, batch_size, shuffle=True)

In [241]:
class Clf_NN(nn.Module):
    def __init__(self):
        super(Clf_NN, self).__init__()
        self.fc1 = nn.Linear(in_features=20, out_features=14)
        self.fc2 = nn.Linear(in_features=14, out_features=6)
        self.fc3 = nn.Linear(in_features=6, out_features=2)
        
    def forward(self, x):
        x = self.fc1(x)
        x = F.tanh(x)
        x = self.fc2(x)
        x = F.tanh(x)
        x = self.fc3(x)
        return x

model = Clf_NN()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

In [245]:
model.train()
history_loss = []
num_epoch = 5
for epoch in range(num_epoch):
    for x, y in train_loader:
        optimizer.zero_grad()
        y_pred = model(x)
        loss = criterion(y_pred, y)
        loss.backward()
        optimizer.step()
    scheduler.step()
    _loss = loss.detach().numpy().tolist()
    history_loss.append(_loss)
    # print(f'Epoch: {epoch+1},  loss = {round(_loss, 5)}')
    print(f'Epoch: {epoch+1},  lr = {round(scheduler.get_last_lr()[0], 5)},  loss = {round(_loss, 5)}')

Epoch: 1,  lr = 0.00019,  loss = 0.48977
Epoch: 2,  lr = 0.00017,  loss = 0.41323
Epoch: 3,  lr = 0.00015,  loss = 0.35298
Epoch: 4,  lr = 0.00014,  loss = 0.32939
Epoch: 5,  lr = 0.00012,  loss = 0.46675


In [246]:
with torch.no_grad():
    model.eval()
    y_pred = model(X_test_tensor)
    probabilities = F.softmax(y_pred, dim=1)
#     _, predicted = torch.max(probabilities, 1)
# predicted_classes = predicted.numpy()
print(f'ROC-AUC = {roc_auc_score(y_test, probabilities[:, 1].numpy())}')

ROC-AUC = 0.8313390422305477
