# 1. Loading Dependencies

In [None]:
#Data Structures
import pandas as pd
import numpy as np
import re
import os

# To install the missingno library, run this command in a terminal:
#   pip install missingno

import missingno as msno

#Sklearn
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, precision_score, recall_score

#Plotting
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

#Others
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [73]:
#COMMENT THIS SECTION INCASE RUNNING THIS NOTEBOOK LOCALLY

#Checking the kaggle paths for the uploaded datasets
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [74]:
#Loading datasets

data = pd.read_csv("../input/ml-lab-i-c39/train.csv")
unseen = pd.read_csv("../input/ml-lab-i-c39/test.csv")
sample = pd.read_csv("../input/ml-lab-i-c39/sample.csv")
data_dict = pd.read_csv("../input/ml-lab-i-c39/data_dictionary.csv")

print(data.shape)
print(unseen.shape)
print(sample.shape)
print(data_dict.shape)

In [75]:
# Display the table read from data_dictionary.csv
# (presumably it describes the dataset's columns — confirm against the file)
data_dict

In [76]:
# Narrow the training frame to a working subset of columns, chosen by
# matching regular expressions against the column names.
column_names = list(data.columns)


def _matching(pattern):
    """Return the column names for which re.search(pattern, name) finds a match."""
    return [name for name in column_names if re.search(pattern, name)]


ids = ['id','circle_id']
total_amounts = _matching('total.+amt')
total_outgoing_minutes = _matching('total.+og.+mou')
offnetwork_minutes = _matching('offnet')
average_revenue_3g = _matching('arpu.+3g')
average_revenue_2g = _matching('arpu.+2g')
volume_3g = _matching('vol.+3g')
volume_2g = _matching('vol.+2g')
age_on_network = _matching('aon')

# One flat list: identifiers first, then each feature group, target last
variables = (
    ids
    + total_amounts
    + total_outgoing_minutes
    + offnetwork_minutes
    + average_revenue_3g
    + average_revenue_2g
    + volume_3g
    + volume_2g
    + age_on_network
    + ['churn_probability']
)

# Keep only the selected columns and index the rows by 'id'
data = data[variables].set_index('id')

In [77]:
# Preview the first rows of the reduced frame (now indexed by 'id')
data.head()

In [78]:
# Columns and datatypes
# verbose=1 is truthy, so the per-column listing is requested explicitly
data.info(verbose=1)

In [79]:
# Summary statistics; include="all" asks describe() to cover every column, not only numeric ones
data.describe(include="all")

# 2. Create test and train datasets

In [80]:
# Distinct values of circle_id, inspected before the column is left out of the features below
data['circle_id'].unique()


In [81]:
# Separate features from the target by column NAME rather than by position.
# - drop(..., 1) passed the axis positionally, which pandas 2.x no longer accepts;
#   columns=[...] works on every pandas version.
# - 'churn_probability' is the last column of `data` (it is appended last when the
#   column list is built), so this selects the same X and y as the positional
#   iloc[:, :-1] / iloc[:, -1] form did, without depending on column order.
X = data.drop(columns=['circle_id', 'churn_probability'])
y = data['churn_probability']

X.shape, y.shape

In [82]:
# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle so the split, and every figure and score derived
# from it, is reproducible across Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [83]:
# Preview the first rows of the training features
X_train.head()

# 3. Dealing with missing data

In [84]:
# missingno bar chart: per-column completeness of the training features
msno.bar(X_train)

In [85]:
# missingno matrix: shows where in the rows the missing values occur for each column
msno.matrix(X_train)

In [86]:
# Percentage of missing values in each training column
missing_counts = X_train.isnull().sum()
missing_data_percent = 100 * missing_counts / len(y_train)
missing_data_percent

In [87]:
# Keep the columns whose missing share is at most 40%
within_threshold = missing_data_percent <= 40
new_vars = missing_data_percent.loc[within_threshold].index
new_vars

In [88]:
# Restrict the training features to the retained columns.
# .copy() makes this an independent frame: a later cell assigns imputed values
# into it, and writing into a slice of X_train would raise SettingWithCopyWarning
# (and, depending on the pandas version, might or might not touch X_train).
X_train_filtered = X_train[new_vars].copy()
X_train_filtered.shape

In [89]:
# Columns of the filtered frame that still contain at least one missing value.
# The boolean flags get their own name instead of overwriting
# `missing_data_percent`: that variable holds percentages used by the
# 40%-threshold cell, and replacing it with booleans would make a re-run of
# that cell keep every column.
has_missing = X_train_filtered.isnull().any()
impute_cols = has_missing[has_missing].index
impute_cols

In [90]:
# NOTE(review): this cell recomputes the same impute_cols as the preceding cell
# and adds nothing new — candidate for deletion.
# isnull().any() is True for each column holding at least one missing value
missing_data_percent = X_train_filtered.isnull().any()
# .gt(0) on the boolean flags keeps only the True entries
impute_cols = missing_data_percent[missing_data_percent.gt(0)].index
impute_cols

In [91]:
# Fill every missing value in the affected columns with the constant 0
imp = SimpleImputer(strategy='constant', fill_value=0)
imputed_values = imp.fit_transform(X_train_filtered[impute_cols])
X_train_filtered[impute_cols] = imputed_values

In [92]:
# Re-plot completeness after imputation to check that no gaps remain
msno.bar(X_train_filtered)

In [93]:
# Summary statistics of the imputed training features
X_train_filtered.describe()

# 4. Exploratory Data Analysis & Preprocessing

In [94]:
# Box plot of every retained feature on one shared axis (values not yet capped or scaled)
plt.figure(figsize=(15,8))
plt.xticks(rotation=45)  # angle the x tick labels
sns.boxplot(data = X_train_filtered)

## Dealing with Outliers

In [95]:
def cap_outliers(array, k=3):
    """Cap values lying more than ``k`` standard deviations from the mean.

    Parameters
    ----------
    array : pandas.Series or numpy.ndarray
        Numeric values; anything exposing ``mean()``, ``std()`` and ``clip()``.
    k : int or float, default 3
        Half-width of the allowed band, in standard deviations.

    Returns
    -------
    Same kind of object as ``array``
        A NEW object with values limited to ``[mean - k*std, mean + k*std]``.
        The input is left untouched (the previous version overwrote the
        caller's data in place through boolean-mask assignment).
    """
    center = array.mean()
    spread = k * array.std()
    # clip() returns a fresh object, so the caller's data is never modified
    return array.clip(center - spread, center + spread)

In [96]:
# Apply cap_outliers to each column (axis=0 hands the function one column at a time)
X_train_filtered1 = X_train_filtered.apply(cap_outliers, axis=0)

# Box plot of the capped features, for comparison with the earlier box plot
plt.figure(figsize=(15,8))
plt.xticks(rotation=45)  # angle the x tick labels
sns.boxplot(data = X_train_filtered1)

## Feature Scaling

In [97]:
# Standardise the capped features; the scaler is fitted on the training data only
scale = StandardScaler()
scale.fit(X_train_filtered1)
X_train_filtered2 = scale.transform(X_train_filtered1)

In [98]:
# Box plot of the scaled features; the scaler output is re-wrapped in a DataFrame
# so the columns carry the feature names from new_vars
plt.figure(figsize=(15,8))
plt.xticks(rotation=45)  # angle the x tick labels
sns.boxplot(data = pd.DataFrame(X_train_filtered2, columns=new_vars))

In [99]:
# Heatmap of the pairwise correlations between the scaled features
plt.figure(figsize=(10,8))
sns.heatmap(pd.DataFrame(X_train_filtered2, columns=new_vars).corr())

In [100]:
# Distribution of the target (the churn_probability column) in the training split
sns.histplot(y_train)

# 5. Feature engineering and selection

In [101]:
# Random forest on the scaled features, used here to obtain feature importances.
# random_state pins the bootstrap and feature sampling so the importances are
# the same on every run.
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
rf.fit(X_train_filtered2, y_train)

In [102]:
# Pair each retained column name with the importance the fitted forest assigned to it
importance_table = {'col': new_vars, 'importance': rf.feature_importances_}
feature_importances = pd.DataFrame(importance_table)

In [103]:
# Bar chart: one bar per feature, height = random-forest importance
plt.figure(figsize=(15,8))
plt.xticks(rotation=45)  # angle the x tick labels
plt.bar(feature_importances['col'], feature_importances['importance'])

In [104]:
# PCA with no n_components given, so scikit-learn's default number of components applies
pca = PCA()
pca_components = pca.fit_transform(X_train_filtered2)
# Scatter of the first two principal components, coloured by the target
sns.scatterplot(x=pca_components[:,0], y=pca_components[:,1], hue=y_train)

In [105]:
# Same view for the second and third principal components (column indices 1 and 2)
sns.scatterplot(x=pca_components[:,1], y=pca_components[:,2], hue=y_train)

In [106]:
# Random forest on the principal components, to see which components carry signal.
# random_state keeps the importances reproducible between runs.
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
rf.fit(pca_components, y_train)

# One label per component actually produced by the PCA. The count was hard-coded
# as 16, which makes the DataFrame constructor raise ValueError as soon as the
# number of retained columns differs from 16.
component_names = ['component_'+str(i) for i in range(pca_components.shape[1])]
feature_importances = pd.DataFrame({'col':component_names,
                                    'importance':rf.feature_importances_})

# Bar chart: one bar per principal component, height = random-forest importance
plt.figure(figsize=(15,8))
plt.xticks(rotation=45)
plt.bar(feature_importances['col'], feature_importances['importance'])

# 6. Model building

In [107]:
# Baseline: logistic regression (SAG solver) trained on the first two principal components only
lr = LogisticRegression(max_iter=1000, tol=0.001, solver='sag')
lr.fit(pca_components[:,:2], y_train)

In [108]:
# Score of the baseline on the same data it was trained on (not a held-out estimate)
lr.score(pca_components[:,:2], y_train)

In [109]:
# Fresh, unfitted estimators for the end-to-end pipeline.
# Note: this rebinds imp, scale, pca and lr, which earlier cells bound to
# estimators already fitted during exploration.
imp = SimpleImputer(strategy='constant', fill_value=0)
scale = StandardScaler()
pca = PCA(n_components=10)  # keep 10 components here, unlike the exploratory PCA() above
lr = LogisticRegression(max_iter=1000, tol=0.001)

In [110]:
# Chain the preprocessing stages and the classifier so they are fitted and
# applied together, in this order
pipeline_steps = [
    ('imputation', imp),
    ('scaling', scale),
    ('pca', pca),
    ('model', lr),
]
pipe = Pipeline(pipeline_steps)

In [111]:
# Fit the whole pipeline on the training rows, restricted to the retained columns.
# The input is X_train itself (missing values included); imputation happens inside the pipeline.
# NOTE(review): the cap_outliers step used during exploration is not part of this pipeline.
pipe.fit(X_train[new_vars], y_train)

In [112]:
# Pipeline score on the data it was fitted on
train_score = pipe.score(X_train[new_vars], y_train)
print("Training accuracy:", train_score)

In [113]:
# Pipeline score on the held-out 20% split
test_score = pipe.score(X_test[new_vars], y_test)
print("Test accuracy:", test_score)

In [114]:
# Confusion matrix on the training split (true labels first, predictions second)
confusion_matrix(y_train, pipe.predict(X_train[new_vars]))

In [115]:
# Confusion matrix on the held-out split (true labels first, predictions second)
confusion_matrix(y_test, pipe.predict(X_test[new_vars]))

In [116]:
# Precision on the held-out split, using scikit-learn's default settings
precision_score(y_test, pipe.predict(X_test[new_vars]))

In [117]:
# Recall on the held-out split, using scikit-learn's default settings
recall_score(y_test, pipe.predict(X_test[new_vars]))

# 7. Creating submission file

In [118]:
# Preview sample.csv (presumably the expected submission layout — confirm)
sample.head()

In [119]:
# Preview the rows read from test.csv, which are scored below
unseen.head()

In [120]:
# Shape the unseen rows like the training features: index by 'id', keep the retained columns
submission_data = unseen.set_index('id')[new_vars]
submission_data.shape

In [121]:
# Score the unseen rows with the fitted pipeline; this adds the column to `unseen` in place.
# NOTE(review): the column is named churn_probability, but predict() returns the
# classifier's predicted labels rather than predict_proba() output — confirm that
# labels are what the submission format expects.
unseen['churn_probability'] = pipe.predict(submission_data)
# Keep only the two columns written to the submission file
output = unseen[['id','churn_probability']]
output.head()

In [122]:
# Write the submission to output.csv in the working directory; index=False omits the row index
output.to_csv('output.csv',index=False)