## Task : Predict the churn score for a website based on the features provided in the dataset.

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train = pd.read_csv(r'F:\Projects\Hackathons\HE\churn_risk_rate\dataset\train.csv')
test = pd.read_csv(r'F:\Projects\Hackathons\HE\churn_risk_rate\dataset\test.csv')
# Dropping unimportant columns

output = pd.DataFrame(columns = ['customer_id','churn_risk_score'])
output['customer_id'] = test['customer_id']
cols_to_drop = ['customer_id', 'Name', 'security_no']
train = train.drop(cols_to_drop, axis = 1)
test = test.drop(cols_to_drop, axis = 1)


### Imputation for Missing Values

In [3]:
from sklearn.impute import SimpleImputer
imputer_gender = SimpleImputer(missing_values = 'Unknown', strategy='most_frequent')
train.loc[:,'gender'] = imputer_gender.fit_transform(train.loc[:,'gender'].to_numpy().reshape(-1,1))

test.loc[:,'gender'] = imputer_gender.transform(test.loc[:,'gender'].to_numpy().reshape(-1,1))

In [4]:
from sklearn.impute import SimpleImputer
imputer_medium_of_operation = SimpleImputer(missing_values = '?', strategy='most_frequent')
train.loc[:,'medium_of_operation'] = imputer_medium_of_operation.fit_transform(train.loc[:,'medium_of_operation'].to_numpy().reshape(-1,1))
test.loc[:,'medium_of_operation'] = imputer_medium_of_operation.transform(test.loc[:,'medium_of_operation'].to_numpy().reshape(-1,1))

In [5]:
train. avg_frequency_login_days = train.avg_frequency_login_days.replace({'Error':np.nan})
train.avg_frequency_login_days = pd.to_numeric(train.avg_frequency_login_days)

test.avg_frequency_login_days = test.avg_frequency_login_days.replace({'Error':np.nan})
test.avg_frequency_login_days = pd.to_numeric(test.avg_frequency_login_days)

In [6]:
from sklearn.impute import SimpleImputer
imputer_avg_f_login_days = SimpleImputer(missing_values = np.nan, strategy='mean')
train.loc[:,'avg_frequency_login_days'] = imputer_avg_f_login_days.fit_transform(train.loc[:,'avg_frequency_login_days'].to_numpy().reshape(-1,1))

test.loc[:,'avg_frequency_login_days'] = imputer_avg_f_login_days.transform(test.loc[:,'avg_frequency_login_days'].to_numpy().reshape(-1,1))

In [7]:
for i in range(train.shape[0]): 
    if train.loc[i, 'joined_through_referral'] == '?' : 
        if train.loc[i, 'referral_id'] == 'xxxxxxxx' : 
            train.loc[i, 'joined_through_referral'] = 'No'
        else : 
            train.loc[i, 'joined_through_referral'] = 'Yes'

for i in range(test.shape[0]):           
    if test.loc[i, 'joined_through_referral'] == '?' : 
        if test.loc[i, 'referral_id'] == 'xxxxxxxx' : 
            test.loc[i, 'joined_through_referral'] = 'No'
        else : 
            test.loc[i, 'joined_through_referral'] = 'Yes'

In [8]:
cols  = ['region_category', 'preferred_offer_types']

imputer_cat = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')
train.loc[:,cols] = imputer_cat.fit_transform(train.loc[:,cols])
test.loc[:,cols] = imputer_cat.fit_transform(test.loc[:,cols])

In [9]:
imputer_points_in_wallet = SimpleImputer(missing_values = np.nan, strategy = 'mean')

train.loc[:,'points_in_wallet'] = imputer_points_in_wallet.fit_transform(train.loc[:,'points_in_wallet'].to_numpy().reshape(-1,1))
test.loc[:,'points_in_wallet'] = imputer_points_in_wallet.fit_transform(test.loc[:,'points_in_wallet'].to_numpy().reshape(-1,1))

In [10]:
train = train.drop(['referral_id', 'joining_date', 'last_visit_time'], axis = 1)
test = test.drop(['referral_id', 'joining_date', 'last_visit_time'], axis = 1)

In [11]:
train.avg_time_spent = abs(train.avg_time_spent)
test.avg_time_spent = abs(test.avg_time_spent)

train.avg_frequency_login_days = abs(train.avg_frequency_login_days)
test.avg_frequency_login_days = abs(test.avg_frequency_login_days)

for i in range(train.shape[0]) :
    if train.loc[i,'days_since_last_login'] < 0 : 
        train.loc[i,'days_since_last_login'] = 0

for i in range(test.shape[0]) : 
    if test.loc[i,'days_since_last_login'] < 0 : 
        test.loc[i,'days_since_last_login'] = 0

# Encoding

In [12]:
X_train, y_train = train.iloc[:,:-1], train.iloc[:,-1]
X_test = test

In [13]:
cols_2_onehotencode = ['gender', 'region_category', 'membership_category',
       'joined_through_referral', 'preferred_offer_types',
       'medium_of_operation', 'internet_option', 'used_special_discount',
       'offer_application_preference', 'past_complaint', 'complaint_status',
       'feedback']


from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(transformers=[('encoder',OneHotEncoder(),cols_2_onehotencode)], remainder='passthrough')
X_train = ct.fit_transform(X_train)
X_test = ct.transform(X_test)


In [14]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [15]:
y_train = y_train.replace(-1,0)

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from datetime import datetime

start = datetime.now()

param = { 'bootstrap': [True, False],
          'max_depth': [10, 30, 50, 70, 100, None],
          'max_features': ['auto', 'sqrt'],
          'min_samples_leaf': [1, 2, 4],
          'min_samples_split': [2, 5, 10],
          'n_estimators': [100, 200, 500]}

grids = GridSearchCV(RandomForestClassifier(), param_grid = param, cv = 5, scoring='f1_weighted', n_jobs = -1).fit(X_train, y_train)

end = datetime.now()

print('Time to fit the model', end - start)
print(cross_val_score(grids, X_train, y_train, cv = 10, scoring = 'f1_weighted'))

Time to fit the model 7:52:19.497896


KeyboardInterrupt: 

In [18]:
grids.best_params_

{'bootstrap': False,
 'max_depth': 50,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 200}

In [25]:
grids.best_score_

0.7371050682707125

In [26]:
model = RandomForestClassifier(bootstrap = False, max_depth=50,max_features='sqrt',
                               min_samples_leaf=2, min_samples_split=2, n_estimators=200).fit(X_train,y_train)

In [None]:
# from sklearn.metrics import accuracy_score, f1_score
# from sklearn.model_selection import cross_val_score

# cross_val_score(model,X_train, y_train,cv = 10, scoring='f1_weighted')

In [27]:
y_pred = model.predict(X_test)

In [28]:
output['churn_risk_score'] = y_pred

In [29]:
output.to_csv('F:\Projects\Hackathons\HE\churn_risk_rate\outputs\out2.csv', header = True, index = False)

In [30]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_train, model.predict(X_train))

array([[ 1141,     0,     5,    10,     0,     7],
       [    0,  2652,     0,     0,     0,     0],
       [    0,     0,  2741,     0,     0,     0],
       [    0,     0,     0, 10424,     0,     0],
       [    0,     0,     0,     0, 10185,     0],
       [    0,     0,     0,     0,     0,  9827]], dtype=int64)