## Task : Predict the churn score for a website based on the features provided in the dataset
https://www.hackerearth.com/challenges/competitive/hackerearth-machine-learning-challenge-predict-customer-churn/machine-learning/predict-the-churn-risk-rate-11-fb7a760d/

Churn rate is a marketing metric that describes the number of customers who leave a business over a specific time period. Every user is assigned a prediction value that estimates their state of churn at any given time. This value is based on:

* User demographic information
* Browsing behavior
* Historical purchase data among other information 

It factors in our unique and proprietary predictions of how long a user will remain a customer. This score is updated every day for all users who have a minimum of one conversion. The values assigned are between 1 and 5.

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the raw train / test splits from disk.
train = pd.read_csv(r'F:\Projects\Hackathons\HE\churn_risk_rate\dataset\train.csv')
test = pd.read_csv(r'F:\Projects\Hackathons\HE\churn_risk_rate\dataset\test.csv')

# Empty frame carrying only the customer_id / churn_risk_score column labels.
# NOTE(review): customer_id is removed from `test` just below, so it is no
# longer available from `test` after this cell -- confirm how `ouput` is filled.
ouput = pd.DataFrame(columns=['customer_id', 'churn_risk_score'])

# Identifier-style columns removed from both frames before any processing.
cols_to_drop = ['customer_id', 'Name', 'security_no']
train = train.drop(columns=cols_to_drop)
test = test.drop(columns=cols_to_drop)


In [3]:
# First four rows of the first eleven columns.
train.iloc[:4, :11]

Unnamed: 0,age,gender,region_category,membership_category,joining_date,joined_through_referral,referral_id,preferred_offer_types,medium_of_operation,internet_option,last_visit_time
0,18,F,Village,Platinum Membership,2017-08-17,No,xxxxxxxx,Gift Vouchers/Coupons,?,Wi-Fi,16:08:02
1,32,F,City,Premium Membership,2017-08-28,?,CID21329,Gift Vouchers/Coupons,Desktop,Mobile_Data,12:38:13
2,44,F,Town,No Membership,2016-11-11,Yes,CID12313,Gift Vouchers/Coupons,Desktop,Wi-Fi,22:53:21
3,37,M,City,No Membership,2016-10-29,Yes,CID3793,Gift Vouchers/Coupons,Desktop,Mobile_Data,15:57:50


In [4]:
# First four rows of every column from position 11 onwards.
train.iloc[:4, 11:]

Unnamed: 0,days_since_last_login,avg_time_spent,avg_transaction_value,avg_frequency_login_days,points_in_wallet,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback,churn_risk_score
0,17,300.63,53005.25,17.0,781.75,Yes,Yes,No,Not Applicable,Products always in Stock,2
1,16,306.34,12838.38,10.0,,Yes,No,Yes,Solved,Quality Customer Care,1
2,14,516.16,21027.0,22.0,500.69,No,Yes,Yes,Solved in Follow-up,Poor Website,5
3,11,53.27,25239.56,6.0,567.66,No,Yes,Yes,Unsolved,Poor Website,5


In [6]:
# Categorical columns (and the target) whose value distribution is printed below.
cols = [
    'gender',
    'region_category',
    'membership_category',
    'joined_through_referral',
    'preferred_offer_types',
    'medium_of_operation',
    'internet_option',
    'used_special_discount',
    'offer_application_preference',
    'past_complaint',
    'complaint_status',
    'feedback',
    'churn_risk_score',
]

for column_name in cols:
    # Header, frequency table, then a separator for each column.
    print('---------|', column_name, ' |------------')
    print(train[column_name].value_counts())
    print('\n==============================================================================\n')

---------| gender  |------------
F          18490
M          18443
Unknown       59
Name: gender, dtype: int64


---------| region_category  |------------
Town       14128
City       12737
Village     4699
Name: region_category, dtype: int64


---------| membership_category  |------------
Basic Membership       7724
No Membership          7692
Gold Membership        6795
Silver Membership      5988
Premium Membership     4455
Platinum Membership    4338
Name: membership_category, dtype: int64


---------| joined_through_referral  |------------
No     15839
Yes    15715
?       5438
Name: joined_through_referral, dtype: int64


---------| preferred_offer_types  |------------
Gift Vouchers/Coupons       12349
Credit/Debit Card Offers    12274
Without Offers              12081
Name: preferred_offer_types, dtype: int64


---------| medium_of_operation  |------------
Desktop       13913
Smartphone    13876
?              5393
Both           3810
Name: medium_of_operation, dtype: int64


---

In [7]:
# Per-column NaN counts in train, keeping only columns that have at least one.
train_missing = train.isna().sum()
train_missing[train_missing > 0]

region_category          5428
preferred_offer_types     288
points_in_wallet         3443
dtype: int64

In [8]:
# Per-column NaN counts in test, keeping only columns that have at least one.
test_missing = test.isna().sum()
test_missing[test_missing > 0]

region_category          2948
preferred_offer_types     159
points_in_wallet         1963
dtype: int64

In [9]:
# Frequency of each gender label in the training frame.
train['gender'].value_counts()

F          18490
M          18443
Unknown       59
Name: gender, dtype: int64

### Missing Values

In [10]:
# Number of training rows whose gender is the placeholder 'Unknown'.
int((train['gender'] == 'Unknown').sum())

59

In [11]:
# Number of training rows whose joined_through_referral is the placeholder '?'.
int((train['joined_through_referral'] == '?').sum())

5438

In [12]:
# Number of training rows whose medium_of_operation is the placeholder '?'.
int((train['medium_of_operation'] == '?').sum())

5393

In [13]:
# Shape of the subset of rows where avg_frequency_login_days holds the string 'Error'.
train.loc[train['avg_frequency_login_days'] == 'Error'].shape

(3522, 22)

### Imputation for Missing Values

In [14]:
from sklearn.impute import SimpleImputer

# Treat the 'Unknown' label as missing and replace it with the most frequent
# gender observed in the training data.
imputer_gender = SimpleImputer(missing_values='Unknown', strategy='most_frequent')

# The imputer expects 2-D input, hence the single-column reshape.
gender_train_2d = train.loc[:, 'gender'].to_numpy().reshape(-1, 1)
gender_test_2d = test.loc[:, 'gender'].to_numpy().reshape(-1, 1)

# Fit on train only; apply the same fill value to both frames.
imputer_gender.fit(gender_train_2d)
train.loc[:, 'gender'] = imputer_gender.transform(gender_train_2d)
test.loc[:, 'gender'] = imputer_gender.transform(gender_test_2d)

In [15]:
from sklearn.impute import SimpleImputer

# '?' marks an unknown medium; replace it with the most frequent training value.
imputer_medium_of_operation = SimpleImputer(missing_values='?', strategy='most_frequent')

# The imputer expects 2-D input, hence the single-column reshape.
medium_train_2d = train.loc[:, 'medium_of_operation'].to_numpy().reshape(-1, 1)
medium_test_2d = test.loc[:, 'medium_of_operation'].to_numpy().reshape(-1, 1)

# Fit on train only; apply the same fill value to both frames.
imputer_medium_of_operation.fit(medium_train_2d)
train.loc[:, 'medium_of_operation'] = imputer_medium_of_operation.transform(medium_train_2d)
test.loc[:, 'medium_of_operation'] = imputer_medium_of_operation.transform(medium_test_2d)

In [16]:
# The column mixes numbers with the literal string 'Error'; map that string to
# NaN, then convert the whole column to a numeric dtype.
train['avg_frequency_login_days'] = pd.to_numeric(
    train['avg_frequency_login_days'].replace({'Error': np.nan})
)

test['avg_frequency_login_days'] = pd.to_numeric(
    test['avg_frequency_login_days'].replace({'Error': np.nan})
)

In [17]:
from sklearn.impute import SimpleImputer

# Fill NaNs in avg_frequency_login_days with the training-set mean.
# NOTE(review): a later cell takes abs() of this column after imputation, so any
# negative raw values influence the mean used here -- confirm the intended order.
imputer_avg_f_login_days = SimpleImputer(missing_values=np.nan, strategy='mean')

# The imputer expects 2-D input, hence the single-column reshape.
login_days_train_2d = train.loc[:, 'avg_frequency_login_days'].to_numpy().reshape(-1, 1)
login_days_test_2d = test.loc[:, 'avg_frequency_login_days'].to_numpy().reshape(-1, 1)

# Fit on train only; apply the same mean to both frames.
imputer_avg_f_login_days.fit(login_days_train_2d)
train.loc[:, 'avg_frequency_login_days'] = imputer_avg_f_login_days.transform(login_days_train_2d)
test.loc[:, 'avg_frequency_login_days'] = imputer_avg_f_login_days.transform(login_days_test_2d)

In [18]:
# Resolve the '?' placeholder in joined_through_referral from referral_id:
# the sentinel id 'xxxxxxxx' becomes 'No', any other id becomes 'Yes'.
# Rows that already hold a real answer are left untouched.
#
# Boolean masks replace the original per-row Python loops: same result on a
# default 0..n-1 index, but far faster and independent of the index labels.
for frame in (train, test):
    unknown_referral = frame['joined_through_referral'] == '?'
    no_referral_id = frame['referral_id'] == 'xxxxxxxx'

    frame.loc[unknown_referral & no_referral_id, 'joined_through_referral'] = 'No'
    frame.loc[unknown_referral & ~no_referral_id, 'joined_through_referral'] = 'Yes'

In [19]:
# Categorical columns with NaNs, filled with their most frequent value.
cols = ['region_category', 'preferred_offer_types']

imputer_cat = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

# Learn the fill values from the training data only ...
train.loc[:, cols] = imputer_cat.fit_transform(train.loc[:, cols])
# ... and reuse them on the test data. The original re-fitted the imputer on
# test (fit_transform), which is inconsistent with the other imputation cells
# and lets test-set statistics drive the preprocessing.
test.loc[:, cols] = imputer_cat.transform(test.loc[:, cols])

In [20]:
# Fill NaNs in points_in_wallet with the mean of the training column.
imputer_points_in_wallet = SimpleImputer(missing_values=np.nan, strategy='mean')

# Fit on train only (the imputer expects 2-D input, hence the reshape) ...
train.loc[:, 'points_in_wallet'] = imputer_points_in_wallet.fit_transform(
    train.loc[:, 'points_in_wallet'].to_numpy().reshape(-1, 1)
)
# ... and apply the training mean to test. The original called fit_transform
# here, which silently replaced the training mean with the test-set mean.
test.loc[:, 'points_in_wallet'] = imputer_points_in_wallet.transform(
    test.loc[:, 'points_in_wallet'].to_numpy().reshape(-1, 1)
)

In [21]:
# Columns not carried forward into the feature matrix.
unused_cols = ['referral_id', 'joining_date', 'last_visit_time']

train = train.drop(columns=unused_cols)
test = test.drop(columns=unused_cols)

In [22]:
# Clean up negative values in three numeric columns, identically for both frames:
#   * avg_time_spent, avg_frequency_login_days -> absolute value
#   * days_since_last_login                    -> negatives floored at 0
#
# clip(lower=0) replaces the original per-row loops over .loc[i, ...]: it gives
# the same result on a default 0..n-1 index, leaves NaN untouched, and no longer
# depends on the index labels.
for frame in (train, test):
    frame['avg_time_spent'] = frame['avg_time_spent'].abs()
    frame['avg_frequency_login_days'] = frame['avg_frequency_login_days'].abs()
    frame['days_since_last_login'] = frame['days_since_last_login'].clip(lower=0)

# Encoding

In [23]:
# import category_encoders as ce
# from category_encoders import TargetEncoder

# encoder_membership = ce.TargetEncoder(cols= 'membership_category') 
# train.loc[:,'membership_category'] = encoder_membership.fit_transform(train['membership_category'],train['churn_risk_score'])
# test.loc[:,'membership_category'] = encoder_membership.transform(test.loc[:,'membership_category'])

# encoder_complaint = ce.TargetEncoder(cols= 'complaint_status') 
# train.loc[:,'complaint_status'] = encoder_complaint.fit_transform(train['complaint_status'],train['churn_risk_score'])
# test.loc[:,'complaint_status'] = encoder_complaint.transform(test.loc[:,'complaint_status'])

# encoder_feedback = ce.TargetEncoder(cols= 'feedback') 
# train.loc[:,'feedback'] = encoder_feedback.fit_transform(train['feedback'],train['churn_risk_score'])
# test.loc[:,'feedback'] = encoder_feedback.transform(test.loc[:,'feedback'])

In [24]:
# Separate the target from the predictors by column name rather than by
# position, so the split stays correct even if the column order changes.
# With churn_risk_score as the last column this matches the positional split.
y_train = train['churn_risk_score']
X_train = train.drop(columns='churn_risk_score')

# The test frame has no target column; it is used as-is (same object, not a copy).
X_test = test

In [26]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Categorical predictors that get one-hot encoded.
cols_2_onehotencode = [
    'gender',
    'region_category',
    'membership_category',
    'joined_through_referral',
    'preferred_offer_types',
    'medium_of_operation',
    'internet_option',
    'used_special_discount',
    'offer_application_preference',
    'past_complaint',
    'complaint_status',
    'feedback',
]

# Encode the listed columns; all remaining columns are passed through unchanged.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), cols_2_onehotencode)],
    remainder='passthrough',
)

# Categories are learned from the training frame and reused for the test frame.
X_train = ct.fit_transform(X_train)
X_test = ct.transform(X_test)


In [27]:
# Display the training feature matrix produced by the column transformer.
X_train

array([[1.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        5.30052500e+04, 1.70000000e+01, 7.81750000e+02],
       [1.00000000e+00, 0.00000000e+00, 1.00000000e+00, ...,
        1.28383800e+04, 1.00000000e+01, 6.86882199e+02],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        2.10270000e+04, 2.20000000e+01, 5.00690000e+02],
       ...,
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        3.81275600e+04, 1.59767152e+01, 6.80470000e+02],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, ...,
        2.37886000e+03, 2.00000000e+01, 1.97264414e+02],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, ...,
        2.18968000e+03, 1.59767152e+01, 7.19970000e+02]])

In [28]:
# Display the test feature matrix produced by the column transformer.
X_test

array([[1.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        4.07214400e+04, 7.00000000e+00, 7.33830000e+02],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, ...,
        9.64440000e+03, 9.00000000e+00, 7.26000000e+02],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        3.69325000e+03, 2.10000000e+01, 7.13780000e+02],
       ...,
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, ...,
        1.79037500e+04, 2.40000000e+01, 5.64300000e+02],
       [1.00000000e+00, 0.00000000e+00, 1.00000000e+00, ...,
        1.40570900e+04, 2.20000000e+01, 6.06340000e+02],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, ...,
        3.67864400e+04, 1.59767152e+01, 1.19368981e+03]])

In [29]:
# (rows, columns) of the encoded training matrix.
X_train.shape

(36992, 48)

In [30]:
# (rows, columns) of the encoded test matrix; the column count should match X_train.
X_test.shape

(19919, 48)

In [41]:
from sklearn.preprocessing import StandardScaler

# Standardise every column of the encoded matrices (one-hot indicator columns
# included) using statistics learned from the training matrix only.
sc = StandardScaler()
sc.fit(X_train)

X_train = sc.transform(X_train)
X_test = sc.transform(X_test)