## 1. Import libraries

In [1]:
import numpy as np
import pandas as pd

# machine learning library
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor


#import XGBOOST Libraries
#import xgboost as xgb
from sklearn.grid_search import GridSearchCV

#Configure Panda
pd.options.display.width = 200



## 2. Loading and pre-processing data
!!! ATTENTION: In order to load the following files you first need to completely run the latest versions of Feature_Engineering_Members.ipynb and Feature_Engineering_Transactions.ipynb. !!!
### 2.1 Load the files

In [2]:
#Load data in
# From train.csv, we will extract the is_churn and use it as the y-label for training. 
train = pd.read_csv('data/train.csv')

# From train_v2.csv (the churn data for march), we will extract the is_churn and use it as the y-label for training
test_march = pd.read_csv('data/train_v2.csv')
# From sample_submission_v2.csv, we will extract the msno's 
test = pd.read_csv('data/sample_submission_v2.csv')

# This is the input for our model
transactions = pd.read_csv('data/final_transactions.csv')
final_members=pd.read_csv('data/final_members.csv')

### 2.2 Merge test and test_label
To get the exact number of rows required for the submission file. 
NB! Some msno from sample_submission_v2 does not have a prediction for March, for the msno's without a prediction, the is_churn is set to 0. 

In [3]:
test_merged = pd.merge(test,test_march, on='msno', how='left')
test_label = test_merged
print(test_label.head())
#Drop the is_churn column from sample_submission_v2.csv (all zeros)
test_label = test_label.drop('is_churn_x', axis=1)

#Some msno does not have a prediction for march, set all NaN values to 0.
test_label = test_label.fillna(0)
print(test_label.shape)
print(test_label.head())

                                           msno  is_churn_x  is_churn_y
0  4n+fXlyJvfQnTeKXTWT507Ll4JVYGrOC8LHCfwBmPE4=           0         0.0
1  aNmbC1GvFUxQyQUidCVmfbQ0YeCuwkPzEdQ0RwWyeZM=           0         0.0
2  rFC9eSG/tMuzpre6cwcMLZHEYM89xY02qcz7HL4//jc=           0         0.0
3  WZ59dLyrQcE7ft06MZ5dj40BnlYQY7PHgg/54+HaCSE=           0         0.0
4  aky/Iv8hMp1/V/yQHLtaVuEmmAxkB5GuasQZePJ7NU4=           0         0.0
(907471, 2)
                                           msno  is_churn_y
0  4n+fXlyJvfQnTeKXTWT507Ll4JVYGrOC8LHCfwBmPE4=         0.0
1  aNmbC1GvFUxQyQUidCVmfbQ0YeCuwkPzEdQ0RwWyeZM=         0.0
2  rFC9eSG/tMuzpre6cwcMLZHEYM89xY02qcz7HL4//jc=         0.0
3  WZ59dLyrQcE7ft06MZ5dj40BnlYQY7PHgg/54+HaCSE=         0.0
4  aky/Iv8hMp1/V/yQHLtaVuEmmAxkB5GuasQZePJ7NU4=         0.0


### 2.3 Merge the different files

In [4]:
# Creating datasets witgh input&outputs
train_data = pd.merge(train,final_members,on='msno',how='left')
train_data = pd.merge(train_data,transactions,how='left',on='msno',left_index=True, right_index=True)

# Creating datasets with only inputs
# Note that the submission_v2.csv file does NOT contain ouputs (they are all 0)
test_data = pd.merge(test_label,final_members,on='msno',how='left')
test_data = pd.merge(test_data,transactions,how='left',on='msno',left_index=True, right_index=True)

#print(data.shape)

### 2.4 Data cleaning

In [5]:
#check for null values
print(train_data.isnull().sum())

msno                         0
is_churn                     0
bd                      115770
city_1                  115770
city_3                  115770
city_4                  115770
city_5                  115770
city_6                  115770
city_7                  115770
city_8                  115770
city_9                  115770
city_10                 115770
city_11                 115770
city_12                 115770
city_13                 115770
city_14                 115770
city_15                 115770
city_16                 115770
city_17                 115770
city_18                 115770
city_19                 115770
city_20                 115770
city_21                 115770
city_22                 115770
reg_year_2012           115770
reg_year_2013           115770
reg_year_2014           115770
reg_year_2015           115770
reg_year_2016           115770
reg_year_2017           115770
                         ...  
plan_list_price_150          0
plan_lis

In [6]:
#Get rid of null-values

#For train data
#Set city null values to o in train data
cities = ['city_1','city_3','city_4','city_5','city_6','city_7','city_8','city_9','city_10','city_11','city_12','city_13','city_14','city_15','city_16','city_17','city_18','city_19','city_20','city_21','city_22']
for i in range(0,len(cities)):
        inpt = cities[i]
        train_data[inpt]=train_data[inpt].fillna(value=0)
        test_data[inpt]=test_data[inpt].fillna(value=0)
        
reg_dates=['reg_year_2012','reg_year_2013','reg_year_2014','reg_year_2015','reg_year_2016','reg_year_2017']        
for i in range(0,len(reg_dates)):
        inpt = reg_dates[i]
        train_data[inpt]=train_data[inpt].fillna(value=0)
        test_data[inpt]=test_data[inpt].fillna(value=0)

reg_meth=['reg_3','reg_4','reg_7','reg_9','reg_11']
for i in range(0,len(reg_meth)):
        inpt = reg_meth[i]
        train_data[inpt]=train_data[inpt].fillna(value=0)
        test_data[inpt]=test_data[inpt].fillna(value=0)     
        
#train_data['bd_norm ']=train_data['bd_norm'].fillna(value=0)
#test_data['bd_norm']=test_data['bd_norm'].fillna(value=0)     

#Set registration dates null values to o in train data
#train_data[['reg_year_2017']] = train_data[['reg_year_2017']].fillna(value=0)
#test_data[['reg_year_2017']] = test_data[['reg_year_2017']].fillna(value=0)

#check for null values
print(train_data.isnull().sum())
print(test_data.isnull().sum())

msno                         0
is_churn                     0
bd                      115770
city_1                       0
city_3                       0
city_4                       0
city_5                       0
city_6                       0
city_7                       0
city_8                       0
city_9                       0
city_10                      0
city_11                      0
city_12                      0
city_13                      0
city_14                      0
city_15                      0
city_16                      0
city_17                      0
city_18                      0
city_19                      0
city_20                      0
city_21                      0
city_22                      0
reg_year_2012                0
reg_year_2013                0
reg_year_2014                0
reg_year_2015                0
reg_year_2016                0
reg_year_2017                0
                         ...  
plan_list_price_150          0
plan_lis

In [7]:
#Drop unwanted columns (this may change with new insights or new code!)
unwanted = ['msno','transaction_date','membership_expire_date']
#Before dropping the msno of test, we need to save it for the sumission
msno = test_march.msno
train_data = train_data.drop(unwanted, axis=1)
test_data = test_data.drop(unwanted, axis=1)

In [8]:
# Splitting input/output data into train and test sets in order to check efficiency of our models
data_input = train_data.drop('is_churn',axis=1)
data_output = train_data['is_churn']
print(data_input.shape)
print(data_output.shape)

# Removing is_churn (as it's all dummy zeros) from test data
test_input = test_data.drop('is_churn_y',axis=1)
test_output = test_label['is_churn_y']
print(test_input.shape)
print(test_output.shape)

(992931, 113)
(992931,)
(907471, 113)
(907471,)


## 3. Run xgboost

In [18]:
#XG Boost
import xgboost as xgb
dtrain = xgb.DMatrix(data_input, label = data_output)
dtest = xgb.DMatrix(test_input, label = test_output)
print('Done.')
param = {
    'max_depth': 4,  # the maximum depth of each tree. Try with max_depth: 2 to 10.
    'eta': 0.3,  # the training step for each iteration. Try with ETA: 0.1, 0.2, 0.3...
    'silent': 1,  # logging mode - quiet
    'objective': 'multi:softprob',  # error evaluation for multiclass training
    'num_class': 3}  # the number of classes that exist in this datset
num_round = 20  # the number of training iterations. Try with num_round around few hundred!
#----------------
bst = xgb.train(param, dtrain, num_round)
print('Modeling done!')

y_pred_xgb = bst.predict(dtest)
print('Prediction done!')

best_preds = np.asarray([np.argmax(line) for line in y_pred_xgb])

y_pred_xgb = y_pred_xgb[:,1] #Column 2 out of 3

Done.
Modeling done!
Prediction done!


In [19]:
from  sklearn.metrics import log_loss
y = test_data[test_data.columns[0]]
print("Logloss for XGBoost is: %.3f"%log_loss(y,y_pred_xgb))

Logloss for XGBoost is: 0.097


4. Creating submission file

In [20]:
#Prepare submission file
my_submission = pd.DataFrame({'msno': msno, 'is_churn': y_pred_xgb})
#think about changing y_pred_1 if you want to use a diffrenet algorithm
#y_pred_1 was one of the reasons for the previous bug
print(my_submission.head())
cols = my_submission.columns.tolist()
cols = cols[-1:] + cols[:-1]
my_submission = my_submission[cols]
print(my_submission.head())
print(my_submission.count())

my_submission.to_csv('submission_3.csv', index=False)
print('Done! :-)')

   is_churn                                          msno
0  0.016168  4n+fXlyJvfQnTeKXTWT507Ll4JVYGrOC8LHCfwBmPE4=
1  0.058358  aNmbC1GvFUxQyQUidCVmfbQ0YeCuwkPzEdQ0RwWyeZM=
2  0.040174  rFC9eSG/tMuzpre6cwcMLZHEYM89xY02qcz7HL4//jc=
3  0.046575  WZ59dLyrQcE7ft06MZ5dj40BnlYQY7PHgg/54+HaCSE=
4  0.167030  aky/Iv8hMp1/V/yQHLtaVuEmmAxkB5GuasQZePJ7NU4=
                                           msno  is_churn
0  4n+fXlyJvfQnTeKXTWT507Ll4JVYGrOC8LHCfwBmPE4=  0.016168
1  aNmbC1GvFUxQyQUidCVmfbQ0YeCuwkPzEdQ0RwWyeZM=  0.058358
2  rFC9eSG/tMuzpre6cwcMLZHEYM89xY02qcz7HL4//jc=  0.040174
3  WZ59dLyrQcE7ft06MZ5dj40BnlYQY7PHgg/54+HaCSE=  0.046575
4  aky/Iv8hMp1/V/yQHLtaVuEmmAxkB5GuasQZePJ7NU4=  0.167030
msno        907471
is_churn    907471
dtype: int64
Done! :-)


### 4. Cross validation

In [11]:
#Impoirt libraries for cross validation
import xgboost as xgb

from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Imputer
from sklearn.model_selection import cross_val_score

### Cross Validation with XGBoost (really slow, 30 - 45 minutes)

In [12]:

#Create the pipeline
my_pipeline = make_pipeline(Imputer(), xgb.XGBClassifier())
print('Pipeline created.')
#Get cross validation scores
scores = cross_val_score(my_pipeline, data_input, data_output, scoring='neg_log_loss', cv=3)
print(scores)

#Single measure of model quality
print('Log Loss %2f' %(-1 * scores.mean()))


Pipeline created.
[-0.21899533 -0.21797433 -0.21844548]
Log Loss 0.218472
