# Algorithm Solution (for transactions)

## 1. Loading libraries

In [1]:
import numpy as np
import pandas as pd

# machine learning library
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import log_loss
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor


#import XGBOOST Libraries
import xgboost as xgb
from sklearn.grid_search import GridSearchCV

#Configure Panda
pd.options.display.width = 200



## 2. Loading and pre-processing data
### !!! ATTENTION: In order to load the following files you first need to completely run the latest versions of Feature_Engineering_Members.ipynb and Feature_Engineering_Transactions.ipynb. !!!
### 2.1 Load the files

In [2]:
#Load data in
train = pd.read_csv('data/train_v2.csv')
test = pd.read_csv('data/sample_submission_v2.csv')
transactions = pd.read_csv('data/final_transactions.csv')
final_members=pd.read_csv('data/final_members.csv')

### 2.2 Merge the different files

In [3]:
# Creating datasets witgh input&outputs
train_data = pd.merge(train,final_members,on='msno',how='left')
train_data = pd.merge(train_data,transactions,how='left',on='msno',left_index=True, right_index=True)

# Creating datasets with only inputs
# Note that the submission_v2.csv file does NOT contain ouputs (they are all 0)
test_data = pd.merge(test,final_members,on='msno',how='left')
test_data = pd.merge(test_data,transactions,how='left',on='msno',left_index=True, right_index=True)

#print(data.shape)

### 2.3 Data cleaning

In [4]:
#check for null values
print(train_data.isnull().sum())

msno                           0
is_churn                       0
city_1                    109993
city_3                    109993
city_4                    109993
city_5                    109993
city_6                    109993
city_7                    109993
city_8                    109993
city_9                    109993
city_10                   109993
city_11                   109993
city_12                   109993
city_13                   109993
city_14                   109993
city_15                   109993
city_16                   109993
city_17                   109993
city_18                   109993
city_19                   109993
city_20                   109993
city_21                   109993
city_22                   109993
reg_name                  109993
reg_month                 109993
actual_amount_paid             0
is_auto_renew                  0
transaction_date               0
membership_expire_date         0
is_cancel                      0
          

In [5]:
#Get rid of null-values

#For train data
#Set city null values to o in train data
cities = ['city_1','city_3','city_4','city_5','city_6','city_7','city_8','city_9','city_10','city_11','city_12','city_13','city_14','city_15','city_16','city_17','city_18','city_19','city_20','city_21','city_22']
for i in range(0,len(cities)):
        inpt = cities[i]
        train_data[inpt]=train_data[inpt].fillna(value=0)
        test_data[inpt]=test_data[inpt].fillna(value=0)

#Set registration dates null values to o in train data
train_data[['reg_name','reg_month']] = train_data[['reg_name','reg_month']].fillna(value=0)
test_data[['reg_name','reg_month']] = test_data[['reg_name','reg_month']].fillna(value=0)

#check for null values
print(train_data.isnull().sum())
print(test_data.isnull().sum())


msno                      0
is_churn                  0
city_1                    0
city_3                    0
city_4                    0
city_5                    0
city_6                    0
city_7                    0
city_8                    0
city_9                    0
city_10                   0
city_11                   0
city_12                   0
city_13                   0
city_14                   0
city_15                   0
city_16                   0
city_17                   0
city_18                   0
city_19                   0
city_20                   0
city_21                   0
city_22                   0
reg_name                  0
reg_month                 0
actual_amount_paid        0
is_auto_renew             0
transaction_date          0
membership_expire_date    0
is_cancel                 0
                         ..
plan_list_price_150       0
plan_list_price_180       0
plan_list_price_210       0
plan_list_price_265       0
plan_list_price_298 

In [6]:
#Drop unwanted columns (this may change with new insights or new code!)
unwanted = ['msno','actual_amount_paid','transaction_date','membership_expire_date', 'diff_plan_actual','reg_name','reg_name']
#Before dropping the msno of test, we need to save it for the sumission
msno = test.msno
train_data = train_data.drop(unwanted, axis=1)
test_data = test_data.drop(unwanted, axis=1)


In [7]:
# Splitting input/output data into train and test sets in order to check efficiency of our models
data_input = train_data.drop('is_churn',axis=1)
data_output = train_data['is_churn']
print(data_input.shape)
print(data_output.shape)
x_train, x_test, y_train, y_test = train_test_split(data_input, data_output, test_size=0.2, random_state=42)

# Removing is_churn (as it's all dummy zeros) from test data
test_input = test_data.drop('is_churn',axis=1)

(970960, 142)
(970960,)


## 3. Prediction models

### 4.1. Testing out prediction models

In [8]:
# Random Forrest, no training data
model = RandomForestRegressor()
model.fit(x_train, y_train)
print('Model fitted!')
y_pred_f = model.predict(x_test)
print('Prediction done!')
print("Logloss for Random Forrest is: %.2f"%log_loss(y_test,y_pred_f))

## I think we should use the logloss to measure our accuracy, as it is the same then they use on Kaggle. Also, to use accuracy, we need our input to be only 0's and 1's, thus it is not a very accurate assesment description of our model
#predictions = [round(value) for value in y_pred]
#accuracy = accuracy_score(y_test, predictions)
#print("Accuracy: %.2f%%" % (accuracy * 100.0))

Model fitted!
Prediction done!
Logloss for Random Forrest is: 0.40


In [9]:
# Linear Regression
model = LinearRegression()
model.fit(x_train, y_train)
print('Model fitted!')
# Make predicitons for test data
y_pred_l = model.predict(x_test)
y_pred_l = np.absolute(y_pred_l)
print('Prediction done!')
print("Logloss for Linear Regression is: %.2f"%log_loss(y_test,y_pred_l))

Model fitted!
Prediction done!
Logloss for Linear Regression is: 0.30


In [20]:
model.predict_proba(x_test)

AttributeError: 'LinearRegression' object has no attribute 'predict_proba'

# Run AdaBoost
model_abr = AdaBoostRegressor()
model_abr.fit(x_train, y_train)
y_pred_abr = model_abr.predict(x_test)
print("Logloss for AdaBoost is: %.2f"%log_loss(y_test,y_pred_abr))

# Run SGDRegressor (really bad)
model_sgd = SGDRegressor()
#model_sgd.fit(x_train, y_train)
#y_pred_sgd = model_sgd.predict(x_test)
print("Logloss for SGDRegressor is: %.2f"%log_loss(y_test,y_pred_sgd))

# Run GradientBoostingRegressor (very slow)
model_gbr = GradientBoostingRegressor()
#model_gbr.fit(x_train, y_train)
#y_pred_gbr = model_gbr.predict(x_test)
print("Logloss for GradientBoostingRegressor is: %.2f"%log_loss(y_test,y_pred_gbr))

In [10]:
#XG Boost
dtrain = xgb.DMatrix(x_train, label = y_train)
dtest = xgb.DMatrix(x_test, label = y_test)
print('Done.')
param = {
    'max_depth': 3,  # the maximum depth of each tree. Try with max_depth: 2 to 10.
    'eta': 0.3,  # the training step for each iteration. Try with ETA: 0.1, 0.2, 0.3...
    'silent': 1,  # logging mode - quiet
    'objective': 'multi:softprob',  # error evaluation for multiclass training
    'num_class': 3}  # the number of classes that exist in this datset
num_round = 20  # the number of training iterations. Try with num_round around few hundred!
#----------------
bst = xgb.train(param, dtrain, num_round)
print('Modeling done!')

y_pred_xgb = bst.predict(dtest)
print('Prediction done!')

best_preds = np.asarray([np.argmax(line) for line in y_pred_xgb])

y_pred_xgb = y_pred_xgb[:,1] #Column 2 out of 3

Done.
Modeling done!
Prediction done!


from  sklearn.metrics import log_loss
print("Logloss for XGBoost is: %.3f"%log_loss(y_test,y_pred_xgb))

### 3.2. Running the best model on all the data

In [11]:
# Linear Regression
model.fit(data_input, data_output)
y_pred_l = model.predict(test_input)
y_pred_l = np.absolute(y_pred_l)

## Cross validation

In [12]:
#Impoirt libraries for cross validation
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Imputer
from sklearn.model_selection import cross_val_score

### Cross-Validation for LinearReression
Tested with both cv =3 (33.33% of training set kept for validation) and cv=5 (20 % kept for validation). Both gives a MAE (Mean Absolute Error) of around 0.16, as more folds increase computation time I tink it is sufficient with 3 folds. 

LinearRegression and RandomForest regressor 

In [14]:
#Create the pipeline
my_pipeline = make_pipeline(Imputer(), LinearRegression())

#Get cross validation scores
scores = cross_val_score(my_pipeline, data_input, data_output, scoring='neg_mean_absolute_error', cv=3)
print(scores)

#Single measure of model quality
print('Mean Absolute Error %2f' %(-1 * scores.mean()))

[-0.19666584 -0.16938437 -0.13234097]
Mean Absolute Error 0.166130


### Cross-Validation for RandomForestRegressor (NB! Slow)

In [19]:
#Create the pipeline
my_pipeline = make_pipeline(Imputer(), RandomForestRegressor())
print('Pipeline created.')
#Get cross validation scores
scores = cross_val_score(my_pipeline, data_input, data_output, scoring='neg_log_loss', cv=3)
print(scores)

#Single measure of model quality
print('Log Loss %2f' %(-1 * scores.mean()))

Pipeline created.


AttributeError: 'RandomForestRegressor' object has no attribute 'predict_proba'

### Cross Validation with XGBoost (really slow, log_loss =  0.295109)
Computing the log_loss using cross validationgives the same error as when we compute the log_loss when we split the data with train_test_split. 

In [18]:
#Create the pipeline
my_pipeline = make_pipeline(Imputer(), xgb.XGBClassifier())
print('Pipeline created.')
#Get cross validation scores
scores = cross_val_score(my_pipeline, data_input, data_output, scoring='neg_log_loss', cv=3)
print(scores)

#Single measure of model quality
print('Log Loss %2f' %(-1 * scores.mean()))

Pipeline created.
[-0.29521446 -0.2949719  -0.29514077]
Mean Absolute Error 0.295109


In [22]:
print(np.var(scores))

1.0309626269710118e-08


## 4. Creating submission file

In [45]:
#Prepare submission file
my_submission = pd.DataFrame({'msno': msno, 'is_churn': y_pred_l})
#think about changing y_pred_1 if you want to use a diffrenet algorithm
#y_pred_1 was one of the reasons for the previous bug
print(my_submission.head())
cols = my_submission.columns.tolist()
cols = cols[-1:] + cols[:-1]
my_submission = my_submission[cols]
print(my_submission.head())
print(my_submission.count())

my_submission.to_csv('submission.csv', index=False)
print('Done! :-)')

   is_churn                                          msno
0  0.056911  4n+fXlyJvfQnTeKXTWT507Ll4JVYGrOC8LHCfwBmPE4=
1  0.122503  aNmbC1GvFUxQyQUidCVmfbQ0YeCuwkPzEdQ0RwWyeZM=
2  0.121942  rFC9eSG/tMuzpre6cwcMLZHEYM89xY02qcz7HL4//jc=
3  0.054859  WZ59dLyrQcE7ft06MZ5dj40BnlYQY7PHgg/54+HaCSE=
4  0.055925  aky/Iv8hMp1/V/yQHLtaVuEmmAxkB5GuasQZePJ7NU4=
                                           msno  is_churn
0  4n+fXlyJvfQnTeKXTWT507Ll4JVYGrOC8LHCfwBmPE4=  0.056911
1  aNmbC1GvFUxQyQUidCVmfbQ0YeCuwkPzEdQ0RwWyeZM=  0.122503
2  rFC9eSG/tMuzpre6cwcMLZHEYM89xY02qcz7HL4//jc=  0.121942
3  WZ59dLyrQcE7ft06MZ5dj40BnlYQY7PHgg/54+HaCSE=  0.054859
4  aky/Iv8hMp1/V/yQHLtaVuEmmAxkB5GuasQZePJ7NU4=  0.055925
msno        907471
is_churn    907471
dtype: int64
Done! :-)
