# Algorithm Solution (for transactions)

## 1. Loading libraries

In [17]:
import numpy as np
import pandas as pd

# machine learning library
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import log_loss

#import XGBOOST Libraries
#import xgboost as xgb
from sklearn.grid_search import GridSearchCV

#Configure Panda
pd.options.display.width = 200

## 2. Loading and pre-processing data
### !!! ATTENTION: In order to load the following files you first need to completely run Feature_Engineering_Members.ipynb and Feature_Engineering_Transactions.ipynb. !!!
### 2.1 Load the files

In [18]:
#Load data in
train = pd.read_csv('data/train_v2.csv')
test = pd.read_csv('data/sample_submission_v2.csv')
transactions = pd.read_csv('data/final_transactions.csv')
final_members=pd.read_csv('data/final_members.csv')
user_logs = pd.read_csv('data/final_user_logs.csv')

In [19]:
print(final_members.shape)
print(transactions.shape)
print(user_logs.shape)
print(train.shape)

(6769473, 24)
(1431009, 125)
(495677, 7)
(970960, 2)


In [20]:
print('The train data as',train['msno'].count(),'different msnos')

The train data as 970960 different msnos


### 2.2 Merge the different files

In [38]:
# Creating datasets witgh input&outputs
train_data = pd.merge(train, final_members, on='msno', how='left')
train_data = pd.merge(train_data, transactions, how='left',on='msno',left_index=True, right_index=True)
#train_data = pd.merge(train_data, user_logs, how='left',on='msno',left_index=True, right_index=True)

train_data_logs = pd.merge(train, final_members, on='msno', how='left')
train_data_logs = pd.merge(train_data_logs, transactions, how='left',on='msno',left_index=True, right_index=True)
train_data_logs = pd.merge(train_data_logs, user_logs, on='msno',left_index=True, right_index=True)

# Creating datasets with only inputs
# Note that the submission_v2.csv file does NOT contain ouputs (they are all 0)
test_data = pd.merge(test,final_members, on='msno', how='left')
test_data = pd.merge(test_data,transactions, how='left',on='msno',left_index=True, right_index=True)

test_data_logs = pd.merge(test,final_members, on='msno', how='left')
test_data_logs = pd.merge(test_data,transactions, how='left',on='msno',left_index=True, right_index=True)
test_data_logs = pd.merge(test_data, user_logs ,how='left',on='msno',left_index=True, right_index=True)


In [39]:
print(test_data.shape)
print(train_data.shape)
print(train_data_logs.shape)

(907471, 149)
(970960, 149)
(495677, 155)


In [40]:
print('The train data has',train['msno'].count(),'different msnos')
print('The test data has',test['msno'].count(),'different msnos')
print('The members data has',final_members['msno'].count(),'different msnos')
print('The transactions data has',transactions['msno'].count(),'different msnos')
print('The user logs data has',user_logs['msno'].count(),'different msnos')
print('Compared to that, the altered test data, excluding logs, has',test_data['msno'].count(),'different msnos')

The train data has 970960 different msnos
The test data has 907471 different msnos
The members data has 6769473 different msnos
The transactions data has 1431009 different msnos
The user logs data has 495677 different msnos
Compared to that, the altered test data, excluding logs, has 907471 different msnos


### 2.3 Data cleaning

In [41]:
#check for null values
print(train_data.isnull().sum())
print(train_data_logs.isnull().sum())

msno                           0
is_churn                       0
city_1                    109993
city_3                    109993
city_4                    109993
city_5                    109993
city_6                    109993
city_7                    109993
city_8                    109993
city_9                    109993
city_10                   109993
city_11                   109993
city_12                   109993
city_13                   109993
city_14                   109993
city_15                   109993
city_16                   109993
city_17                   109993
city_18                   109993
city_19                   109993
city_20                   109993
city_21                   109993
city_22                   109993
reg_name                  109993
reg_month                 109993
actual_amount_paid             0
is_auto_renew                  0
transaction_date               0
membership_expire_date         0
is_cancel                      0
          

In [42]:
#Get rid of null-values

#For train data
#Set city null values to o in train data
cities = ['city_1','city_3','city_4','city_5','city_6','city_7','city_8','city_9','city_10','city_11','city_12','city_13','city_14','city_15','city_16','city_17','city_18','city_19','city_20','city_21','city_22']
for i in range(0,len(cities)):
        inpt = cities[i]
        train_data[inpt]=train_data[inpt].fillna(value=0)
        test_data[inpt]=test_data[inpt].fillna(value=0)

user_logs_array = ['num_100', 'entries', 'num25_ratio', 'num50_ratio', 'num75_ratio', 'num985_ratio']
#for i in range(0,len(user_logs_array)):
 #       inpt = user_logs_array[i]
  #      train_data[inpt]=train_data[inpt].fillna(value=0)
   #     test_data[inpt]=test_data[inpt].fillna(value=0)

#Set registration dates null values to o in train data
train_data[['reg_name','reg_month']] = train_data[['reg_name','reg_month']].fillna(value=0)
test_data[['reg_name','reg_month']] = test_data[['reg_name','reg_month']].fillna(value=0)

#check for null values
print(train_data.isnull().sum())
print(test_data.isnull().sum())


msno                      0
is_churn                  0
city_1                    0
city_3                    0
city_4                    0
city_5                    0
city_6                    0
city_7                    0
city_8                    0
city_9                    0
city_10                   0
city_11                   0
city_12                   0
city_13                   0
city_14                   0
city_15                   0
city_16                   0
city_17                   0
city_18                   0
city_19                   0
city_20                   0
city_21                   0
city_22                   0
reg_name                  0
reg_month                 0
actual_amount_paid        0
is_auto_renew             0
transaction_date          0
membership_expire_date    0
is_cancel                 0
                         ..
plan_list_price_150       0
plan_list_price_180       0
plan_list_price_210       0
plan_list_price_265       0
plan_list_price_298 

In [43]:
#Drop unwanted columns (this may change with new insights or new code!)
unwanted = ['msno','actual_amount_paid','transaction_date','membership_expire_date', 'diff_plan_actual','reg_name','reg_name']
train_data = train_data.drop(unwanted, axis=1)
test_data = test_data.drop(unwanted, axis=1)


In [44]:
train_data_logs = train_data_logs.dropna()
train_data_reduced = train_data_logs.drop(columns=['num_100', 'entries', 'num25_ratio', 'num50_ratio', 'num75_ratio', 'num985_ratio'],axis=1)
print(train_data_logs.shape)
print(train_data_reduced.shape)

(440062, 155)
(440062, 149)


In [45]:
train_data_logs = train_data_logs.drop(unwanted, axis=1)
train_data_reduced = train_data_reduced.drop(unwanted, axis=1)

In [46]:
# Splitting input/output data into train and test sets in order to check efficiency of our models
data_input = train_data.drop('is_churn',axis=1)
data_output = train_data['is_churn']
print(data_input.shape)
print(data_output.shape)
x_train, x_test, y_train, y_test = train_test_split(data_input, data_output, test_size=0.2, random_state=42)

# Removing is_churn (as it's all dummy zeros) from test data
test_input = test_data.drop('is_churn',axis=1)

# for user_logs data
data_input_ul = train_data_logs.drop('is_churn',axis=1)
data_output_ul = train_data_logs['is_churn']
print(data_input_ul.shape)
print(data_output_ul.shape)
x_train_ul, x_test_ul, y_train_ul, y_test_ul = train_test_split(data_input_ul, data_output_ul, test_size=0.2, random_state=42)

# for user_logs data
data_input_r = train_data_reduced.drop('is_churn',axis=1)
data_output_r = train_data_reduced['is_churn']
print(data_input_r.shape)
print(data_output_r.shape)
x_train_r, x_test_r, y_train_r, y_test_r = train_test_split(data_input_r, data_output_r, test_size=0.2, random_state=42)

(970960, 142)
(970960,)
(440062, 148)
(440062,)
(440062, 142)
(440062,)


## 3. Prediction models

### 4.1. Testing out prediction models

In [15]:
# Random Forrest, no training data
model = RandomForestRegressor()
model.fit(x_train, y_train)
print('Model fitted!')
y_pred_f = model.predict(x_test)
print('Prediction done!')
print("Logloss for Random Forrest is: %.2f"%log_loss(y_test,y_pred_f))

## I think we should use the logloss to measure our accuracy, as it is the same then they use on Kaggle. Also, to use accuracy, we need our input to be only 0's and 1's, thus it is not a very accurate assesment description of our model
#predictions = [round(value) for value in y_pred]
#accuracy = accuracy_score(y_test, predictions)
#print("Accuracy: %.2f%%" % (accuracy * 100.0))

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [75]:
# Random Forest
# user_logs included
model = RandomForestRegressor()
model.fit(x_train_ul, y_train_ul)
print('Model fitted!')
y_pred_f = model.predict(x_test_ul)
print('Prediction done!')
print("Logloss for Random Forrest, incl. user_logs, is: %.2f"%log_loss(y_test_ul,y_pred_f))

#vs.
model = RandomForestRegressor()
model.fit(x_train_r, y_train_r)
print('Model fitted!')
y_pred_f = model.predict(x_test_r)
print('Prediction done!')
print("Logloss for Random Forrest, user_logs size but no user_logs, is: %.2f"%log_loss(y_test_r,y_pred_f))

Model fitted!
Prediction done!
Logloss for Random Forrest, incl. user_logs, is: 1.34
Model fitted!
Prediction done!
Logloss for Random Forrest, user_logs size but no user_logs, is: 0.51


In [34]:
# Linear Regression
model = LinearRegression()
model.fit(x_train, y_train)
print('Model fitted!')
# Make predicitons for test data
y_pred_l = model.predict(x_test)
y_pred_l = np.absolute(y_pred_l)
print('Prediction done!')
print("Logloss for Linear Regression is: %.2f"%log_loss(y_test,y_pred_l))

Model fitted!
Prediction done!
Logloss for Linear Regression is: 0.30


In [76]:
# Random Forest
# user_logs included
model = LinearRegression()
model.fit(x_train_ul, y_train_ul)
print('Model fitted!')
y_pred_f = model.predict(x_test_ul)
print('Prediction done!')
print("Logloss for Random Forrest, incl. user_logs, is: %.2f"%log_loss(y_test_ul,y_pred_f))

#vs.
model = RandomForestRegressor()
model.fit(x_train_r, y_train_r)
print('Model fitted!')
y_pred_l = model.predict(x_test_r)
print('Prediction done!')
print("Logloss for Random Forrest, user_logs size but no user_logs, is: %.2f"%log_loss(y_test_r,y_pred_l))

Model fitted!
Prediction done!
Logloss for Random Forrest, incl. user_logs, is: 0.34
Model fitted!
Prediction done!
Logloss for Random Forrest, user_logs size but no user_logs, is: 0.52


In [116]:
# Run AdaBoost
model_abr = AdaBoostRegressor()
model_abr.fit(x_train, y_train)
y_pred_abr = model_abr.predict(x_test)
print("Logloss for AdaBoost is: %.2f"%log_loss(y_test,y_pred_abr))

Logloss for AdaBoost is: 0.30


In [117]:
# Run SGDRegressor (really bad)
model_sgd = SGDRegressor()
#model_sgd.fit(x_train, y_train)
#y_pred_sgd = model_sgd.predict(x_test)
print("Logloss for SGDRegressor is: %.2f"%log_loss(y_test,y_pred_sgd))



Logloss for AdaBoost is: 27.72


In [118]:
# Run GradientBoostingRegressor (very slow)
model_gbr = GradientBoostingRegressor()
#model_gbr.fit(x_train, y_train)
#y_pred_gbr = model_gbr.predict(x_test)
print("Logloss for GradientBoostingRegressor is: %.2f"%log_loss(y_test,y_pred_gbr))

Logloss for AdaBoost is: 0.30


In [None]:
# XG Boost
dtrain = xgb.DMatrix(x_train, label = y_train)
dtest = xgb.DMatrix(x_test, label = y_test)
print('Done.')
param = {
    'max_depth': 3,  # the maximum depth of each tree. Try with max_depth: 2 to 10.
    'eta': 0.3,  # the training step for each iteration. Try with ETA: 0.1, 0.2, 0.3...
    'silent': 1,  # logging mode - quiet
    'objective': 'multi:softprob',  # error evaluation for multiclass training
    'num_class': 3}  # the number of classes that exist in this datset
num_round = 20  # the number of training iterations. Try with num_round around few hundred!
#----------------
bst = xgb.train(param, dtrain, num_round)
print('Modeling done!')

y_pred_xgb = bst.predict(dtest)
print('Prediction done!')

best_preds = np.asarray([np.argmax(line) for line in y_pred_xgb])

y_pred_xgb = y_pred_xgb[:,1] #Column 2 out of 3

print("Logloss for XGBoost is: %.3f"%log_loss(y_test,y_pred_xgb))

### 3.2. Running the best model on all the data

In [49]:
# Linear Regression
model.fit(data_input, data_output)
y_pred_final = model.predict(test_input)
y_pred_final = np.absolute(y_pred_final)

## 4. Creating submission file

In [56]:
#Prepare submission file
my_submission = pd.DataFrame({'msno': test['msno'], 'is_churn': y_pred_final})
print(my_submission.head())
cols = my_submission.columns.tolist()
cols = cols[-1:] + cols[:-1]
my_submission = my_submission[cols]
print(my_submission.head())
print(my_submission.count())

my_submission.to_csv('submission.csv', index=False)
print('Done! :-)')

   is_churn                                          msno
0  0.061539  4n+fXlyJvfQnTeKXTWT507Ll4JVYGrOC8LHCfwBmPE4=
1  0.122253  aNmbC1GvFUxQyQUidCVmfbQ0YeCuwkPzEdQ0RwWyeZM=
2  0.125351  rFC9eSG/tMuzpre6cwcMLZHEYM89xY02qcz7HL4//jc=
3  0.058105  WZ59dLyrQcE7ft06MZ5dj40BnlYQY7PHgg/54+HaCSE=
4  0.055695  aky/Iv8hMp1/V/yQHLtaVuEmmAxkB5GuasQZePJ7NU4=
                                           msno  is_churn
0  4n+fXlyJvfQnTeKXTWT507Ll4JVYGrOC8LHCfwBmPE4=  0.061539
1  aNmbC1GvFUxQyQUidCVmfbQ0YeCuwkPzEdQ0RwWyeZM=  0.122253
2  rFC9eSG/tMuzpre6cwcMLZHEYM89xY02qcz7HL4//jc=  0.125351
3  WZ59dLyrQcE7ft06MZ5dj40BnlYQY7PHgg/54+HaCSE=  0.058105
4  aky/Iv8hMp1/V/yQHLtaVuEmmAxkB5GuasQZePJ7NU4=  0.055695
msno        907471
is_churn    907471
dtype: int64
Done! :-)
