# Algorithm Solution (for transactions)

## 1. Loading libraries

In [115]:
import numpy as np
import pandas as pd

# machine learning library
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor


#import XGBOOST Libraries
#import xgboost as xgb
from sklearn.grid_search import GridSearchCV

#Configure Panda
pd.options.display.width = 200

## 2. Loading and pre-processing data
### !!! ATTENTION: In order to load the following files you first need to completely run Feature_Engineering_Members.ipynb and Feature_Engineering_Transactions.ipynb. !!!
### 2.1 Load the files

In [87]:
#Load data in
train = pd.read_csv('data/train_v2.csv')
test = pd.read_csv('data/sample_submission_v2.csv')
transactions = pd.read_csv('data/final_transactions.csv')
final_members=pd.read_csv('data/final_members.csv')

Unnamed: 0,msno,city_1,city_3,city_4,city_5,city_6,city_7,city_8,city_9,city_10,...,city_15,city_16,city_17,city_18,city_19,city_20,city_21,city_22,reg_name,reg_month
0,Rb9UwLQTrxzBVwCB6+bCcSQWZ9JiNLC9dXtM1oEsZA8=,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2011,9
1,+tJonkh+O1CA796Fm5X60UMOtB6POHAwPjbTRVl/EuU=,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2011,9
2,cV358ssn7a0f7jZOwGNWS07wCKVqxyiImJUX6xcIwKw=,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2011,9
3,9bzDeJP6sQodK73K5CBlJ6fgIQzPeLnRl0p5B77XP+g=,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2011,9
4,WFLY3s7z4EZsieHCt63XrsdtfTEmJ+2PnnKLH5GY4Tk=,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,2011,9


### 2.2 Merge the different files

In [106]:
# Creating datasets witgh input&outputs
train_data = pd.merge(train,final_members,on='msno',how='left')
train_data = pd.merge(train_data,transactions,how='left',on='msno',left_index=True, right_index=True)

# Creating datasets with only inputs
# Note that the submission_v2.csv file does NOT contain ouputs (they are all 0)
test_data = pd.merge(test,final_members,on='msno',how='left')
test_data = pd.merge(test_data,transactions,how='left',on='msno',left_index=True, right_index=True)

print(data.shape)

(970960, 143)


### 2.3 Data cleaning

In [107]:
#check for null values
print(train_data.isnull().sum())

msno                           0
is_churn                       0
city_1                    109993
city_3                    109993
city_4                    109993
city_5                    109993
city_6                    109993
city_7                    109993
city_8                    109993
city_9                    109993
city_10                   109993
city_11                   109993
city_12                   109993
city_13                   109993
city_14                   109993
city_15                   109993
city_16                   109993
city_17                   109993
city_18                   109993
city_19                   109993
city_20                   109993
city_21                   109993
city_22                   109993
reg_name                  109993
reg_month                 109993
actual_amount_paid             0
is_auto_renew                  0
transaction_date               0
membership_expire_date         0
is_cancel                      0
          

In [109]:
#Get rid of null-values

#For train data
#Set city null values to o in train data
cities = ['city_1','city_3','city_4','city_5','city_6','city_7','city_8','city_9','city_10','city_11','city_12','city_13','city_14','city_15','city_16','city_17','city_18','city_19','city_20','city_21','city_22']
for i in range(0,len(cities)):
        inpt = cities[i]
        train_data[inpt]=train_data[inpt].fillna(value=0)
        test_data[inpt]=test_data[inpt].fillna(value=0)

#Set registration dates null values to o in train data
train_data[['reg_name','reg_month']] = train_data[['reg_name','reg_month']].fillna(value=0)
test_data[['reg_name','reg_month']] = test_data[['reg_name','reg_month']].fillna(value=0)

#check for null values
#print(train_data.isnull().sum())
#print(test_data.isnull().sum())


In [110]:
#Drop unwanted columns (this may change with new insights or new code!)
unwanted = ['msno','actual_amount_paid','transaction_date','membership_expire_date', 'diff_plan_actual','reg_name','reg_name']
train_data = train_data.drop(unwanted, axis=1)
test_data = test_data.drop(unwanted, axis=1)


In [119]:
# Splitting input/output data into train and test sets in order to check efficiency of our models
data_input = train_data.drop('is_churn',axis=1)
data_output = train_data['is_churn']
print(data_input.shape)
print(data_output.shape)
x_train, x_test, y_train, y_test = train_test_split(data_input, data_output, test_size=0.2, random_state=42)

# Removing is_churn (as it's all dummy zeros) from test data
test_input = test_data.drop('is_churn',axis=1)

(970960, 142)
(970960,)


## 3. Prediction models

### 4.1. Testing out prediction models

In [112]:
# Random Forrest, no training data
model = RandomForestRegressor()
model.fit(x_train, y_train)
print('Model fitted!')
y_pred_f = model.predict(x_test)
print('Prediction done!')
print("Logloss for Random Forrest is: %.2f"%log_loss(y_test,y_pred_f))

## I think we should use the logloss to measure our accuracy, as it is the same then they use on Kaggle. Also, to use accuracy, we need our input to be only 0's and 1's, thus it is not a very accurate assesment description of our model
#predictions = [round(value) for value in y_pred]
#accuracy = accuracy_score(y_test, predictions)
#print("Accuracy: %.2f%%" % (accuracy * 100.0))

Model fitted!
Prediction done!
Logloss is: 0.40


In [114]:
# Linear Regression
model = LinearRegression()
model.fit(x_train, y_train)
print('Model fitted!')
# Make predicitons for test data
y_pred_l = model.predict(x_test)
y_pred_l = np.absolute(y_pred_l)
print('Prediction done!')
print("Logloss for Linear Regression is: %.2f"%log_loss(y_test,y_pred_l))

Model fitted!
[0.13832868 0.05314634 0.13706256 ... 0.13219336 0.07228248 0.05326477]
Prediction done!
Logloss for Linear Regression is: 0.30


In [116]:
# Run AdaBoost
model_abr = AdaBoostRegressor()
model_abr.fit(x_train, y_train)
y_pred_abr = model_abr.predict(x_test)
print("Logloss for AdaBoost is: %.2f"%log_loss(y_test,y_pred_abr))

Logloss for AdaBoost is: 0.30


In [117]:
# Run SGDRegressor (really bad)
model_sgd = SGDRegressor()
#model_sgd.fit(x_train, y_train)
#y_pred_sgd = model_sgd.predict(x_test)
print("Logloss for SGDRegressor is: %.2f"%log_loss(y_test,y_pred_sgd))



Logloss for AdaBoost is: 27.72


In [118]:
# Run GradientBoostingRegressor (very slow)
model_gbr = GradientBoostingRegressor()
#model_gbr.fit(x_train, y_train)
#y_pred_gbr = model_gbr.predict(x_test)
print("Logloss for GradientBoostingRegressor is: %.2f"%log_loss(y_test,y_pred_gbr))

Logloss for AdaBoost is: 0.30


### 3.2. Running the best model on all the data

In [120]:
# Linear Regression
model.fit(data_input, data_output)
y_pred_l = model.predict(test_input)
y_pred_l = np.absolute(y_pred_l)

## 4. Creating submission file

In [122]:
#Prepare submission file
my_submission = pd.DataFrame({'msno': msno, 'is_churn': y_pred_l})
print(my_submission.head())
cols = my_submission.columns.tolist()
cols = cols[-1:] + cols[:-1]
my_submission = my_submission[cols]
print(my_submission.head())
print(my_submission.count())

my_submission.to_csv('submission.csv', index=False)
print('Done! :-)')

   is_churn                                          msno
0  0.061539  4n+fXlyJvfQnTeKXTWT507Ll4JVYGrOC8LHCfwBmPE4=
1  0.122253  aNmbC1GvFUxQyQUidCVmfbQ0YeCuwkPzEdQ0RwWyeZM=
2  0.125351  rFC9eSG/tMuzpre6cwcMLZHEYM89xY02qcz7HL4//jc=
3  0.058105  WZ59dLyrQcE7ft06MZ5dj40BnlYQY7PHgg/54+HaCSE=
4  0.055695  aky/Iv8hMp1/V/yQHLtaVuEmmAxkB5GuasQZePJ7NU4=
                                           msno  is_churn
0  4n+fXlyJvfQnTeKXTWT507Ll4JVYGrOC8LHCfwBmPE4=  0.061539
1  aNmbC1GvFUxQyQUidCVmfbQ0YeCuwkPzEdQ0RwWyeZM=  0.122253
2  rFC9eSG/tMuzpre6cwcMLZHEYM89xY02qcz7HL4//jc=  0.125351
3  WZ59dLyrQcE7ft06MZ5dj40BnlYQY7PHgg/54+HaCSE=  0.058105
4  aky/Iv8hMp1/V/yQHLtaVuEmmAxkB5GuasQZePJ7NU4=  0.055695
msno        907471
is_churn    907471
dtype: int64
Done! :-)
