# Algorithm Solution (for transactions)

## 1. Loading libraries

In [74]:
import numpy as np
import pandas as pd

# XGBoost library (needed by the XGBoost experiment below)
import xgboost as xgb

# machine learning library
from sklearn import datasets
from sklearn.model_selection import train_test_split
# GridSearchCV lives in model_selection; sklearn.grid_search was removed in scikit-learn 0.20
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import log_loss
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor

#Configure Panda
pd.options.display.width = 200

## 2. Loading and pre-processing data
### !!! ATTENTION: In order to load the following files you first need to completely run the latest versions of Feature_Engineering_Members.ipynb and Feature_Engineering_Transactions.ipynb. !!!
### 2.1 Load the files

In [75]:
# Read the four input tables from the local data/ folder.
# The two "final_*" files are presumably the outputs of the feature-engineering
# notebooks named in the ATTENTION note of this section.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train_v2.csv', 'data/sample_submission_v2.csv')
)
transactions, final_members = (
    pd.read_csv(csv_path)
    for csv_path in ('data/final_transactions.csv', 'data/final_members.csv')
)

MemoryError: 

### 2.2 Merge the different files

In [None]:
# Creating datasets with inputs & outputs:
# one row per msno in `train`, with member and transaction features attached.
# The joins match on the 'msno' key only. pandas does not allow on= to be
# combined with left_index/right_index, and an index join would pair rows by
# position instead of by member id.
train_data = pd.merge(train, final_members, on='msno', how='left')
train_data = pd.merge(train_data, transactions, on='msno', how='left')

# Creating datasets with only inputs, built exactly like the training table.
# Note that the sample_submission_v2.csv file does NOT contain outputs (they are all 0)
test_data = pd.merge(test, final_members, on='msno', how='left')
test_data = pd.merge(test_data, transactions, on='msno', how='left')

### 2.3 Data cleaning

In [None]:
# Number of missing values in each column of the merged training table
print(train_data.isna().sum())

In [None]:
# Get rid of null values in the one-hot indicator columns.
# A missing indicator is treated as "not set" (0). Train and test data are
# filled identically so both tables keep the same feature semantics.

# City indicators city_1 and city_3 .. city_22 (there is no city_2 column in the list)
cities = ['city_%d' % n for n in range(1, 23) if n != 2]
# Registration-year indicators reg_year_2012 .. reg_year_2017
reg_dates = ['reg_year_%d' % year for year in range(2012, 2018)]
# Registration-method indicators
reg_meth = ['reg_3', 'reg_4', 'reg_7', 'reg_9', 'reg_11']

# One vectorised fill per table instead of one assignment per column
indicator_cols = cities + reg_dates + reg_meth
train_data[indicator_cols] = train_data[indicator_cols].fillna(value=0)
test_data[indicator_cols] = test_data[indicator_cols].fillna(value=0)

# NOTE(review): 'bd_norm' is not filled here (that fill was disabled);
# confirm whether it can still contain nulls before fitting the models.

# Check for remaining null values in both tables
print(train_data.isnull().sum())
print(test_data.isnull().sum())


In [None]:
# Columns that are removed before modelling (this may change with new insights or new code!)
unwanted = [
    'msno',
    'actual_amount_paid',
    'transaction_date',
    'membership_expire_date',
    'diff_plan_actual',
]
# Keep the msno column of test before it is dropped: the submission file needs it
msno = test['msno']
train_data = train_data.drop(labels=unwanted, axis='columns')
test_data = test_data.drop(labels=unwanted, axis='columns')


In [None]:
# Separate the label from the features, then hold out 20% of the labelled rows
# so the efficiency of our models can be checked on rows they were not fitted on.
data_output = train_data['is_churn']
data_input = train_data.drop(labels='is_churn', axis='columns')
print(data_input.shape)
print(data_output.shape)
x_train, x_test, y_train, y_test = train_test_split(
    data_input,
    data_output,
    test_size=0.2,
    random_state=42,
)

# The test table carries is_churn only as dummy zeros, so it is removed from the inputs
test_input = test_data.drop(labels='is_churn', axis='columns')

## 3. Prediction models

### 3.1. Testing out prediction models

# Random Forest with default hyper-parameters, scored on the held-out split.
# random_state pins the forest's random sampling so the reported score is
# the same on every Restart & Run All.
model = RandomForestRegressor(random_state=42)
model.fit(x_train, y_train)
print('Model fitted!')
y_pred_f = model.predict(x_test)
print('Prediction done!')
print("Logloss for Random Forrest is: %.2f"%log_loss(y_test,y_pred_f))

# Log loss is used to measure the models because it is the same metric Kaggle uses.
# Accuracy would need the predictions rounded to hard 0/1 values first, so it
# is not a very accurate assessment of a model that outputs probabilities.

In [None]:
# Linear Regression
model = LinearRegression()
model.fit(x_train, y_train)
print('Model fitted!')
# Make predictions for the held-out data.
# A linear model is unbounded, so its output is clipped into [0, 1] to obtain
# valid probabilities. Taking the absolute value instead would turn -0.2 into
# 0.2 and leave values above 1 untouched.
y_pred_l = np.clip(model.predict(x_test), 0, 1)
print('Prediction done!')
print("Logloss for Linear Regression is: %.2f"%log_loss(y_test,y_pred_l))

# Run AdaBoost (seeded so the reported log loss is reproducible between runs)
model_abr = AdaBoostRegressor(random_state=42)
model_abr.fit(x_train, y_train)
y_pred_abr = model_abr.predict(x_test)
print("Logloss for AdaBoost is: %.2f"%log_loss(y_test,y_pred_abr))

# Run SGDRegressor (really bad results, so it is switched off by default).
# Fitting, predicting and scoring are enabled together: scoring without a
# fitted model would fail because y_pred_sgd would not exist.
RUN_SGD = False
model_sgd = SGDRegressor(random_state=42)
if RUN_SGD:
    model_sgd.fit(x_train, y_train)
    # SGD output is unbounded; clip it into [0, 1] before scoring it as a probability
    y_pred_sgd = np.clip(model_sgd.predict(x_test), 0, 1)
    print("Logloss for SGDRegressor is: %.2f"%log_loss(y_test,y_pred_sgd))

# Run GradientBoostingRegressor (very slow, so it is switched off by default).
# Fitting, predicting and scoring are enabled together: scoring without a
# fitted model would fail because y_pred_gbr would not exist.
RUN_GBR = False
model_gbr = GradientBoostingRegressor(random_state=42)
if RUN_GBR:
    model_gbr.fit(x_train, y_train)
    # Regression output can leave [0, 1]; clip it before scoring it as a probability
    y_pred_gbr = np.clip(model_gbr.predict(x_test), 0, 1)
    print("Logloss for GradientBoostingRegressor is: %.2f"%log_loss(y_test,y_pred_gbr))

# XGBoost
# The target is a binary label, so the booster is trained with the
# binary:logistic objective. predict() then returns one probability per row
# (class 1 = churn), which is what log_loss expects.
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test, label=y_test)
print('Done.')
param = {
    'max_depth': 3,  # the maximum depth of each tree. Try with max_depth: 2 to 10.
    'eta': 0.3,  # the training step for each iteration. Try with ETA: 0.1, 0.2, 0.3...
    # NOTE(review): newer xgboost releases use 'verbosity' instead of 'silent' - confirm against the installed version
    'silent': 1,  # logging mode - quiet
    'objective': 'binary:logistic'}  # binary classification, outputs a probability
num_round = 20  # the number of training iterations. Try with num_round around few hundred!
#----------------
bst = xgb.train(param, dtrain, num_round)
print('Modeling done!')

y_pred_xgb = bst.predict(dtest)
print('Prediction done!')

# Hard 0/1 class per row, thresholded at 0.5
best_preds = (y_pred_xgb > 0.5).astype(int)

print("Logloss for XGBoost is: %.3f"%log_loss(y_test,y_pred_xgb))

### 3.2. Running the best model on all the data

In [None]:
# Linear Regression, refitted on all labelled rows.
# A fresh estimator is created here so this cell does not depend on whatever
# `model` was last bound to when the experiment cells were run.
model = LinearRegression()
model.fit(data_input, data_output)
# Clip into [0, 1]: the linear output is unbounded and the absolute value
# would neither cap values above 1 nor map negatives to 0.
y_pred_l = np.clip(model.predict(test_input), 0, 1)

## 4. Creating submission file

In [None]:
#Prepare submission file
# Replace y_pred_l here to submit the predictions of a different algorithm.
# The column order is given explicitly: the order of dict-built columns depends
# on the pandas/Python version, so rotating the column list afterwards could
# put is_churn in front of msno.
my_submission = pd.DataFrame(
    {'msno': msno, 'is_churn': y_pred_l},
    columns=['msno', 'is_churn'],
)
print(my_submission.head())
print(my_submission.count())

my_submission.to_csv('submission.csv', index=False)
print('Done! :-)')

Comments for report:
- We should not rely on accuracy, because our data set is imbalanced (95% churns): a model that always predicts the majority class would already reach about 95% accuracy, which is why accuracy is a bad measure.
-Cross-validation