# Algorithm Solution (for transactions)

## 1. Load libraries and data

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor

#import XGBOOST Libraries
import xgboost as xgb
from sklearn.grid_search import GridSearchCV

#Configure Panda
pd.options.display.width = 200

#Load data in
train = pd.read_csv('data/train_v2.csv')
test = pd.read_csv('data/sample_submission_v2.csv')
transactions = pd.read_csv('data/final_transactions.csv')



In [2]:
#import current modified verion (final_members.csv is generated from file Feature_engineering_Members, you have to run it first
#and it will automatically generate final_members.csv into the right folder.
#final_members is the feature engineered version of members
final_members=pd.read_csv('data/final_members.csv')
final_members.head()

Unnamed: 0,msno,city_1,city_3,city_4,city_5,city_6,city_7,city_8,city_9,city_10,...,city_13,city_14,city_15,city_16,city_17,city_18,city_19,city_20,city_21,city_22
0,Rb9UwLQTrxzBVwCB6+bCcSQWZ9JiNLC9dXtM1oEsZA8=,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,+tJonkh+O1CA796Fm5X60UMOtB6POHAwPjbTRVl/EuU=,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,cV358ssn7a0f7jZOwGNWS07wCKVqxyiImJUX6xcIwKw=,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9bzDeJP6sQodK73K5CBlJ6fgIQzPeLnRl0p5B77XP+g=,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,WFLY3s7z4EZsieHCt63XrsdtfTEmJ+2PnnKLH5GY4Tk=,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 2. Merge the data from "final_transactions" and "final_members"

In [3]:
#Start with train data, merge it with final_members and transaction (left join prevents loss of data)
train = pd.merge(train,final_members,on='msno',how='left')
train = pd.merge(train,transactions,how='left',on='msno',left_index=True, right_index=True)

#Now merge test data together
test = pd.merge(test,final_members,on='msno',how='left')
test = pd.merge(test,transactions,how='left',on='msno',left_index=True, right_index=True)

#Get the shape of train and
print('Number of rows & columns TRAIN', train.shape)
print('Number of rows & columns TEST', test.shape)

Number of rows & columns TRAIN (970960, 147)
Number of rows & columns TEST (907471, 147)


In [4]:
train.head()
#70 COLUMNS SHOULD YOU GET

Unnamed: 0,msno,is_churn,city_1,city_3,city_4,city_5,city_6,city_7,city_8,city_9,...,plan_list_price_1000,plan_list_price_1150,plan_list_price_1200,plan_list_price_1260,plan_list_price_1299,plan_list_price_1300,plan_list_price_1399,plan_list_price_1599,plan_list_price_1788,plan_list_price_2000
0,ugx0CjOMzazClkFzU2xasmDZaoIqOUAZPsH1q0teWCg=,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,f/NmvEzHfhINFEYZTR05prUdr+E+3+oewvweYz9cCQE=,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,zLo9f73nGGT1p21ltZC3ChiRnAVvgibMyazbCxvWPcg=,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,8iF/+8HY8lJKFrTc7iR9ZYGCG2Ecrogbc2Vy5YhsfhQ=,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,K6fja4+jmoZ5xG6BypqX80Uw/XKpMgrEMdG2edFOxnA=,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


## 3. Deal with the data

In [5]:
#check for null values
train.isnull().sum()

msno                           0
is_churn                       0
city_1                    109993
city_3                    109993
city_4                    109993
city_5                    109993
city_6                    109993
city_7                    109993
city_8                    109993
city_9                    109993
city_10                   109993
city_11                   109993
city_12                   109993
city_13                   109993
city_14                   109993
city_15                   109993
city_16                   109993
city_17                   109993
city_18                   109993
city_19                   109993
city_20                   109993
city_21                   109993
city_22                   109993
actual_amount_paid             0
is_auto_renew                  0
transaction_date               0
membership_expire_date         0
is_cancel                      0
diff_plan_actual               0
discount                       0
          

Please change this conlusion if you add new code.
Conclusion: all the cities contain NULL values, and the easiest way to deal with those null-values is to set them to 0. As such, the algorithm has simply no information for the city in these rows

In [6]:
#Get rid of null-values

#For train data
train[['city_1','city_3','city_4','city_5']] = train[['city_1','city_3','city_4','city_5']].fillna(value=0)
train[['city_6','city_7','city_8','city_9','city_10']] = train[['city_6','city_7','city_8','city_9','city_10']].fillna(value=0)
train[['city_11','city_12','city_13','city_14','city_15']] = train[['city_11','city_12','city_13','city_14','city_15']].fillna(value=0)
train[['city_16','city_17','city_18','city_19','city_20']] = train[['city_16','city_17','city_18','city_19','city_20']].fillna(value=0)
train[['city_21','city_22']] = train[['city_21','city_22']].fillna(value=0)

#For test data
test[['city_1','city_3','city_4','city_5']] = test[['city_1','city_3','city_4','city_5']].fillna(value=0)
test[['city_6','city_7','city_8','city_9','city_10']] = test[['city_6','city_7','city_8','city_9','city_10']].fillna(value=0)
test[['city_11','city_12','city_13','city_14','city_15']] = test[['city_11','city_12','city_13','city_14','city_15']].fillna(value=0)
test[['city_16','city_17','city_18','city_19','city_20']] = test[['city_16','city_17','city_18','city_19','city_20']].fillna(value=0)
test[['city_21','city_22']] = test[['city_21','city_22']].fillna(value=0)

#check for null values
print(train.isnull().sum())
print(test.isnull().sum())

msno                      0
is_churn                  0
city_1                    0
city_3                    0
city_4                    0
city_5                    0
city_6                    0
city_7                    0
city_8                    0
city_9                    0
city_10                   0
city_11                   0
city_12                   0
city_13                   0
city_14                   0
city_15                   0
city_16                   0
city_17                   0
city_18                   0
city_19                   0
city_20                   0
city_21                   0
city_22                   0
actual_amount_paid        0
is_auto_renew             0
transaction_date          0
membership_expire_date    0
is_cancel                 0
diff_plan_actual          0
discount                  0
                         ..
plan_list_price_150       0
plan_list_price_180       0
plan_list_price_210       0
plan_list_price_265       0
plan_list_price_298 

In [7]:
#Drop unwanted columns (this may change with new insights or new code!)
unwanted = ['msno','actual_amount_paid','transaction_date','membership_expire_date', 'diff_plan_actual']
train = train.drop(unwanted, axis=1)

#Before dropping the msno of test, we need to save it for the sumission
msno = test.msno
test = test.drop(unwanted, axis=1)

## 4. ML Algorithm

In [8]:
#Split data for algorithm:

#y_train gets the churn values of train. On these values, our algorithm is trained
y_train = train.is_churn
#x_train gets all the other values without churn
x_train = train.drop('is_churn', axis=1)

#y_test gets the churn values of test. On these values, our algorithm is tested
y_test = test.is_churn
#x_test gets all the test values except for churn
x_test = test.drop('is_churn', axis=1)

#print out the sizes:
print("Size of X_train: ")
print(x_train.size)

print("Size of y_train: ")
print(y_train.size)

print("Size of y_test: ")
print(y_test.size)

print("Size of X_test:")
print(x_test.size)

Size of X_train: 
136905360
Size of y_train: 
970960
Size of y_test: 
907471
Size of X_test:
127953411


### 4.1. Fit model and make predictions

In [9]:
# Fit model, no training data
model = RandomForestRegressor()
model.fit(x_train, y_train)
print('Model fitted!')

# Make predicitons for test data
y_pred = model.predict(x_test)
predictions = [round(value) for value in y_pred]
print('Prediction done!')
#Evaluate predictions

accuracy = accuracy_score(y_test, predictions)

print("Accuracy: %.2f%%" % (accuracy * 100.0))

Model fitted!
Prediction done!
Accuracy: 99.98%


In [10]:
print("Size of X_train: ")
print(x_train.size)

print("Size of y_train: ")
print(y_train.size)

print("Size of y_pred: ")
print(y_pred.size)

print("Size of X_test:")
print(x_test.size)

Size of X_train: 
136905360
Size of y_train: 
970960
Size of y_pred: 
907471
Size of X_test:
127953411


In [11]:
#Prepare submission file

my_submission = pd.DataFrame({'msno': msno, 'is_churn': y_pred})
cols = my_submission.columns.tolist()
cols = cols[-1:] + cols[:-1]
my_submission = my_submission[cols]
print(my_submission.head())
print(my_submission.count())

my_submission.to_csv('submission.csv', index=False)
print('Done! :)')

                                           msno  is_churn
0  4n+fXlyJvfQnTeKXTWT507Ll4JVYGrOC8LHCfwBmPE4=  0.059351
1  aNmbC1GvFUxQyQUidCVmfbQ0YeCuwkPzEdQ0RwWyeZM=  0.127405
2  rFC9eSG/tMuzpre6cwcMLZHEYM89xY02qcz7HL4//jc=  0.132093
3  WZ59dLyrQcE7ft06MZ5dj40BnlYQY7PHgg/54+HaCSE=  0.049591
4  aky/Iv8hMp1/V/yQHLtaVuEmmAxkB5GuasQZePJ7NU4=  0.064351
msno        907471
is_churn    907471
dtype: int64
Done! :)
