In [8]:
#Load libraries and data
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor

train = pd.read_csv('data/train_v2.csv')
test = pd.read_csv('data/sample_submission_v2.csv')
transactions = pd.read_csv('data/transactions_v2.csv')
members = pd.read_csv('data/members_v3.csv')
print("Finished loading libraries and data.")

Finished loading libraries and data.


In [9]:
#Prepare test data
transactions = transactions[['msno','transaction_date','membership_expire_date', 'is_cancel']]

#Match 'msno' in transactions and training
pred_columns_merged = pd.merge(transactions, train, on='msno')

#Remove duplicates of 'msno' in prediction columns
pred_columns_dropped = pred_columns_merged.drop_duplicates(['msno'], keep='first')

#Size pred_columns (samples) to equal y_train (lables)
pred_columns_dropped = pred_columns_dropped.iloc[:,:970960]
print(pred_columns_dropped.count())

#Remove 'msno' so pred_columns_train has only numeric values
pred_columns_train = pred_columns_dropped.iloc[:,1:4]

#Prediction columns for testing
p_test_columns_merged = pd.merge(transactions, test, on='msno')
p_test_columns_dropped = p_columns_merged.drop_duplicates(['msno'], keep='first')


#Remove msno to have only numeric values
pred_columns_test = p_test_columns_dropped.iloc[:,1:4]

print(pred_columns_train.count())
print(pred_columns_test.count())

msno                      933578
transaction_date          933578
membership_expire_date    933578
is_cancel                 933578
is_churn                  933578
dtype: int64
transaction_date          933578
membership_expire_date    933578
is_cancel                 933578
dtype: int64
transaction_date          907470
membership_expire_date    907470
is_cancel                 907470
dtype: int64


In [10]:
#Split data into train and test sets

X_train = pred_columns_train
X_test = pred_columns_test
y_test = test.is_churn.iloc[:907470]
y_train = train.is_churn.iloc[:933578]

print("Size of X_train: ")
print(X_train.count())

print("Size of y_train: ")
print(y_train.count())

print("Size of y_test: ")
print(y_test.count())

print("Size of X_test:")
print(X_test.count())

# fit model, no training data
model = RandomForestRegressor()
model.fit(X_train, y_train)

# Make predicitons for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

#Evaluate predictions

accuracy = accuracy_score(y_test, predictions)

print("Accuracy: %.2f%%" % (accuracy * 100.0))



Size of X_train: 
transaction_date          933578
membership_expire_date    933578
is_cancel                 933578
dtype: int64
Size of y_train: 
933578
Size of y_test: 
907470
Size of X_test:
transaction_date          907470
membership_expire_date    907470
is_cancel                 907470
dtype: int64
Accuracy: 99.66%


Size of X_train: 
transaction_date          933578
membership_expire_date    933578
is_cancel                 933578
dtype: int64
Size of y_train: 
933578
Size of y_pred: 
907470
Size of X_test:
transaction_date          907470
membership_expire_date    907470
is_cancel                 907470
dtype: int64


In [12]:
#Prepare submission file
msno = test.msno.iloc[:907470]
my_submission = pd.DataFrame({'msno': msno, 'is_churn': y_pred})

#Changing order of columns
cols = my_submission.columns.tolist()
cols = cols[-1:] + cols[:-1]
my_submission = my_submission[cols]


#my_submission.to_csv('submission.csv', index=False)
print("Submission file created.")

                                           msno  is_churn
0  4n+fXlyJvfQnTeKXTWT507Ll4JVYGrOC8LHCfwBmPE4=  1.000000
1  aNmbC1GvFUxQyQUidCVmfbQ0YeCuwkPzEdQ0RwWyeZM=  0.103463
2  rFC9eSG/tMuzpre6cwcMLZHEYM89xY02qcz7HL4//jc=  0.086734
3  WZ59dLyrQcE7ft06MZ5dj40BnlYQY7PHgg/54+HaCSE=  1.000000
4  aky/Iv8hMp1/V/yQHLtaVuEmmAxkB5GuasQZePJ7NU4=  0.086095
Hey
