### This notebook contains code for creating dataset 2 for churn prediction analysis. Its features based on user logs are defined month-wise, and those based on transaction logs are defined in aggregate, both over a period of 6 months before the churn prediction period (March 2017). Since the prediction period is the month of March, we consider users whose subscription was due to expire in Feb 2017, in accordance with our definition of churn.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# load the pre-processed user logs into a DataFrame
user_logs_path = "user_logs_initial.csv"
user_log_reader = pd.read_csv(user_logs_path)

In [3]:
# the saved index column is the user identifier, so label it msno
user_log_reader = user_log_reader.rename(columns={'Unnamed: 0': 'msno'})

In [4]:
# preview the first five rows of the user logs
user_log_reader.head(n=5)

Unnamed: 0,msno,num_25_mean_201702,num_25_std_201702,num_50_mean_201702,num_50_std_201702,num_75_mean_201702,num_75_std_201702,num_985_mean_201702,num_985_std_201702,num_100_mean_201702,...,num_75_std_201609,num_985_mean_201609,num_985_std_201609,num_100_mean_201609,num_100_std_201609,num_unq_mean_201609,num_unq_std_201609,total_secs_mean_201609,total_secs_std_201609,count_201609
0,+++FOrTS7ab3tIgIh8eWwX4FqRv8w/FoiOuyXsFvphY=,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.041452,1.0,0.816497,24.428571,28.395171,25.571429,24.589003,7142.395857,7860.812942,7.0
1,+++IZseRRiQS9aaSkH6cMYU6bGDcxUieAi/tH67sC5s=,1.117647,1.111438,0.352941,0.606339,0.235294,0.437237,0.764706,0.903425,52.235294,...,0.833809,0.466667,0.743223,49.266667,42.58683,41.466667,33.088338,12884.733733,11080.711388,15.0
2,+++hVY1rZox/33YtvDgmKA2Frg/2qhkz12B9ylCvh8o=,4.296296,2.971853,2.259259,1.508759,2.148148,1.536822,4.222222,3.714145,17.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=,1.035714,1.231745,0.607143,0.785955,0.535714,0.792658,0.785714,1.133893,27.214286,...,1.94773,0.678571,1.722478,23.75,18.27288,25.25,19.835667,6193.243893,4516.683608,28.0
4,+++snpr7pmobhLKUgSHTv/mpkqgBT0tQJ0zQj6qKrqc=,9.4,8.022337,4.0,4.518966,2.3,2.202869,2.3,2.473012,19.15,...,2.323509,1.333333,2.0,23.666667,21.7066,31.111111,23.252802,6567.140722,5605.206095,18.0


In [5]:
# load the pre-processed churn labels for users whose subscription was due to expire in Feb 2017
target_labels_path = "train_v2_final_201702.csv"
user_target_expiration = pd.read_csv(target_labels_path)

In [12]:
# number of users in each churn class, to show the class imbalance
user_target_expiration.groupby(['is_churn'])[['msno']].count()

Unnamed: 0_level_0,msno
is_churn,Unnamed: 1_level_1
0,941182
1,38997


In [13]:
# share of each churn class as a percentage: class count / total count * 100
churn_counts = user_target_expiration.groupby(['is_churn']).agg({'msno': 'count'})
churn_counts / churn_counts.sum() * 100

Unnamed: 0_level_0,msno
is_churn,Unnamed: 1_level_1
0,96.021441
1,3.978559


In [7]:
# keep only the user logs of the target users (inner join on msno also attaches is_churn)
user_log_reader_reduced = pd.merge(
    user_log_reader,
    user_target_expiration,
    how='inner',
    on='msno',
)

In [10]:
# preview the first five rows of the reduced user logs
user_log_reader_reduced.head(n=5)

Unnamed: 0,msno,num_25_mean_201702,num_25_std_201702,num_50_mean_201702,num_50_std_201702,num_75_mean_201702,num_75_std_201702,num_985_mean_201702,num_985_std_201702,num_100_mean_201702,...,num_985_mean_201609,num_985_std_201609,num_100_mean_201609,num_100_std_201609,num_unq_mean_201609,num_unq_std_201609,total_secs_mean_201609,total_secs_std_201609,count_201609,is_churn
0,+++hVY1rZox/33YtvDgmKA2Frg/2qhkz12B9ylCvh8o=,4.296296,2.971853,2.259259,1.508759,2.148148,1.536822,4.222222,3.714145,17.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=,1.035714,1.231745,0.607143,0.785955,0.535714,0.792658,0.785714,1.133893,27.214286,...,0.678571,1.722478,23.75,18.27288,25.25,19.835667,6193.243893,4516.683608,28.0,0
2,+++snpr7pmobhLKUgSHTv/mpkqgBT0tQJ0zQj6qKrqc=,9.4,8.022337,4.0,4.518966,2.3,2.202869,2.3,2.473012,19.15,...,1.333333,2.0,23.666667,21.7066,31.111111,23.252802,6567.140722,5605.206095,18.0,0
3,++/9R3sX37CjxbY/AaGvbwr3QkwElKBCtSvVzhCBDOk=,4.12,5.134199,1.88,2.006656,1.08,1.351542,1.28,1.720465,10.32,...,2.407407,2.22329,25.185185,26.31618,21.148148,20.045532,7574.273037,7119.379358,27.0,0
4,++/UDNo9DLrxT8QVGiDi1OnWfczAdEwThaVyD0fXO50=,3.222222,3.655062,1.277778,1.872575,0.833333,0.985184,0.333333,0.594089,9.555556,...,0.6,0.816497,5.4,3.341656,9.6,5.937171,1577.73504,962.563581,25.0,0


In [14]:
# persist the reduced logs to disk (without the index) so memory can be reclaimed afterwards
target_logs_path = "user_logs_target_expiration.csv"
user_log_reader_reduced.to_csv(target_logs_path, index=False)

In [3]:
# reload the target users' logs written by the earlier cell
user_target_expiration_reader = pd.read_csv(
    filepath_or_buffer="user_logs_target_expiration.csv"
)

In [4]:
# preview the first five rows of the reloaded target-user logs
user_target_expiration_reader.head(n=5)

Unnamed: 0,msno,num_25_mean_201702,num_25_std_201702,num_50_mean_201702,num_50_std_201702,num_75_mean_201702,num_75_std_201702,num_985_mean_201702,num_985_std_201702,num_100_mean_201702,...,num_985_mean_201609,num_985_std_201609,num_100_mean_201609,num_100_std_201609,num_unq_mean_201609,num_unq_std_201609,total_secs_mean_201609,total_secs_std_201609,count_201609,is_churn
0,+++hVY1rZox/33YtvDgmKA2Frg/2qhkz12B9ylCvh8o=,4.296296,2.971853,2.259259,1.508759,2.148148,1.536822,4.222222,3.714145,17.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=,1.035714,1.231745,0.607143,0.785955,0.535714,0.792658,0.785714,1.133893,27.214286,...,0.678571,1.722478,23.75,18.27288,25.25,19.835667,6193.243893,4516.683608,28.0,0
2,+++snpr7pmobhLKUgSHTv/mpkqgBT0tQJ0zQj6qKrqc=,9.4,8.022337,4.0,4.518966,2.3,2.202869,2.3,2.473012,19.15,...,1.333333,2.0,23.666667,21.7066,31.111111,23.252802,6567.140722,5605.206095,18.0,0
3,++/9R3sX37CjxbY/AaGvbwr3QkwElKBCtSvVzhCBDOk=,4.12,5.134199,1.88,2.006656,1.08,1.351542,1.28,1.720465,10.32,...,2.407407,2.22329,25.185185,26.31618,21.148148,20.045532,7574.273037,7119.379358,27.0,0
4,++/UDNo9DLrxT8QVGiDi1OnWfczAdEwThaVyD0fXO50=,3.222222,3.655062,1.277778,1.872575,0.833333,0.985184,0.333333,0.594089,9.555556,...,0.6,0.816497,5.4,3.341656,9.6,5.937171,1577.73504,962.563581,25.0,0


In [5]:
# load the transaction logs into a DataFrame
transaction_logs_path = "transaction_logs_initial1.csv"
transaction_log_reader = pd.read_csv(transaction_logs_path)

In [6]:
# list the column names available in the transaction logs
transaction_log_reader.columns

Index(['msno', 'payment_plan_days_mean', 'actual_amount_paid_mean',
       'plan_list_price_mean', 'payment_counts', 'latest_payment_method_id',
       'latest_payment_plan_days', 'latest_plan_list_price_mean',
       'latest_actual_amount_paid', 'latest_is_auto_renew',
       'latest_trnsaction_date', 'latest_membership_expire_date',
       'latest_is_cancel'],
      dtype='object')

In [7]:
# preview the first five rows of the transaction logs
transaction_log_reader.head(n=5)

Unnamed: 0,msno,payment_plan_days_mean,actual_amount_paid_mean,plan_list_price_mean,payment_counts,latest_payment_method_id,latest_payment_plan_days,latest_plan_list_price_mean,latest_actual_amount_paid,latest_is_auto_renew,latest_trnsaction_date,latest_membership_expire_date,latest_is_cancel
0,+++FOrTS7ab3tIgIh8eWwX4FqRv8w/FoiOuyXsFvphY=,7.0,0.0,0.0,1,35,7,0,0,0,20160909,20160914,0
1,+++FOrTS7ab3tIgIh8eWwX4FqRv8w/FoiOuyXsFvphY=,7.0,0.0,0.0,1,35,7,0,0,0,20160909,20160914,0
2,+++FOrTS7ab3tIgIh8eWwX4FqRv8w/FoiOuyXsFvphY=,7.0,0.0,0.0,1,35,7,0,0,0,20160909,20160914,0
3,+++IZseRRiQS9aaSkH6cMYU6bGDcxUieAi/tH67sC5s=,395.0,1599.0,1599.0,1,22,395,1599,1599,0,20161023,20180206,0
4,+++hVY1rZox/33YtvDgmKA2Frg/2qhkz12B9ylCvh8o=,30.0,99.0,99.0,4,41,30,99,99,1,20170215,20170315,0


In [8]:
# join with transaction logs data
# The transaction log file contains repeated, identical rows for the same msno
# (visible in the head() output of the transaction logs). An inner join would emit
# one output row per repeat and over-represent those users, so exact duplicate
# rows are dropped before merging.
# NOTE(review): this only removes fully identical rows; if a user can have several
# *different* rows, an explicit per-user aggregation is still needed - confirm.
transaction_log_unique = transaction_log_reader.drop_duplicates()
combined_logs_df = user_target_expiration_reader.merge(transaction_log_unique, on = 'msno', how = 'inner')

In [9]:
# stage the joined data in a temporary CSV (index omitted) so less data has to stay in memory
combined_logs_path = "combined_logs_initial.csv"
combined_logs_df.to_csv(combined_logs_path, index=False)

In [2]:
# reload the combined user + transaction logs from the temporary CSV
combined_logs_reader = pd.read_csv(
    filepath_or_buffer="combined_logs_initial.csv"
)

KeyboardInterrupt: 

In [None]:
# load the pre-processed membership details
members_path = "members_pre.csv"
members_pre_reader = pd.read_csv(members_path)

In [None]:
# merge member demographics into the existing data
# A left join keeps every row of the combined logs even when no membership
# record matches. The result is bound to `analysis_data`, the name the
# following cells read; binding it to a different name leaves them failing
# with a NameError on a fresh kernel.
analysis_data = combined_logs_reader.merge(members_pre_reader, on = 'msno', how = 'left')

In [None]:
# check if we have the expected number of dimensions:
# .shape displays the (row count, column count) of the merged dataset
analysis_data.shape

In [None]:
# drop the leftover columns that were generated from saved indexes;
# Index.difference ignores labels that are absent and returns the kept labels sorted
index_artifacts = ['Unnamed: 0', 'Unnamed: 0.1']
kept_columns = analysis_data.columns.difference(index_artifacts)
analysis_data = analysis_data[kept_columns]

In [None]:
# save the final analysis dataset as CSV, leaving out the index
output_path = "analysis_dataset2.csv"
analysis_data.to_csv(output_path, index=False)