In [1]:
import pandas as pd
import warnings

warnings.simplefilter("ignore")

DATA_PATH: str = "data/5ch"

uselog_months = pd.read_csv(f"{DATA_PATH}/use_log_months.csv")
customer = pd.read_csv(f"{DATA_PATH}/customer_join.csv")

In [2]:
uselog_months

Unnamed: 0,연월,customer_id,count
0,201804,AS002855,4
1,201804,AS009013,2
2,201804,AS009373,3
3,201804,AS015315,6
4,201804,AS015739,7
...,...,...,...
36837,201903,TS995853,8
36838,201903,TS998593,8
36839,201903,TS999079,3
36840,201903,TS999231,6


In [3]:
customer

Unnamed: 0,customer_id,name,class,gender,start_date,end_date,campaign_id,is_deleted,class_name,price,campaign_name,mean,median,max,min,routine_flg,calc_date,membership_period
0,OA832399,XXXX,C01,F,2015-05-01,,CA1,0,종일,10500,일반,4.833333,5.0,8,2,1,2019-04-30,47
1,PL270116,XXXXX,C01,M,2015-05-01,,CA1,0,종일,10500,일반,5.083333,5.0,7,3,1,2019-04-30,47
2,OA974876,XXXXX,C01,M,2015-05-01,,CA1,0,종일,10500,일반,4.583333,5.0,6,3,1,2019-04-30,47
3,HD024127,XXXXX,C01,F,2015-05-01,,CA1,0,종일,10500,일반,4.833333,4.5,7,2,1,2019-04-30,47
4,HD661448,XXXXX,C03,F,2015-05-01,,CA1,0,야간,6000,일반,3.916667,4.0,6,1,1,2019-04-30,47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4187,HD676663,XXXX,C01,M,2019-03-14,,CA1,0,종일,10500,일반,8.000000,8.0,8,8,0,2019-04-30,1
4188,HD246549,XXXXX,C01,F,2019-03-14,,CA1,0,종일,10500,일반,10.000000,10.0,10,10,0,2019-04-30,1
4189,GD037007,XXXXX,C03,M,2019-03-14,,CA1,0,야간,6000,일반,8.000000,8.0,8,8,0,2019-04-30,1
4190,OA953150,XXXXX,C01,M,2019-03-14,,CA1,0,종일,10500,일반,11.000000,11.0,11,11,0,2019-04-30,1


In [4]:
uselog_months.isnull().sum()

연월             0
customer_id    0
count          0
dtype: int64

In [5]:
customer.isnull().sum()

customer_id             0
name                    0
class                   0
gender                  0
start_date              0
end_date             2842
campaign_id             0
is_deleted              0
class_name              0
price                   0
campaign_name           0
mean                    0
median                  0
max                     0
min                     0
routine_flg             0
calc_date               0
membership_period       0
dtype: int64

### What Makes Quitting Membership?

- First, extract all members information with visiting times this month and last month
- Second, extract all quiited member with visiting times at quit month and previous month

In [6]:
# First Process
year_month = list(uselog_months["연월"].unique())
uselog = pd.DataFrame()
for i in range(1, len(year_month)):
    tmp = uselog_months[uselog_months["연월"] == year_month[i]]
    tmp_add = uselog_months[uselog_months["연월"] == year_month[i-1]]
    del tmp_add["연월"]
    tmp_add.rename(columns={"count": "cnt_last_mon"}, inplace=True)
    tmp = pd.merge(tmp, tmp_add, on="customer_id", how="left")
    uselog = pd.concat([uselog, tmp], ignore_index=False)

uselog

Unnamed: 0,연월,customer_id,count,cnt_last_mon
0,201805,AS002855,5,4.0
1,201805,AS009373,4,3.0
2,201805,AS015233,7,
3,201805,AS015315,3,6.0
4,201805,AS015739,5,7.0
...,...,...,...,...
2948,201903,TS995853,8,11.0
2949,201903,TS998593,8,7.0
2950,201903,TS999079,3,2.0
2951,201903,TS999231,6,6.0


In [7]:
# Second Process
from dateutil.relativedelta import relativedelta

exit_customer = customer[customer["is_deleted"] == 1]
exit_customer["exit_date"] = None
exit_customer["end_date"] = pd.to_datetime(exit_customer["end_date"])
for i in range(len(exit_customer)):
    exit_customer["exit_date"].iloc[i] = exit_customer["end_date"].iloc[i] - relativedelta(months=1)

exit_customer["연월"] = exit_customer["end_date"].dt.strftime("%Y%m")

In [8]:
uselog.dtypes

연월                int64
customer_id      object
count             int64
cnt_last_mon    float64
dtype: object

In [9]:
uselog["연월"] = uselog["연월"].astype(str)
exit_uselog = pd.merge(uselog, exit_customer, on=["customer_id", "연월"], how="left")
exit_uselog

Unnamed: 0,연월,customer_id,count,cnt_last_mon,name,class,gender,start_date,end_date,campaign_id,...,price,campaign_name,mean,median,max,min,routine_flg,calc_date,membership_period,exit_date
0,201805,AS002855,5,4.0,,,,,NaT,,...,,,,,,,,,,
1,201805,AS009373,4,3.0,,,,,NaT,,...,,,,,,,,,,
2,201805,AS015233,7,,,,,,NaT,,...,,,,,,,,,,
3,201805,AS015315,3,6.0,,,,,NaT,,...,,,,,,,,,,
4,201805,AS015739,5,7.0,,,,,NaT,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33846,201903,TS995853,8,11.0,,,,,NaT,,...,,,,,,,,,,
33847,201903,TS998593,8,7.0,,,,,NaT,,...,,,,,,,,,,
33848,201903,TS999079,3,2.0,,,,,NaT,,...,,,,,,,,,,
33849,201903,TS999231,6,6.0,,,,,NaT,,...,,,,,,,,,,


In [10]:
exit_uselog.isnull().sum()

연월                       0
customer_id              0
count                    0
cnt_last_mon          1201
name                 32622
class                32622
gender               32622
start_date           32622
end_date             32622
campaign_id          32622
is_deleted           32622
class_name           32622
price                32622
campaign_name        32622
mean                 32622
median               32622
max                  32622
min                  32622
routine_flg          32622
calc_date            32622
membership_period    32622
exit_date            32622
dtype: int64

In [11]:
exit_uselog.dropna(subset=["name"], inplace=True)

# Information for quitted members with cnt_last_mon which is a previous month of quitting month.
exit_uselog

Unnamed: 0,연월,customer_id,count,cnt_last_mon,name,class,gender,start_date,end_date,campaign_id,...,price,campaign_name,mean,median,max,min,routine_flg,calc_date,membership_period,exit_date
10,201805,AS030404,1,1.0,XXXX,C01,M,2017-05-01,2018-05-31,CA1,...,10500.0,일반,1.00,1.0,1.0,1.0,0.0,2018-05-31,12.0,2018-04-30 00:00:00
74,201805,AS206541,4,1.0,XXXX,C03,M,2018-01-01,2018-05-31,CA1,...,6000.0,일반,2.50,2.5,4.0,1.0,1.0,2018-05-31,4.0,2018-04-30 00:00:00
125,201805,AS354246,6,8.0,XXXXX,C01,M,2018-04-13,2018-05-31,CA3,...,10500.0,입회비무료,7.00,7.0,8.0,6.0,0.0,2018-05-31,1.0,2018-04-30 00:00:00
135,201805,AS383234,2,4.0,XXXXXX,C02,M,2016-12-01,2018-05-31,CA1,...,7500.0,일반,3.00,3.0,4.0,2.0,0.0,2018-05-31,17.0,2018-04-30 00:00:00
174,201805,AS480037,1,2.0,XXXX,C02,F,2017-09-01,2018-05-31,CA1,...,7500.0,일반,1.50,1.5,2.0,1.0,0.0,2018-05-31,8.0,2018-04-30 00:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33717,201903,TS645212,1,4.0,XXXX,C03,F,2018-03-01,2019-03-31,CA1,...,6000.0,일반,4.50,4.5,7.0,1.0,0.0,2019-03-31,12.0,2019-02-28 00:00:00
33745,201903,TS741703,6,5.0,XXXX,C03,M,2018-12-08,2019-03-31,CA3,...,6000.0,입회비무료,6.25,6.0,8.0,5.0,0.0,2019-03-31,3.0,2019-02-28 00:00:00
33782,201903,TS859258,1,1.0,XXXXX,C02,F,2018-12-07,2019-03-31,CA3,...,7500.0,입회비무료,2.50,2.0,5.0,1.0,0.0,2019-03-31,3.0,2019-02-28 00:00:00
33798,201903,TS886985,3,5.0,XXX,C02,F,2018-03-01,2019-03-31,CA1,...,7500.0,일반,4.25,4.0,7.0,2.0,1.0,2019-03-31,12.0,2019-02-28 00:00:00


#### Analysis

- At glance, visiting time on the month when the user quits membership and visiting times on previous month does not have any relationship with exiting membership.

#### Then, How about Current Members?

In [12]:
stay_customer = customer[customer["is_deleted"] == 0]
stay_uselog = pd.merge(uselog, stay_customer, on="customer_id", how="left")
stay_uselog.dropna(subset=["name"], inplace=True)
stay_uselog

Unnamed: 0,연월,customer_id,count,cnt_last_mon,name,class,gender,start_date,end_date,campaign_id,...,class_name,price,campaign_name,mean,median,max,min,routine_flg,calc_date,membership_period
0,201805,AS002855,5,4.0,XXXX,C03,F,2016-11-01,,CA1,...,야간,6000.0,일반,4.500000,5.0,7.0,2.0,1.0,2019-04-30,29.0
1,201805,AS009373,4,3.0,XX,C01,F,2015-11-01,,CA1,...,종일,10500.0,일반,5.083333,5.0,7.0,3.0,1.0,2019-04-30,41.0
2,201805,AS015233,7,,XXXXX,C01,M,2018-05-13,,CA2,...,종일,10500.0,입회비반액할인,7.545455,7.0,11.0,4.0,1.0,2019-04-30,11.0
3,201805,AS015315,3,6.0,XXXXX,C01,M,2015-07-01,,CA1,...,종일,10500.0,일반,4.833333,5.0,7.0,3.0,1.0,2019-04-30,45.0
4,201805,AS015739,5,7.0,XXXXX,C03,M,2017-06-01,,CA1,...,야간,6000.0,일반,5.583333,5.5,8.0,4.0,1.0,2019-04-30,22.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33846,201903,TS995853,8,11.0,XXXX,C01,M,2019-02-08,,CA1,...,종일,10500.0,일반,9.500000,9.5,11.0,8.0,1.0,2019-04-30,2.0
33847,201903,TS998593,8,7.0,XXXXX,C03,M,2018-09-01,,CA1,...,야간,6000.0,일반,8.142857,8.0,9.0,7.0,1.0,2019-04-30,7.0
33848,201903,TS999079,3,2.0,XXX,C03,M,2016-06-01,,CA1,...,야간,6000.0,일반,4.916667,5.5,9.0,2.0,1.0,2019-04-30,34.0
33849,201903,TS999231,6,6.0,XXXX,C01,M,2017-03-01,,CA1,...,종일,10500.0,일반,4.666667,5.0,8.0,1.0,1.0,2019-04-30,25.0


- There is unbalance numbers of data between quit members(1229) and current members(27422)
- can not use data for learning.
  => remove duplicate customer_id in stay_uselog

In [13]:
print(len(stay_uselog["customer_id"].unique()))

2842


In [14]:
# remake stay_uselog 
# mixing current data in random
stay_uselog = stay_uselog.sample(frac=1).reset_index(drop=True)
stay_uselog

Unnamed: 0,연월,customer_id,count,cnt_last_mon,name,class,gender,start_date,end_date,campaign_id,...,class_name,price,campaign_name,mean,median,max,min,routine_flg,calc_date,membership_period
0,201805,OA576440,4,6.0,XXXXX,C01,F,2016-09-01,,CA1,...,종일,10500.0,일반,4.666667,4.5,7.0,3.0,1.0,2019-04-30,31.0
1,201805,OA822905,4,5.0,XXXX,C01,F,2016-07-01,,CA2,...,종일,10500.0,입회비반액할인,5.333333,6.0,7.0,3.0,1.0,2019-04-30,33.0
2,201812,GD620805,4,5.0,XXX,C01,F,2016-10-01,,CA1,...,종일,10500.0,일반,5.250000,5.0,7.0,3.0,1.0,2019-04-30,30.0
3,201806,PL783702,6,3.0,XXXXX,C03,F,2016-12-01,,CA1,...,야간,6000.0,일반,4.166667,4.5,6.0,2.0,1.0,2019-04-30,28.0
4,201812,PL863680,5,6.0,XXXXX,C01,M,2016-12-01,,CA1,...,종일,10500.0,일반,5.500000,5.5,8.0,3.0,1.0,2019-04-30,28.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27417,201811,IK266002,3,5.0,XXXX,C01,M,2015-09-01,,CA1,...,종일,10500.0,일반,4.416667,5.0,7.0,1.0,1.0,2019-04-30,43.0
27418,201812,GD998772,5,6.0,XXX,C01,F,2017-01-01,,CA1,...,종일,10500.0,일반,5.833333,6.0,9.0,2.0,1.0,2019-04-30,27.0
27419,201810,HI957106,6,5.0,XXXX,C03,M,2017-01-01,,CA1,...,야간,6000.0,일반,5.500000,6.0,7.0,4.0,1.0,2019-04-30,27.0
27420,201809,OA909231,9,8.0,XX,C03,M,2018-01-01,,CA1,...,야간,6000.0,일반,7.083333,7.0,12.0,3.0,1.0,2019-04-30,15.0


In [15]:
# select one sample between duplicated customer_id in stay_uselog
stay_uselog = stay_uselog.drop_duplicates(subset="customer_id")
stay_uselog

Unnamed: 0,연월,customer_id,count,cnt_last_mon,name,class,gender,start_date,end_date,campaign_id,...,class_name,price,campaign_name,mean,median,max,min,routine_flg,calc_date,membership_period
0,201805,OA576440,4,6.0,XXXXX,C01,F,2016-09-01,,CA1,...,종일,10500.0,일반,4.666667,4.5,7.0,3.0,1.0,2019-04-30,31.0
1,201805,OA822905,4,5.0,XXXX,C01,F,2016-07-01,,CA2,...,종일,10500.0,입회비반액할인,5.333333,6.0,7.0,3.0,1.0,2019-04-30,33.0
2,201812,GD620805,4,5.0,XXX,C01,F,2016-10-01,,CA1,...,종일,10500.0,일반,5.250000,5.0,7.0,3.0,1.0,2019-04-30,30.0
3,201806,PL783702,6,3.0,XXXXX,C03,F,2016-12-01,,CA1,...,야간,6000.0,일반,4.166667,4.5,6.0,2.0,1.0,2019-04-30,28.0
4,201812,PL863680,5,6.0,XXXXX,C01,M,2016-12-01,,CA1,...,종일,10500.0,일반,5.500000,5.5,8.0,3.0,1.0,2019-04-30,28.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25457,201903,PL545617,9,,XX,C01,F,2019-03-01,,CA1,...,종일,10500.0,일반,9.000000,9.0,9.0,9.0,1.0,2019-04-30,1.0
25491,201903,TS878534,11,,XXXXX,C03,M,2019-03-12,,CA1,...,야간,6000.0,일반,11.000000,11.0,11.0,11.0,0.0,2019-04-30,1.0
25651,201903,IK259509,10,,XXXX,C01,F,2019-03-06,,CA1,...,종일,10500.0,일반,10.000000,10.0,10.0,10.0,1.0,2019-04-30,1.0
26333,201903,GD625169,8,,XXXX,C03,F,2019-03-06,,CA1,...,야간,6000.0,일반,8.000000,8.0,8.0,8.0,0.0,2019-04-30,1.0


#### Merge quit member + stay member information for supervisor learning

In [16]:
predict_data = pd.concat([exit_uselog, stay_uselog], ignore_index=True)
predict_data

Unnamed: 0,연월,customer_id,count,cnt_last_mon,name,class,gender,start_date,end_date,campaign_id,...,price,campaign_name,mean,median,max,min,routine_flg,calc_date,membership_period,exit_date
0,201805,AS030404,1,1.0,XXXX,C01,M,2017-05-01,2018-05-31,CA1,...,10500.0,일반,1.0,1.0,1.0,1.0,0.0,2018-05-31,12.0,2018-04-30 00:00:00
1,201805,AS206541,4,1.0,XXXX,C03,M,2018-01-01,2018-05-31,CA1,...,6000.0,일반,2.5,2.5,4.0,1.0,1.0,2018-05-31,4.0,2018-04-30 00:00:00
2,201805,AS354246,6,8.0,XXXXX,C01,M,2018-04-13,2018-05-31,CA3,...,10500.0,입회비무료,7.0,7.0,8.0,6.0,0.0,2018-05-31,1.0,2018-04-30 00:00:00
3,201805,AS383234,2,4.0,XXXXXX,C02,M,2016-12-01,2018-05-31,CA1,...,7500.0,일반,3.0,3.0,4.0,2.0,0.0,2018-05-31,17.0,2018-04-30 00:00:00
4,201805,AS480037,1,2.0,XXXX,C02,F,2017-09-01,2018-05-31,CA1,...,7500.0,일반,1.5,1.5,2.0,1.0,0.0,2018-05-31,8.0,2018-04-30 00:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4066,201903,PL545617,9,,XX,C01,F,2019-03-01,NaT,CA1,...,10500.0,일반,9.0,9.0,9.0,9.0,1.0,2019-04-30,1.0,
4067,201903,TS878534,11,,XXXXX,C03,M,2019-03-12,NaT,CA1,...,6000.0,일반,11.0,11.0,11.0,11.0,0.0,2019-04-30,1.0,
4068,201903,IK259509,10,,XXXX,C01,F,2019-03-06,NaT,CA1,...,10500.0,일반,10.0,10.0,10.0,10.0,1.0,2019-04-30,1.0,
4069,201903,GD625169,8,,XXXX,C03,F,2019-03-06,NaT,CA1,...,6000.0,일반,8.0,8.0,8.0,8.0,0.0,2019-04-30,1.0,


In [17]:
# add variable 'period' for Time Series Analysis
predict_data["period"] = 0

predict_data["start_date"] = pd.to_datetime(predict_data["start_date"])
predict_data["date_now"] = pd.to_datetime(predict_data["연월"], format="%Y%m")
predict_data

Unnamed: 0,연월,customer_id,count,cnt_last_mon,name,class,gender,start_date,end_date,campaign_id,...,mean,median,max,min,routine_flg,calc_date,membership_period,exit_date,period,date_now
0,201805,AS030404,1,1.0,XXXX,C01,M,2017-05-01,2018-05-31,CA1,...,1.0,1.0,1.0,1.0,0.0,2018-05-31,12.0,2018-04-30 00:00:00,0,2018-05-01
1,201805,AS206541,4,1.0,XXXX,C03,M,2018-01-01,2018-05-31,CA1,...,2.5,2.5,4.0,1.0,1.0,2018-05-31,4.0,2018-04-30 00:00:00,0,2018-05-01
2,201805,AS354246,6,8.0,XXXXX,C01,M,2018-04-13,2018-05-31,CA3,...,7.0,7.0,8.0,6.0,0.0,2018-05-31,1.0,2018-04-30 00:00:00,0,2018-05-01
3,201805,AS383234,2,4.0,XXXXXX,C02,M,2016-12-01,2018-05-31,CA1,...,3.0,3.0,4.0,2.0,0.0,2018-05-31,17.0,2018-04-30 00:00:00,0,2018-05-01
4,201805,AS480037,1,2.0,XXXX,C02,F,2017-09-01,2018-05-31,CA1,...,1.5,1.5,2.0,1.0,0.0,2018-05-31,8.0,2018-04-30 00:00:00,0,2018-05-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4066,201903,PL545617,9,,XX,C01,F,2019-03-01,NaT,CA1,...,9.0,9.0,9.0,9.0,1.0,2019-04-30,1.0,,0,2019-03-01
4067,201903,TS878534,11,,XXXXX,C03,M,2019-03-12,NaT,CA1,...,11.0,11.0,11.0,11.0,0.0,2019-04-30,1.0,,0,2019-03-01
4068,201903,IK259509,10,,XXXX,C01,F,2019-03-06,NaT,CA1,...,10.0,10.0,10.0,10.0,1.0,2019-04-30,1.0,,0,2019-03-01
4069,201903,GD625169,8,,XXXX,C03,F,2019-03-06,NaT,CA1,...,8.0,8.0,8.0,8.0,0.0,2019-04-30,1.0,,0,2019-03-01


In [18]:
for i in range(len(predict_data)):
    rd = relativedelta(predict_data["start_date"][i], predict_data["date_now"][i])
    predict_data["period"][i] = abs(rd.years * 12 + rd.months)

predict_data

Unnamed: 0,연월,customer_id,count,cnt_last_mon,name,class,gender,start_date,end_date,campaign_id,...,mean,median,max,min,routine_flg,calc_date,membership_period,exit_date,period,date_now
0,201805,AS030404,1,1.0,XXXX,C01,M,2017-05-01,2018-05-31,CA1,...,1.0,1.0,1.0,1.0,0.0,2018-05-31,12.0,2018-04-30 00:00:00,12,2018-05-01
1,201805,AS206541,4,1.0,XXXX,C03,M,2018-01-01,2018-05-31,CA1,...,2.5,2.5,4.0,1.0,1.0,2018-05-31,4.0,2018-04-30 00:00:00,4,2018-05-01
2,201805,AS354246,6,8.0,XXXXX,C01,M,2018-04-13,2018-05-31,CA3,...,7.0,7.0,8.0,6.0,0.0,2018-05-31,1.0,2018-04-30 00:00:00,0,2018-05-01
3,201805,AS383234,2,4.0,XXXXXX,C02,M,2016-12-01,2018-05-31,CA1,...,3.0,3.0,4.0,2.0,0.0,2018-05-31,17.0,2018-04-30 00:00:00,17,2018-05-01
4,201805,AS480037,1,2.0,XXXX,C02,F,2017-09-01,2018-05-31,CA1,...,1.5,1.5,2.0,1.0,0.0,2018-05-31,8.0,2018-04-30 00:00:00,8,2018-05-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4066,201903,PL545617,9,,XX,C01,F,2019-03-01,NaT,CA1,...,9.0,9.0,9.0,9.0,1.0,2019-04-30,1.0,,0,2019-03-01
4067,201903,TS878534,11,,XXXXX,C03,M,2019-03-12,NaT,CA1,...,11.0,11.0,11.0,11.0,0.0,2019-04-30,1.0,,0,2019-03-01
4068,201903,IK259509,10,,XXXX,C01,F,2019-03-06,NaT,CA1,...,10.0,10.0,10.0,10.0,1.0,2019-04-30,1.0,,0,2019-03-01
4069,201903,GD625169,8,,XXXX,C03,F,2019-03-06,NaT,CA1,...,8.0,8.0,8.0,8.0,0.0,2019-04-30,1.0,,0,2019-03-01


In [19]:
predict_data.isnull().sum()

연월                      0
customer_id             0
count                   0
cnt_last_mon          194
name                    0
class                   0
gender                  0
start_date              0
end_date             2842
campaign_id             0
is_deleted              0
class_name              0
price                   0
campaign_name           0
mean                    0
median                  0
max                     0
min                     0
routine_flg             0
calc_date               0
membership_period       0
exit_date            2842
period                  0
date_now                0
dtype: int64

- cnt_last_mon == null => signup members this month
- end_date, exit_date == null => staying members

=> remove newbie data

In [20]:
predict_data.dropna(subset=["cnt_last_mon"], inplace=True)
predict_data

Unnamed: 0,연월,customer_id,count,cnt_last_mon,name,class,gender,start_date,end_date,campaign_id,...,mean,median,max,min,routine_flg,calc_date,membership_period,exit_date,period,date_now
0,201805,AS030404,1,1.0,XXXX,C01,M,2017-05-01,2018-05-31,CA1,...,1.000000,1.0,1.0,1.0,0.0,2018-05-31,12.0,2018-04-30 00:00:00,12,2018-05-01
1,201805,AS206541,4,1.0,XXXX,C03,M,2018-01-01,2018-05-31,CA1,...,2.500000,2.5,4.0,1.0,1.0,2018-05-31,4.0,2018-04-30 00:00:00,4,2018-05-01
2,201805,AS354246,6,8.0,XXXXX,C01,M,2018-04-13,2018-05-31,CA3,...,7.000000,7.0,8.0,6.0,0.0,2018-05-31,1.0,2018-04-30 00:00:00,0,2018-05-01
3,201805,AS383234,2,4.0,XXXXXX,C02,M,2016-12-01,2018-05-31,CA1,...,3.000000,3.0,4.0,2.0,0.0,2018-05-31,17.0,2018-04-30 00:00:00,17,2018-05-01
4,201805,AS480037,1,2.0,XXXX,C02,F,2017-09-01,2018-05-31,CA1,...,1.500000,1.5,2.0,1.0,0.0,2018-05-31,8.0,2018-04-30 00:00:00,8,2018-05-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4045,201903,OA148999,6,10.0,XXXXX,C03,M,2019-02-12,NaT,CA1,...,8.000000,8.0,10.0,6.0,0.0,2019-04-30,2.0,,0,2019-03-01
4047,201903,PL502162,9,8.0,XXXXX,C03,M,2019-02-06,NaT,CA1,...,8.500000,8.5,9.0,8.0,1.0,2019-04-30,2.0,,0,2019-03-01
4054,201903,TS566915,9,14.0,XXXXX,C03,M,2019-02-07,NaT,CA1,...,11.500000,11.5,14.0,9.0,1.0,2019-04-30,2.0,,0,2019-03-01
4063,201903,AS568373,6,8.0,XXXXX,C01,F,2019-02-10,NaT,CA1,...,7.000000,7.0,8.0,6.0,1.0,2019-04-30,2.0,,0,2019-03-01


### Change Category Variables

-  M or F in gender, CA1, CA2 in campaign_id  =>  classified by string (Category Variables)
-  To apply ML these data, it is required to change categories to integers

In [21]:
# extract required column for ML first
data = predict_data[["campaign_name", "class_name", "gender", "cnt_last_mon", "period", "is_deleted", "routine_flg"]]
data

Unnamed: 0,campaign_name,class_name,gender,cnt_last_mon,period,is_deleted,routine_flg
0,일반,종일,M,1.0,12,1.0,0.0
1,일반,야간,M,1.0,4,1.0,1.0
2,입회비무료,종일,M,8.0,0,1.0,0.0
3,일반,주간,M,4.0,17,1.0,0.0
4,일반,주간,F,2.0,8,1.0,0.0
...,...,...,...,...,...,...,...
4045,일반,야간,M,10.0,0,0.0,0.0
4047,일반,야간,M,8.0,0,0.0,1.0
4054,일반,야간,M,14.0,0,0.0,1.0
4063,일반,종일,F,8.0,0,0.0,1.0


In [22]:
# convert data
data = pd.get_dummies(data=data)
data

Unnamed: 0,cnt_last_mon,period,is_deleted,routine_flg,campaign_name_일반,campaign_name_입회비무료,campaign_name_입회비반액할인,class_name_야간,class_name_종일,class_name_주간,gender_F,gender_M
0,1.0,12,1.0,0.0,True,False,False,False,True,False,False,True
1,1.0,4,1.0,1.0,True,False,False,True,False,False,False,True
2,8.0,0,1.0,0.0,False,True,False,False,True,False,False,True
3,4.0,17,1.0,0.0,True,False,False,False,False,True,False,True
4,2.0,8,1.0,0.0,True,False,False,False,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...
4045,10.0,0,0.0,0.0,True,False,False,True,False,False,False,True
4047,8.0,0,0.0,1.0,True,False,False,True,False,False,False,True
4054,14.0,0,0.0,1.0,True,False,False,True,False,False,False,True
4063,8.0,0,0.0,1.0,True,False,False,False,True,False,True,False


In [23]:
# remove unnecessary columns
del data["gender_M"]

In [24]:
data

Unnamed: 0,cnt_last_mon,period,is_deleted,routine_flg,campaign_name_일반,campaign_name_입회비무료,campaign_name_입회비반액할인,class_name_야간,class_name_종일,class_name_주간,gender_F
0,1.0,12,1.0,0.0,True,False,False,False,True,False,False
1,1.0,4,1.0,1.0,True,False,False,True,False,False,False
2,8.0,0,1.0,0.0,False,True,False,False,True,False,False
3,4.0,17,1.0,0.0,True,False,False,False,False,True,False
4,2.0,8,1.0,0.0,True,False,False,False,False,True,True
...,...,...,...,...,...,...,...,...,...,...,...
4045,10.0,0,0.0,0.0,True,False,False,True,False,False,False
4047,8.0,0,0.0,1.0,True,False,False,True,False,False,False
4054,14.0,0,0.0,1.0,True,False,False,True,False,False,False
4063,8.0,0,0.0,1.0,True,False,False,False,True,False,True


### Decision Tree to Predict Members Behavior

- A decision tree is a visual representation of the potential outcomes of a series of choices.

In [25]:
# import DecisionTree package
from sklearn.tree import DecisionTreeClassifier

# import model_selection for train_test_split
from sklearn import model_selection

exit = data[data["is_deleted"] == 1]
# make the number of stay same as the number of exit, with sample
stay = data[data["is_deleted"] == 0].sample(len(exit))

print(exit.shape)
print(stay.shape)

(1229, 11)
(1229, 11)


In [26]:
# create train and test data
X = pd.concat([exit, stay], ignore_index=True)
Y = X["is_deleted"]
del X["is_deleted"]

In [27]:
# learning with Decision Tree
model = DecisionTreeClassifier()

# splite data into test and train
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, Y)

# learning train data
model.fit(x_train, y_train)

# predict test data
y_pred = model.predict(x_test)

# compare with real test data
result = pd.DataFrame({"y_test": y_test, "y_pred": y_pred})
result

Unnamed: 0,y_test,y_pred
614,1.0,1.0
1421,0.0,0.0
1423,0.0,0.0
1958,0.0,1.0
706,1.0,1.0
...,...,...
1795,0.0,0.0
879,1.0,1.0
769,1.0,1.0
2061,0.0,0.0


In [28]:
# Accuracy of Prediction?
correct = len(result[result["y_test"] == result["y_pred"]])
total = len(result)

# same as 
# print(model.score(x_train, y_train))
# print(model.score(x_test, y_test))
print(f"Accuracy of Learning: {model.score(x_train, y_train)} %")
print(f"Accuracy of Prediction: {correct * 100/ total} %")

Accuracy of Learning: 0.9837221920781335 %
Accuracy of Prediction: 89.4308943089431 %


### Tuning Model to Increase Accuracy

- Learning Accuracy is higher than that of prediction, and difference is quite big.
  => Overfit to learning(train) data.
- Trimming the depth of tree (simplifying model with decreasing depth)

In [29]:
# recreate model instance with limited tree depth
model = DecisionTreeClassifier(max_depth=5)

# learning on new model instance
model.fit(x_train, y_train)

print(f"Accuracy of Learning: {model.score(x_train, y_train)} %")
print(f"Accuracy of Prediction: {model.score(x_test, y_test)} %")

Accuracy of Learning: 0.9386869234943027 %
Accuracy of Prediction: 0.9300813008130081 %


In [30]:
# check effective variables
coef = pd.DataFrame({
    "feature_names": X.columns,
    "coef": model.feature_importances_   # 변수 중요도; model마다 method name이 다름.
})

coef

Unnamed: 0,feature_names,coef
0,cnt_last_mon,0.427876
1,period,0.381376
2,routine_flg,0.167937
3,campaign_name_일반,0.011754
4,campaign_name_입회비무료,0.006857
5,campaign_name_입회비반액할인,0.0
6,class_name_야간,0.0
7,class_name_종일,0.001413
8,class_name_주간,0.002786
9,gender_F,0.0


### Predict Unknown Data

In [52]:
# new data
cnt_last_mon = 5
routine_flag = 1
period = 10
campaign_name = "일반"
class_name = "종일"
gender = "M"


In [53]:
# systemize unknown data

gender_list = [ gender == "M" ]
class_list = [ class_name == "야간", class_name =="종일", class_name == "주간" ]
campaign_list = [ campaign_name == "일반", campaign_name == "입회비무료", campaign_name == "입회비반액할인"]

unknown_data = [cnt_last_mon, period, routine_flag] + campaign_list + class_list + gender_list
quit_prediction = model.predict([unknown_data])
result = "quit" if quit_prediction == 1 else "Stay"
probability = model.predict_proba([unknown_data])

print(f"Prediction: {result}")
print(f"Probability of Result: Stay={probability[0][0]}, Quit={probability[0][1]}")

Prediction: Stay
Probability of Result: Stay=0.831858407079646, Quit=0.168141592920354
