In [28]:
#!/usr/bin/python3
# -*-coding:utf-8
'''
Created on Fri Dec 1 22:22:35 2017

@author: Ray

'''
import time
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import utils # written by author
from glob import glob
from datetime import datetime, timedelta
import multiprocessing as mp
import gc # for automatic releasing memory


##################################################
# Load transaction 
##################################################
input_col = ['payment_plan_days','transaction_date', 'membership_expire_date']
transactions = utils.read_multiple_csv('../../input/preprocessed_data/transactions',input_col)

#==============================================================================
print('reduce memory')
#==============================================================================
utils.reduce_memory(transactions)

100%|██████████| 4/4 [00:28<00:00,  7.24s/it]


reduce memory


100%|██████████| 3/3 [00:00<00:00, 966.21it/s]


In [29]:
transactions = transactions.head(n = 500)
transactions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500 entries, 0 to 499
Data columns (total 3 columns):
payment_plan_days         500 non-null int16
transaction_date          500 non-null object
membership_expire_date    500 non-null object
dtypes: int16(1), object(2)
memory usage: 12.7+ KB


In [31]:
##################################################
# Convert string to datetime format
##################################################
transactions['membership_expire_date']  = transactions.membership_expire_date.apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))
transactions['transaction_date']  = transactions.transaction_date.apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))

##################################################
# For membership_loyalty
##################################################
transactions['membership_duration'] = [i.days for i in (transactions.membership_expire_date - transactions.transaction_date)]
transactions['is_membership_duration_equal_to_plan_days'] = [1 if m_d==p_d else 0 for m_d, p_d in transactions[['membership_duration','payment_plan_days']].values]
transactions['is_membership_duration_longer_than_plan_days'] = [1 if m_d > p_d else 0 for m_d, p_d in transactions[['membership_duration','payment_plan_days']].values]
transactions['days_longer_than_plan_days'] = [i if i > 0 else 0 for i in (transactions.membership_duration - transactions.payment_plan_days)]

##################################################
# 到期日在交易日之前...
##################################################

transactions['is_early_expiration'] = [1 if i.days < 0 else 0 for i in (transactions.membership_expire_date - transactions.transaction_date)]
transactions['early_expiration_days'] = [-i.days if i.days < 0 else 0 for i in (transactions.membership_expire_date - transactions.transaction_date)]


In [32]:
transactions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500 entries, 0 to 499
Data columns (total 9 columns):
payment_plan_days                               500 non-null int16
transaction_date                                500 non-null datetime64[ns]
membership_expire_date                          500 non-null datetime64[ns]
membership_duration                             500 non-null int64
is_membership_duration_equal_to_plan_days       500 non-null int64
is_membership_duration_longer_than_plan_days    500 non-null int64
days_longer_than_plan_days                      500 non-null int64
is_early_expiration                             500 non-null int64
early_expiration_days                           500 non-null int64
dtypes: datetime64[ns](2), int16(1), int64(6)
memory usage: 56.1 KB


In [34]:
def reduce_memory(df, ix_start=0):
	# There are two concepts: 1. change type, 2 drop NaN(missing) value
    #df.fillna(-1, inplace=True)
    if df.shape[0] <= 500:
        df_ = df.sample(50, random_state=71) # sampling for the following comoparison
    else:
        df_ = df.sample(9999, random_state=71)
    ## int
    col_int8 = []
    col_int16 = []
    col_int32 = []
    for c in tqdm(df.columns[ix_start:], miniters=20):
        if df[c].dtype =='O':
        	# '0': Objects
            continue
        elif df[c].dtype == 'datetime64[ns]':
            continue
        elif (df_[c] == df_[c].astype(np.int8)).all():
        	# Series.all():return true if all elements in Series is True
            col_int8.append(c)
        elif (df_[c] == df_[c].astype(np.int16)).all():
            col_int16.append(c)
        elif (df_[c] == df_[c].astype(np.int32)).all():
            col_int32.append(c)
    
    df[col_int8]  = df[col_int8].astype(np.int8)
    df[col_int16] = df[col_int16].astype(np.int16)
    df[col_int32] = df[col_int32].astype(np.int32)
    
    ## float
    col = [c for c in df.dtypes[df.dtypes==np.float64].index]
    df[col] = df[col].astype(np.float32)

    gc.collect()
reduce_memory(transactions)


100%|██████████| 9/9 [00:00<00:00, 2836.33it/s]


In [35]:
transactions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500 entries, 0 to 499
Data columns (total 9 columns):
payment_plan_days                               500 non-null int16
transaction_date                                500 non-null datetime64[ns]
membership_expire_date                          500 non-null datetime64[ns]
membership_duration                             500 non-null int16
is_membership_duration_equal_to_plan_days       500 non-null int8
is_membership_duration_longer_than_plan_days    500 non-null int8
days_longer_than_plan_days                      500 non-null int8
is_early_expiration                             500 non-null int8
early_expiration_days                           500 non-null int8
dtypes: datetime64[ns](2), int16(2), int8(5)
memory usage: 36.1 KB


In [36]:
transactions.head()

Unnamed: 0,payment_plan_days,transaction_date,membership_expire_date,membership_duration,is_membership_duration_equal_to_plan_days,is_membership_duration_longer_than_plan_days,days_longer_than_plan_days,is_early_expiration,early_expiration_days
0,7,2016-09-09,2016-09-14,5,0,0,0,0,0
1,410,2015-11-21,2017-01-04,410,1,0,0,0,0
2,395,2016-10-23,2018-02-06,471,0,1,76,0,0
3,30,2016-11-16,2016-12-15,29,0,0,0,0,0
4,30,2016-12-15,2017-01-15,31,0,1,1,0,0


In [38]:
##################################################
# Load transaction 
##################################################
input_col = ['msno','plan_list_price','actual_amount_paid','payment_plan_days']
transactions = utils.read_multiple_csv('../../input/preprocessed_data/transactions',input_col)


100%|██████████| 4/4 [00:29<00:00,  7.25s/it]


In [39]:
transactions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22964064 entries, 0 to 5741015
Data columns (total 4 columns):
msno                  object
plan_list_price       int64
actual_amount_paid    int64
payment_plan_days     int64
dtypes: int64(3), object(1)
memory usage: 876.0+ MB


In [40]:
#==============================================================================
print('reduce memory')
#==============================================================================
utils.reduce_memory(transactions)

##################################################
# discount
##################################################
transactions['discount'] = transactions['plan_list_price'] - transactions['actual_amount_paid']
transactions['is_discount'] = transactions.discount.apply(lambda x: 1 if x > 0 else 0)

##################################################
# cp value
##################################################
transactions['amt_per_day'] = transactions['actual_amount_paid'] / transactions['payment_plan_days']
transactions['cp_value'] = transactions['plan_list_price'] / transactions['payment_plan_days']


reduce memory


100%|██████████| 4/4 [00:00<00:00, 1605.94it/s]


In [41]:
transactions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22964064 entries, 0 to 5741015
Data columns (total 8 columns):
msno                  object
plan_list_price       int16
actual_amount_paid    int16
payment_plan_days     int16
discount              int16
is_discount           int64
amt_per_day           float64
cp_value              float64
dtypes: float64(2), int16(4), int64(1), object(1)
memory usage: 1.0+ GB


In [42]:
utils.reduce_memory(transactions)


  0%|          | 0/8 [00:00<?, ?it/s]


ValueError: Cannot convert non-finite values (NA or inf) to integer