In [42]:
from pathlib import Path
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [2]:
# Processed data lives in <parent of the working directory>/data/processed_data
data_folder = Path.cwd().parents[0] / 'data' / 'processed_data'

In [3]:
# Load the feature-engineered modelling dataset from the processed-data folder
model_data_df = pd.read_csv(data_folder / 'model_dataset_with_feature_engg.csv')
# Preview the first five rows
model_data_df.head()

Unnamed: 0,msno,is_churn,city,bd,gender,registered_via,registration_init_time,payment_method_id,payment_plan_days,plan_list_price,...,avg_num_75,avg_num_985,avg_num_100,avg_num_unq,avg_total_secs,number_of_days_used,price_discount,is_discount,cut_off_date,num_days_as_member
0,++4RuqBw0Ss6bQU4oMxaRlbBPoWzoEiIZaxPM04Y4+U=,0,1.0,0.0,,7.0,2014-07-14,41,30,149,...,0.0,1.0,4.0,7.0,1368.19104,1.0,0,0,2017-02-28,960.0
1,+/HS8LzrRGXolKbxRzDLqrmwuXqPOYixBIPXkyNcKNI=,0,,,,,,40,30,149,...,,,,,,,0,0,2017-02-28,
2,+/namlXq+u3izRjHCFJV4MgqcXcLidZYszVsROOq/y4=,0,15.0,31.0,male,9.0,2006-06-03,34,30,149,...,1.076923,1.153846,27.923077,51.807692,7806.437843,26.0,0,0,2017-02-28,3923.0
3,+0/X9tkmyHyet9X80G6GTrDFHnJqvai8d1ZPhayT0os=,0,9.0,31.0,male,9.0,2004-03-30,34,30,149,...,0.793103,2.37931,49.310345,21.310345,12545.216048,29.0,0,0,2017-02-28,4718.0
4,+09YGn842g6h2EZUXe0VWeC4bBoCbDGfUboitc0vIHw=,0,15.0,29.0,male,9.0,2008-03-22,34,30,149,...,0.576923,0.961538,32.961538,26.230769,8618.035774,26.0,0,0,2017-02-28,3265.0


In [4]:
# Show the dtype pandas inferred for every column of the loaded dataset
model_data_df.dtypes

msno                       object
is_churn                    int64
city                      float64
bd                        float64
gender                     object
registered_via            float64
registration_init_time     object
payment_method_id           int64
payment_plan_days           int64
plan_list_price             int64
actual_amount_paid          int64
is_auto_renew               int64
transaction_date           object
membership_expire_date     object
is_cancel                   int64
avg_num_25                float64
avg_num_50                float64
avg_num_75                float64
avg_num_985               float64
avg_num_100               float64
avg_num_unq               float64
avg_total_secs            float64
number_of_days_used       float64
price_discount              int64
is_discount                 int64
cut_off_date               object
num_days_as_member        float64
dtype: object

In [5]:
# Percentage of missing values per column (mean of the NaN mask, scaled to 0-100)
model_data_df.isna().mean() * 100

msno                       0.000000
is_churn                   0.000000
city                      11.659420
bd                        11.659420
gender                    60.551942
registered_via            11.659420
registration_init_time    11.659420
payment_method_id          0.000000
payment_plan_days          0.000000
plan_list_price            0.000000
actual_amount_paid         0.000000
is_auto_renew              0.000000
transaction_date           0.000000
membership_expire_date     0.000000
is_cancel                  0.000000
avg_num_25                22.737028
avg_num_50                22.737028
avg_num_75                22.737028
avg_num_985               22.737028
avg_num_100               22.737028
avg_num_unq               22.737028
avg_total_secs            22.737028
number_of_days_used       22.737028
price_discount             0.000000
is_discount                0.000000
cut_off_date               0.000000
num_days_as_member        11.659420
dtype: float64

## Data cleaning

### bd feature

In [14]:
# Replace every negative bd entry by its magnitude; NaN entries stay NaN
model_data_df.loc[:, 'bd'] = model_data_df['bd'].abs()

In [20]:
# Summary statistics of bd, used to spot implausible values (check min/max and quartiles)
model_data_df['bd'].describe()

count    877161.000000
mean         13.475045
std          20.212833
min           0.000000
25%           0.000000
50%           0.000000
75%          27.000000
max        3152.000000
Name: bd, dtype: float64

In [25]:
# Cap bd at 75: every larger value (describe() above reports a max of 3152)
# is set to 75. NaN entries are left untouched.
# NOTE(review): the median of bd is 0.0, so a large share of rows carry 0 -
# presumably a placeholder for "unknown"; confirm whether it should be treated as missing.
model_data_df.loc[:, 'bd'] = model_data_df['bd'].clip(upper=75)

## Train-test split

In [27]:
# Hold out 20% of the rows for validation.
# - stratify on the target so both partitions keep the same churn ratio
#   (is_churn has no missing values, so it is safe to stratify on);
# - fixed random_state so the split is reproducible across re-runs.
train_df, val_df = train_test_split(
    model_data_df,
    test_size=0.2,
    random_state=10,
    stratify=model_data_df['is_churn'],
)

In [28]:
# Column dtypes of the full dataset, shown again before the imputation column lists are defined
model_data_df.dtypes

msno                       object
is_churn                    int64
city                      float64
bd                        float64
gender                     object
registered_via            float64
registration_init_time     object
payment_method_id           int64
payment_plan_days           int64
plan_list_price             int64
actual_amount_paid          int64
is_auto_renew               int64
transaction_date           object
membership_expire_date     object
is_cancel                   int64
avg_num_25                float64
avg_num_50                float64
avg_num_75                float64
avg_num_985               float64
avg_num_100               float64
avg_num_unq               float64
avg_total_secs            float64
number_of_days_used       float64
price_discount              int64
is_discount                 int64
cut_off_date               object
num_days_as_member        float64
dtype: object

## Data imputation

In [31]:
## Column groups, one per imputation strategy
# Numeric features: missing values are replaced by the column median
numeric_columns = [
    'plan_list_price', 'actual_amount_paid', 'bd',
    'avg_num_25', 'avg_num_50', 'avg_num_75', 'avg_num_985', 'avg_num_100',
    'avg_num_unq', 'avg_total_secs', 'number_of_days_used',
    'price_discount', 'num_days_as_member',
]
# Code-valued / flag features: missing values are replaced by the most frequent value
categorical_columns = [
    'payment_method_id', 'payment_plan_days', 'is_auto_renew', 'is_cancel',
    'city', 'registered_via', 'is_discount',
]
# gender: missing values become their own 'not_specified' category
constant_impute_column = ['gender']

## One imputer per strategy
numeric_feature_imputer = SimpleImputer(strategy='median')
categorical_feature_imputer = SimpleImputer(strategy='most_frequent')
constant_imputer = SimpleImputer(strategy='constant', fill_value='not_specified')

## gender is a string column, so after imputation it is converted to numeric
## codes. The step is named 'label_encoder' but it is an OrdinalEncoder.
constant_imputer_pipeline = Pipeline(steps=[
    ('imputer', constant_imputer),
    ('label_encoder', OrdinalEncoder()),
])

## Route each column group to its transformer. The output columns follow the
## order of the transformers below; columns not listed in any group are not
## part of the output (ColumnTransformer's default remainder is 'drop').
column_transformer = ColumnTransformer(transformers=[
    ('numeric_imputer', numeric_feature_imputer, numeric_columns),
    ('categorical_imputer', categorical_feature_imputer, categorical_columns),
    ('constant_imputer', constant_imputer_pipeline, constant_impute_column),
])

## Final pipeline: currently a single imputation step (no scaling is applied)
pipeline = Pipeline(steps=[('imputation_step', column_transformer)])

In [35]:
# Fit data imputation pipeline on the training split only, so the learned
# medians / most-frequent values / gender categories come from train_df alone.
pipeline.fit(train_df)

Pipeline(steps=[('imputation_step',
                 ColumnTransformer(transformers=[('numeric_imputer',
                                                  SimpleImputer(strategy='median'),
                                                  ['plan_list_price',
                                                   'actual_amount_paid', 'bd',
                                                   'avg_num_25', 'avg_num_50',
                                                   'avg_num_75', 'avg_num_985',
                                                   'avg_num_100', 'avg_num_unq',
                                                   'avg_total_secs',
                                                   'number_of_days_used',
                                                   'price_discount',
                                                   'num_days_as_member']),
                                                 ('categorical_imputer',
                                                  SimpleImpute

In [40]:
# Features: run both splits through the already-fitted imputation pipeline
x_train = pipeline.transform(train_df)
x_val = pipeline.transform(val_df)

# Targets: the churn flag of each split as a NumPy array
y_train = train_df['is_churn'].values
y_val = val_df['is_churn'].values

In [44]:
# Persist the model-ready arrays next to the processed dataset
# Training split
np.save(data_folder / 'x_train.npy', x_train)
np.save(data_folder / 'y_train.npy', y_train)

# Validation split
np.save(data_folder / 'x_val.npy', x_val)
np.save(data_folder / 'y_val.npy', y_val)