In [67]:
# import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
pd.options.display.float_format = '{:,.4f}'.format
N_JOBS = 10

In [2]:
# Functions
def same_columns(dfin):
    same_columns = {}
    for c in dfin.columns:
        if dfin[c].value_counts().shape[0] == 1:
            same_columns[c] = list(dfin[c].unique())
    return same_columns

def find_columns_with_nans(dfin):
    _dfnans = pd.DataFrame()
    _dfnans['counts'] = dfin[dfin.columns[dfin.isnull().any()]].isnull().sum()
    _dfnans['perc'] = 100 * _dfnans['counts'] / dfin.shape[0]
    _dfnans = _dfnans.reset_index()
    _dfnans['uniq_counts'] = _dfnans['index'].apply(lambda x: len(dfin[x].unique()))
    _dfnans['dtype'] = _dfnans['index'].apply(lambda x: dfin[x].dtype)
    return _dfnans

def drop_columns_with_nans(dfin, perc=40, drop_cols=True):
    too_many_nans = dfin[dfin.columns[dfin.isnull().sum() * 100 / dfin.shape[0] > perc]].columns.tolist()
    print(f"Cols with {perc}% or more nans: {len(too_many_nans)}")
    if drop_cols:
        dfin = dfin.drop(too_many_nans, axis=1)
        print(f"After dropping: {dfin.shape}")
    return dfin, too_many_nans

In [3]:
df = pd.read_csv("./train.csv", low_memory=False)
df.shape

(69999, 172)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69999 entries, 0 to 69998
Columns: 172 entries, id to churn_probability
dtypes: float64(135), int64(28), object(9)
memory usage: 91.9+ MB


In [5]:
for c in df.columns:
    print(f'{c.ljust(25)}: {str(df[c].dtype).rjust(10)}  {len(df[c].unique())}')
    if len(df[c].unique()) < 7:
        print(f' - {df[c].unique()}')

id                       :      int64  69999
circle_id                :      int64  1
 - [109]
loc_og_t2o_mou           :    float64  2
 - [ 0. nan]
std_og_t2o_mou           :    float64  2
 - [ 0. nan]
loc_ic_t2o_mou           :    float64  2
 - [ 0. nan]
last_date_of_month_6     :     object  1
 - ['6/30/2014']
last_date_of_month_7     :     object  2
 - ['7/31/2014' nan]
last_date_of_month_8     :     object  2
 - ['8/31/2014' nan]
arpu_6                   :    float64  61615
arpu_7                   :    float64  61425
arpu_8                   :    float64  60194
onnet_mou_6              :    float64  20058
onnet_mou_7              :    float64  20077
onnet_mou_8              :    float64  19726
offnet_mou_6             :    float64  26255
offnet_mou_7             :    float64  26140
offnet_mou_8             :    float64  25996
roam_ic_mou_6            :    float64  5307
roam_ic_mou_7            :    float64  4241
roam_ic_mou_8            :    float64  4253
roam_og_mou_6           

In [6]:
col_names_only = []
for c in df.columns:
    stripped = c
    if c.endswith('_6') or c.endswith('_7') or c.endswith('_8'):
        stripped = c[:-2]
    if stripped not in col_names_only:
        col_names_only.append(stripped)
        print(stripped, len(col_names_only))

id 1
circle_id 2
loc_og_t2o_mou 3
std_og_t2o_mou 4
loc_ic_t2o_mou 5
last_date_of_month 6
arpu 7
onnet_mou 8
offnet_mou 9
roam_ic_mou 10
roam_og_mou 11
loc_og_t2t_mou 12
loc_og_t2m_mou 13
loc_og_t2f_mou 14
loc_og_t2c_mou 15
loc_og_mou 16
std_og_t2t_mou 17
std_og_t2m_mou 18
std_og_t2f_mou 19
std_og_t2c_mou 20
std_og_mou 21
isd_og_mou 22
spl_og_mou 23
og_others 24
total_og_mou 25
loc_ic_t2t_mou 26
loc_ic_t2m_mou 27
loc_ic_t2f_mou 28
loc_ic_mou 29
std_ic_t2t_mou 30
std_ic_t2m_mou 31
std_ic_t2f_mou 32
std_ic_t2o_mou 33
std_ic_mou 34
total_ic_mou 35
spl_ic_mou 36
isd_ic_mou 37
ic_others 38
total_rech_num 39
total_rech_amt 40
max_rech_amt 41
date_of_last_rech 42
last_day_rch_amt 43
date_of_last_rech_data 44
total_rech_data 45
max_rech_data 46
count_rech_2g 47
count_rech_3g 48
av_rech_amt_data 49
vol_2g_mb 50
vol_3g_mb 51
arpu_3g 52
arpu_2g 53
night_pck_user 54
monthly_2g 55
sachet_2g 56
monthly_3g 57
sachet_3g 58
fb_user 59
aon 60
aug_vbc_3g 61
jul_vbc_3g 62
jun_vbc_3g 63
churn_probability 64

In [7]:
df.head()

Unnamed: 0,id,circle_id,loc_og_t2o_mou,std_og_t2o_mou,loc_ic_t2o_mou,last_date_of_month_6,last_date_of_month_7,last_date_of_month_8,arpu_6,arpu_7,...,sachet_3g_7,sachet_3g_8,fb_user_6,fb_user_7,fb_user_8,aon,aug_vbc_3g,jul_vbc_3g,jun_vbc_3g,churn_probability
0,0,109,0.0,0.0,0.0,6/30/2014,7/31/2014,8/31/2014,31.277,87.009,...,0,0,,,,1958,0.0,0.0,0.0,0
1,1,109,0.0,0.0,0.0,6/30/2014,7/31/2014,8/31/2014,0.0,122.787,...,0,0,,1.0,,710,0.0,0.0,0.0,0
2,2,109,0.0,0.0,0.0,6/30/2014,7/31/2014,8/31/2014,60.806,103.176,...,0,0,,,,882,0.0,0.0,0.0,0
3,3,109,0.0,0.0,0.0,6/30/2014,7/31/2014,8/31/2014,156.362,205.26,...,0,0,,,,982,0.0,0.0,0.0,0
4,4,109,0.0,0.0,0.0,6/30/2014,7/31/2014,8/31/2014,240.708,128.191,...,1,0,1.0,1.0,1.0,647,0.0,0.0,0.0,0


## Find column types

In [8]:
df.shape

(69999, 172)

In [9]:
dropped_columns = []
id_columns = ['id', 'circle_id']
df = df.drop(id_columns, axis=1)
df.shape

(69999, 170)

In [10]:
all_columns = list(df.columns)
date_columns = []
def get_cols_by_name(name):
    arr = []
    for c in all_columns:
        if name in c:
            arr.append(c)
    return arr
datecols = get_cols_by_name("date")
print(len(datecols))

9


In [11]:
dfnans = find_columns_with_nans(df)
dfnans.shape

(125, 5)

In [12]:
recharge_columns = [ x for x in list(dfnans['index']) if "rech" in x ]
print(recharge_columns)
recharge_columns_no_date = [x for x in recharge_columns if not "date" in x]
print(recharge_columns_no_date)
only_dates = [x for x in recharge_columns if "date" in x]
print(only_dates)

['date_of_last_rech_6', 'date_of_last_rech_7', 'date_of_last_rech_8', 'date_of_last_rech_data_6', 'date_of_last_rech_data_7', 'date_of_last_rech_data_8', 'total_rech_data_6', 'total_rech_data_7', 'total_rech_data_8', 'max_rech_data_6', 'max_rech_data_7', 'max_rech_data_8', 'count_rech_2g_6', 'count_rech_2g_7', 'count_rech_2g_8', 'count_rech_3g_6', 'count_rech_3g_7', 'count_rech_3g_8', 'av_rech_amt_data_6', 'av_rech_amt_data_7', 'av_rech_amt_data_8']
['total_rech_data_6', 'total_rech_data_7', 'total_rech_data_8', 'max_rech_data_6', 'max_rech_data_7', 'max_rech_data_8', 'count_rech_2g_6', 'count_rech_2g_7', 'count_rech_2g_8', 'count_rech_3g_6', 'count_rech_3g_7', 'count_rech_3g_8', 'av_rech_amt_data_6', 'av_rech_amt_data_7', 'av_rech_amt_data_8']
['date_of_last_rech_6', 'date_of_last_rech_7', 'date_of_last_rech_8', 'date_of_last_rech_data_6', 'date_of_last_rech_data_7', 'date_of_last_rech_data_8']


In [13]:
len(recharge_columns_no_date)

15

### We assume missing values in recharge columns are same as 0

In [14]:
df[recharge_columns_no_date] = df[recharge_columns_no_date].apply(lambda x: x.fillna(0))

In [15]:
dfn2 = find_columns_with_nans(df)
dfn2.shape

(110, 5)

In [16]:
df = df.drop(only_dates, axis=1)
df.shape

(69999, 164)

In [17]:
last_date_cols = ['last_date_of_month_6', 'last_date_of_month_7', 'last_date_of_month_8']
df = df.drop(last_date_cols, axis=1)
df.shape

(69999, 161)

In [18]:
dfn3 = find_columns_with_nans(df)
for c in dfn3['index']:
    if len(df[c].unique()) < 10:
        print(c, df[c].unique(), list(df[c].value_counts()))
dfn3.shape

loc_og_t2o_mou [ 0. nan] [69297]
std_og_t2o_mou [ 0. nan] [69297]
loc_ic_t2o_mou [ 0. nan] [69297]
std_og_t2c_mou_6 [ 0. nan] [67231]
std_og_t2c_mou_7 [ 0. nan] [67312]
std_og_t2c_mou_8 [ 0. nan] [66296]
std_ic_t2o_mou_6 [ 0. nan] [67231]
std_ic_t2o_mou_7 [ 0. nan] [67312]
std_ic_t2o_mou_8 [ 0. nan] [66296]
night_pck_user_6 [nan  0.  1.] [17124, 444]
night_pck_user_7 [nan  0.  1.] [17435, 430]
night_pck_user_8 [nan  0.  1.] [18030, 387]
fb_user_6 [nan  1.  0.] [16098, 1470]
fb_user_7 [nan  1.  0.] [16249, 1616]
fb_user_8 [nan  1.  0.] [16397, 2020]


(102, 5)

### for night_* and fb_user_* impute with -1

In [19]:
fb_night_cols = ['night_pck_user_6', 'night_pck_user_7', 'night_pck_user_8', 'fb_user_6', 'fb_user_7', 'fb_user_8']
df[fb_night_cols] = df[fb_night_cols].apply(lambda x: x.fillna(-1))

In [20]:
dfn4 = find_columns_with_nans(df)
dfn4.shape

(96, 5)

### Drop columns with more than 40% nans

In [21]:
df.shape

(69999, 161)

In [22]:
df, cols_with_nans = drop_columns_with_nans(df)
df.shape

Cols with 40% or more nans: 6
After dropping: (69999, 155)


(69999, 155)

In [23]:
dfn5 = find_columns_with_nans(df)
impute_values = {}
for i,col in enumerate(list(dfn5['index'])):
    if df[col].dtype == np.float64 or df[col].dtype == np.int64:
        print(f'{col}, float, {dfn5["perc"][i]:.3f} {df[col].mean():.2f}, {df[col].median():.2f}, {df[col].std():.2f}')
        df[col] = df[col].fillna(df[col].median())
        impute_values[col] = df[col].median()
    else:
        df[col] = df[col].fillna(df[col].mode(dropna=True)[0])
dfn5.shape

loc_og_t2o_mou, float, 1.003 0.00, 0.00, 0.00
std_og_t2o_mou, float, 1.003 0.00, 0.00, 0.00
loc_ic_t2o_mou, float, 1.003 0.00, 0.00, 0.00
onnet_mou_6, float, 3.954 133.15, 34.11, 299.96
onnet_mou_7, float, 3.839 133.89, 32.28, 311.28
onnet_mou_8, float, 5.290 132.98, 32.10, 311.90
offnet_mou_6, float, 3.954 198.87, 96.48, 316.82
offnet_mou_7, float, 3.839 197.15, 91.88, 322.48
offnet_mou_8, float, 5.290 196.54, 91.80, 324.09
roam_ic_mou_6, float, 3.954 9.77, 0.00, 57.37
roam_ic_mou_7, float, 3.839 7.01, 0.00, 55.96
roam_ic_mou_8, float, 5.290 7.00, 0.00, 53.41
roam_og_mou_6, float, 3.954 14.19, 0.00, 73.47
roam_og_mou_7, float, 3.839 9.84, 0.00, 58.51
roam_og_mou_8, float, 5.290 9.77, 0.00, 64.62
loc_og_t2t_mou_6, float, 3.954 46.90, 11.91, 150.97
loc_og_t2t_mou_7, float, 3.839 46.17, 11.58, 154.74
loc_og_t2t_mou_8, float, 5.290 45.69, 11.74, 153.72
loc_og_t2m_mou_6, float, 3.954 93.24, 41.03, 162.05
loc_og_t2m_mou_7, float, 3.839 90.80, 40.17, 153.85
loc_og_t2m_mou_8, float, 5.290 91.

(90, 5)

In [24]:
# dfX = df.drop("churn_probability", axis = 1)
# dfc = df.copy()
# from sklearn.impute import KNNImputer, SimpleImputer
# # imputer = KNNImputer(n_neighbors=3)
# imputer = SimpleImputer(strategy="mean")
# imputer.fit(df)
# dft=imputer.transform(df)
# df["churn_probability"].equals(dfc["churn_probability"])

In [25]:
find_columns_with_nans(df)

Unnamed: 0,index,counts,perc,uniq_counts,dtype


In [26]:
df.shape

(69999, 155)

In [27]:
sc = same_columns(df)
sc

{'loc_og_t2o_mou': [0.0],
 'std_og_t2o_mou': [0.0],
 'loc_ic_t2o_mou': [0.0],
 'std_og_t2c_mou_6': [0.0],
 'std_og_t2c_mou_7': [0.0],
 'std_og_t2c_mou_8': [0.0],
 'std_ic_t2o_mou_6': [0.0],
 'std_ic_t2o_mou_7': [0.0],
 'std_ic_t2o_mou_8': [0.0]}

In [28]:
same_cols = sc.keys()
df = df.drop(same_cols,axis=1)
df.shape

(69999, 146)

In [29]:
df['churn_probability'].value_counts() / df.shape[0]

0   0.8981
1   0.1019
Name: churn_probability, dtype: float64

## Process test data

In [30]:
dfnoy = df.drop("churn_probability", axis = 1)
train_cols = list(dfnoy.columns)

In [31]:
dft = pd.read_csv("./test.csv", low_memory=False)
dft.shape

(30000, 171)

In [32]:
dftid = pd.DataFrame()
dftid['id'] = dft['id']

In [33]:
dft = dft.filter(train_cols,axis=1)
dft.shape

(30000, 145)

In [34]:
find_columns_with_nans(dft)

Unnamed: 0,index,counts,perc,uniq_counts,dtype
0,onnet_mou_6,1169,3.8967,12284,float64
1,onnet_mou_7,1172,3.9067,12177,float64
2,onnet_mou_8,1675,5.5833,12055,float64
3,offnet_mou_6,1169,3.8967,16383,float64
4,offnet_mou_7,1172,3.9067,16152,float64
...,...,...,...,...,...
97,night_pck_user_7,22294,74.3133,3,float64
98,night_pck_user_8,22078,73.5933,3,float64
99,fb_user_6,22415,74.7167,3,float64
100,fb_user_7,22294,74.3133,3,float64


In [35]:
dft[recharge_columns_no_date] = dft[recharge_columns_no_date].apply(lambda x: x.fillna(0))

In [36]:
dft[fb_night_cols] = dft[fb_night_cols].apply(lambda x: x.fillna(-1))

In [37]:
for col, v in impute_values.items():
    if col in dft.columns:
        dft[col] = dft[col].fillna(v)

In [38]:
find_columns_with_nans(dft)

Unnamed: 0,index,counts,perc,uniq_counts,dtype


### Derive some features as discussed in session

In [39]:
for_derived_feats = ["arpu", "onnet_mou", "offnet_mou", "roam_ic_mou", "roam_og_mou", "loc_og_mou", "std_og_mou", 
 "isd_og_mou", "spl_og_mou", "total_og_mou", "loc_ic_mou", "std_ic_mou", "isd_ic_mou", "spl_ic_mou", 
 "total_ic_mou", "total_rech_num", "total_rech_amt", "max_rech_amt", "total_rech_data", 
 "max_rech_data", "av_rech_amt_data", "vol_2g_mb", "vol_3g_mb"]

print(df.shape)
for c in for_derived_feats:
    c_diff, c6, c7, c8 = c + "_diff", c + "_6", c + "_7", c + "_8"
    print(c_diff, c6, c7, c8)
    df[c_diff] = df[c8] - (df[c6] + df[c7])/2
print(df.shape)

(69999, 146)
arpu_diff arpu_6 arpu_7 arpu_8
onnet_mou_diff onnet_mou_6 onnet_mou_7 onnet_mou_8
offnet_mou_diff offnet_mou_6 offnet_mou_7 offnet_mou_8
roam_ic_mou_diff roam_ic_mou_6 roam_ic_mou_7 roam_ic_mou_8
roam_og_mou_diff roam_og_mou_6 roam_og_mou_7 roam_og_mou_8
loc_og_mou_diff loc_og_mou_6 loc_og_mou_7 loc_og_mou_8
std_og_mou_diff std_og_mou_6 std_og_mou_7 std_og_mou_8
isd_og_mou_diff isd_og_mou_6 isd_og_mou_7 isd_og_mou_8
spl_og_mou_diff spl_og_mou_6 spl_og_mou_7 spl_og_mou_8
total_og_mou_diff total_og_mou_6 total_og_mou_7 total_og_mou_8
loc_ic_mou_diff loc_ic_mou_6 loc_ic_mou_7 loc_ic_mou_8
std_ic_mou_diff std_ic_mou_6 std_ic_mou_7 std_ic_mou_8
isd_ic_mou_diff isd_ic_mou_6 isd_ic_mou_7 isd_ic_mou_8
spl_ic_mou_diff spl_ic_mou_6 spl_ic_mou_7 spl_ic_mou_8
total_ic_mou_diff total_ic_mou_6 total_ic_mou_7 total_ic_mou_8
total_rech_num_diff total_rech_num_6 total_rech_num_7 total_rech_num_8
total_rech_amt_diff total_rech_amt_6 total_rech_amt_7 total_rech_amt_8
max_rech_amt_diff max_re

In [40]:
print(dft.shape)
for c in for_derived_feats:
    c_diff, c6, c7, c8 = c + "_diff", c + "_6", c + "_7", c + "_8"
    print(c_diff, c6, c7, c8)
    dft[c_diff] = dft[c8] - (dft[c6] + dft[c7])/2
print(dft.shape)

(30000, 145)
arpu_diff arpu_6 arpu_7 arpu_8
onnet_mou_diff onnet_mou_6 onnet_mou_7 onnet_mou_8
offnet_mou_diff offnet_mou_6 offnet_mou_7 offnet_mou_8
roam_ic_mou_diff roam_ic_mou_6 roam_ic_mou_7 roam_ic_mou_8
roam_og_mou_diff roam_og_mou_6 roam_og_mou_7 roam_og_mou_8
loc_og_mou_diff loc_og_mou_6 loc_og_mou_7 loc_og_mou_8
std_og_mou_diff std_og_mou_6 std_og_mou_7 std_og_mou_8
isd_og_mou_diff isd_og_mou_6 isd_og_mou_7 isd_og_mou_8
spl_og_mou_diff spl_og_mou_6 spl_og_mou_7 spl_og_mou_8
total_og_mou_diff total_og_mou_6 total_og_mou_7 total_og_mou_8
loc_ic_mou_diff loc_ic_mou_6 loc_ic_mou_7 loc_ic_mou_8
std_ic_mou_diff std_ic_mou_6 std_ic_mou_7 std_ic_mou_8
isd_ic_mou_diff isd_ic_mou_6 isd_ic_mou_7 isd_ic_mou_8
spl_ic_mou_diff spl_ic_mou_6 spl_ic_mou_7 spl_ic_mou_8
total_ic_mou_diff total_ic_mou_6 total_ic_mou_7 total_ic_mou_8
total_rech_num_diff total_rech_num_6 total_rech_num_7 total_rech_num_8
total_rech_amt_diff total_rech_amt_6 total_rech_amt_7 total_rech_amt_8
max_rech_amt_diff max_re

In [41]:
def remove_outliers(arr, k=3):
    upper_limit = arr.mean() + k*arr.std()
    lower_limit = arr.mean() - k*arr.std()
    arr[arr<lower_limit] = lower_limit
    arr[arr>upper_limit] = upper_limit
    return arr

In [42]:
for col in df.columns:
    if df[col].dtype == np.float64:
        df[col] = remove_outliers(df[col])
df.shape

(69999, 169)

### High Value customers

In [43]:
# Data recharge amt
df['total_rech_amt_data_6'] = df['total_rech_data_6'] * df['av_rech_amt_data_6']
df['total_rech_amt_data_7'] = df['total_rech_data_7'] * df['av_rech_amt_data_7']
df['total_rech_amt_d_6'] = df['total_rech_amt_6'] + df['total_rech_amt_data_6']
df['total_rech_amt_d_7'] = df['total_rech_amt_7'] + df['total_rech_amt_data_7']
df['av_rech_amt_data_6_7'] = (df['total_rech_amt_d_6'] + df['total_rech_amt_d_7'])/2

In [44]:
df['av_rech_amt_data_6_7'].quantile(0.7)

477.0

In [45]:
dfh = df.loc[df['av_rech_amt_data_6_7'] >= df['av_rech_amt_data_6_7'].quantile(0.7), :]
dfh.shape

(21011, 174)

In [46]:
dfh['churn_probability'].value_counts() / dfh.shape[0]

0   0.9175
1   0.0825
Name: churn_probability, dtype: float64

In [47]:
dfh = dfh.drop(['total_rech_amt_data_6', 'total_rech_amt_data_7', 
                'total_rech_amt_d_6', 'total_rech_amt_d_7', 'av_rech_amt_data_6_7'], axis=1)
dfh.shape

(21011, 169)

In [48]:
df = df.drop(['total_rech_amt_data_6', 'total_rech_amt_data_7', 
                'total_rech_amt_d_6', 'total_rech_amt_d_7', 'av_rech_amt_data_6_7'], axis=1)
df.shape

(69999, 169)

In [49]:
dfh1 = pd.get_dummies(dfh, columns=fb_night_cols, prefix=fb_night_cols, drop_first=True)
dfh1.shape

(21011, 175)

## EDA

In [50]:
# Perform EDA on dataframe df

## Models

In [51]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [52]:
# divide data into train and test
y = df['churn_probability']
X = df.drop("churn_probability", axis = 1)

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 23, stratify = y)

In [54]:
from imblearn.over_sampling import SMOTE
oversample = SMOTE()
# X_train, y_train = oversample.fit_resample(X_train, y_train)

In [55]:
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(62999, 168) (62999,) (7000, 168) (7000,)


In [56]:
# scaler = RobustScaler()
scaler = RobustScaler()
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)
X_test_sc = scaler.transform(X_test)
pca = PCA()
pca.fit(X_train_sc)
i = 0
for ev in pca.explained_variance_ratio_.cumsum():
    print(f' {i+1}:  {ev*100.:.4f}')
    i+=1


 1:  55.8511
 2:  73.8783
 3:  80.5539
 4:  86.1554
 5:  90.2485
 6:  93.9379
 7:  95.1188
 8:  96.1297
 9:  96.9149
 10:  97.5583
 11:  98.1516
 12:  98.6631
 13:  98.9877
 14:  99.2314
 15:  99.3504
 16:  99.4660
 17:  99.5743
 18:  99.6641
 19:  99.7226
 20:  99.7580
 21:  99.7835
 22:  99.8082
 23:  99.8273
 24:  99.8450
 25:  99.8620
 26:  99.8742
 27:  99.8844
 28:  99.8935
 29:  99.9020
 30:  99.9100
 31:  99.9167
 32:  99.9223
 33:  99.9272
 34:  99.9320
 35:  99.9363
 36:  99.9403
 37:  99.9436
 38:  99.9469
 39:  99.9498
 40:  99.9528
 41:  99.9556
 42:  99.9583
 43:  99.9610
 44:  99.9637
 45:  99.9662
 46:  99.9686
 47:  99.9709
 48:  99.9731
 49:  99.9751
 50:  99.9766
 51:  99.9782
 52:  99.9794
 53:  99.9806
 54:  99.9818
 55:  99.9829
 56:  99.9839
 57:  99.9850
 58:  99.9860
 59:  99.9869
 60:  99.9877
 61:  99.9886
 62:  99.9894
 63:  99.9902
 64:  99.9909
 65:  99.9915
 66:  99.9921
 67:  99.9926
 68:  99.9931
 69:  99.9935
 70:  99.9939
 71:  99.9943
 72:  99.9947
 

In [57]:
finpca = PCA(100, random_state=23)
finpca.fit(X_train_sc)

PCA(n_components=100, random_state=23)

In [58]:
X_train_pca = finpca.transform(X_train_sc)
X_test_pca = finpca.transform(X_test_sc)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
print(X_train_pca.shape, y_train.shape, X_test_pca.shape, y_test.shape)

(62999, 168) (62999,) (7000, 168) (7000,)
(62999, 100) (62999,) (7000, 100) (7000,)


In [59]:
def show_results(mdl,xtrn, ytrn, xtest, ytest):
    print(f'Train Score: {mdl.score(xtrn, ytrn):.4f} Test: {mdl.score(xtest, ytest):.4f}', )
    _y_pred = mdl.predict(xtest)
    print(confusion_matrix(ytest, _y_pred))
    print(classification_report(ytest, _y_pred, target_names=['Churn', 'No Churn']))
    
def predict_test(dfin, mdl, scl=None, ipca=None, csvf="rfpreds.csv"):
    dfc = dfin.copy()
    if scl:
        print(f"Scaling")
        dfc = scl.transform(dfc)
    if ipca:
        print("PCA")
        dfc = ipca.transform(dfc)
    preds = mdl.predict(dfc)
    _dfr1 = pd.DataFrame()
    _dfr1['id'] = dftid['id']
    _dfr1['churn_probability'] = preds
    print(_dfr1['churn_probability'].value_counts() / _dfr1.shape[0])
    _dfr1.to_csv(csvf, index=False)
    return preds

In [61]:
from lightgbm import LGBMClassifier
lgbc = LGBMClassifier(n_estimators=1200,
                      boosting_type="goss", 
                      max_depth = -1,
                      learning_rate = 0.12,
                      min_child_samples=5,
                      class_weight = "balanced",
                      subsample = 0.7,
                      random_state = 23,
                      n_jobs = N_JOBS
                     )
lgbc.fit(X_train, y_train) 
show_results(lgbc, X_train, y_train, X_test, y_test)
plg = predict_test(dft, lgbc, csvf='lgpreds2.csv')

Train Score: 0.9998 Test: 0.9420
[[6098  189]
 [ 217  496]]
              precision    recall  f1-score   support

       Churn       0.97      0.97      0.97      6287
    No Churn       0.72      0.70      0.71       713

    accuracy                           0.94      7000
   macro avg       0.84      0.83      0.84      7000
weighted avg       0.94      0.94      0.94      7000

0   0.9021
1   0.0979
Name: churn_probability, dtype: float64


In [62]:
rfc = RandomForestClassifier(n_estimators=1000, 
                             max_depth=15, class_weight="balanced",
                             min_samples_leaf=5,
                             random_state=23, 
                             n_jobs=N_JOBS,
                             verbose=1)
rfc.fit(X_train_pca, y_train)
show_results(rfc, X_train_pca, y_train, X_test_pca, y_test)


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    1.6s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:    9.2s
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed:   21.7s
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed:   39.3s
[Parallel(n_jobs=10)]: Done 1000 out of 1000 | elapsed:   50.2s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:    0.1s
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed:    0.4s
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed:    0.7s
[Parallel(n_jobs=10)]: Done 1000 out of 1000 | elapsed:    0.9s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 180 tasks      | elaps

Train Score: 0.9601 Test: 0.9049


[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:    0.2s
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed:    0.6s
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed:    1.2s


[[5937  350]
 [ 316  397]]
              precision    recall  f1-score   support

       Churn       0.95      0.94      0.95      6287
    No Churn       0.53      0.56      0.54       713

    accuracy                           0.90      7000
   macro avg       0.74      0.75      0.75      7000
weighted avg       0.91      0.90      0.91      7000



[Parallel(n_jobs=10)]: Done 1000 out of 1000 | elapsed:    1.5s finished


In [63]:
rfca = RandomForestClassifier(n_estimators=1000, 
                             max_depth=21, class_weight="balanced_subsample",
                             min_samples_leaf=7,
                             random_state=23, 
                             n_jobs=N_JOBS,
                             verbose=1)
rfca.fit(X_train, y_train)
show_results(rfca, X_train, y_train, X_test, y_test)


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    1.3s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:    6.8s
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed:   15.9s
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed:   28.4s
[Parallel(n_jobs=10)]: Done 1000 out of 1000 | elapsed:   36.3s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:    0.3s
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed:    0.7s
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed:    1.3s
[Parallel(n_jobs=10)]: Done 1000 out of 1000 | elapsed:    1.7s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 180 tasks      | elaps

Train Score: 0.9697 Test: 0.9437


[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:    0.4s
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed:    1.0s
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed:    1.8s


[[6082  205]
 [ 189  524]]
              precision    recall  f1-score   support

       Churn       0.97      0.97      0.97      6287
    No Churn       0.72      0.73      0.73       713

    accuracy                           0.94      7000
   macro avg       0.84      0.85      0.85      7000
weighted avg       0.94      0.94      0.94      7000



[Parallel(n_jobs=10)]: Done 1000 out of 1000 | elapsed:    2.2s finished


In [None]:
p = predict_test(dft, rfca, csvf='rfpreds3.csv')

In [64]:
lc = LogisticRegression(penalty="elasticnet", 
                        class_weight="balanced",
                        solver="saga",
                        l1_ratio=0.5,
                        random_state=23, 
                        n_jobs=N_JOBS,
                        verbose=1
                       )
lc.fit(X_train, y_train)
show_results(lc, X_train, y_train, X_test, y_test)

[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


max_iter reached after 16 seconds
Train Score: 0.8604 Test: 0.8623
[[5451  836]
 [ 128  585]]
              precision    recall  f1-score   support

       Churn       0.98      0.87      0.92      6287
    No Churn       0.41      0.82      0.55       713

    accuracy                           0.86      7000
   macro avg       0.69      0.84      0.73      7000
weighted avg       0.92      0.86      0.88      7000



[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:   15.7s finished


In [68]:
etca = ExtraTreesClassifier(n_estimators=1150, 
                             max_depth=20, class_weight="balanced",
                             min_samples_leaf=4,
                             random_state=23, 
                             n_jobs=N_JOBS,
                             verbose=1)
etca.fit(X_train, y_train)
show_results(etca, X_train, y_train, X_test, y_test)


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.5s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:    3.0s
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed:    7.1s
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed:   13.1s
[Parallel(n_jobs=10)]: Done 1150 out of 1150 | elapsed:   19.2s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:    0.3s
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed:    0.7s
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed:    1.4s
[Parallel(n_jobs=10)]: Done 1150 out of 1150 | elapsed:    2.1s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 180 tasks      | elaps

Train Score: 0.9389 Test: 0.9164


[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:    0.4s
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed:    1.0s
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed:    1.8s


[[5858  429]
 [ 156  557]]
              precision    recall  f1-score   support

       Churn       0.97      0.93      0.95      6287
    No Churn       0.56      0.78      0.66       713

    accuracy                           0.92      7000
   macro avg       0.77      0.86      0.80      7000
weighted avg       0.93      0.92      0.92      7000



[Parallel(n_jobs=10)]: Done 1150 out of 1150 | elapsed:    2.4s finished


In [None]:
# gbc = GradientBoostingClassifier(n_estimators=500, learning_rate=0.05,
#                                  max_depth=5,min_samples_leaf=5,subsample=0.8,
#                                  random_state=23, verbose=1)
# gbc.fit(X_train, y_train)
# show_results(gbc, X_train, y_train, X_test, y_test)
# print(gbc.score(X_train, y_train), gbc.score(X_test, y_test))
# y_pred = gbc.predict(X_test)
# confusion_matrix(y_test, y_pred)