##  Janatahack: Customer Segmentation

### Solution 1:

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error,accuracy_score
import lightgbm as lgb
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import StratifiedKFold,KFold,GridSearchCV,GroupKFold,train_test_split,StratifiedShuffleSplit
from rfpimp import *
from tqdm import tqdm
from catboost import *
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder



In [None]:
df_train = pd.read_csv('Train_aBjfeNk.csv')
df_test = pd.read_csv('Test_LqhgPWU.csv')

In [None]:
df_train.sort_values(by='ID')

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
7238,458982,Male,Yes,61,Yes,Executive,1.0,High,3.0,Cat_6,C
5546,458983,Female,Yes,63,Yes,Executive,0.0,High,5.0,Cat_6,C
4373,458984,Male,Yes,39,Yes,Artist,0.0,Average,3.0,Cat_6,C
4695,458985,Male,No,23,No,Healthcare,1.0,Low,4.0,Cat_6,D
3333,458986,Male,No,18,No,Healthcare,7.0,Low,4.0,Cat_6,D
...,...,...,...,...,...,...,...,...,...,...,...
4791,467969,Female,Yes,43,Yes,Artist,0.0,Average,2.0,Cat_6,C
570,467971,Female,No,31,Yes,Artist,1.0,Low,4.0,Cat_6,D
5832,467972,Male,No,22,No,Artist,1.0,Low,3.0,Cat_6,D
4140,467973,Female,Yes,66,Yes,Engineer,0.0,Average,3.0,Cat_6,A


In [None]:
df_test[~df_test['ID'].isin(df_train['ID'].unique())]

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1
6,459005,Male,Yes,61,Yes,Doctor,5.0,Low,3.0,Cat_6
19,459045,Female,Yes,88,Yes,Lawyer,1.0,Average,4.0,Cat_6
32,459090,Male,No,31,No,Artist,1.0,Low,2.0,Cat_6
38,459116,Male,Yes,60,Yes,Artist,2.0,Average,5.0,Cat_6
43,459121,Female,Yes,51,Yes,Artist,3.0,Average,6.0,Cat_6
...,...,...,...,...,...,...,...,...,...,...
2607,467913,Male,Yes,48,Yes,Artist,3.0,Low,1.0,Cat_6
2612,467930,Male,Yes,45,Yes,Entertainment,4.0,Average,2.0,Cat_6
2616,467938,Male,Yes,46,Yes,Entertainment,5.0,Low,1.0,Cat_2
2624,467960,Female,No,53,Yes,Entertainment,,Low,2.0,Cat_6


## LabelEncoding

In [None]:
# Hand-written integer codes for the categorical columns; a later cell applies
# them with Series.map, so any value missing from a dict becomes NaN.
gender_map = {
    'Male':0,
    'Female':1
}
Ever_Married = {
    'No':0,
    'Yes':1
}
grad = {
    'No':0,
    'Yes':1
}
# Ordinal code for Spending_Score (Low < Average < High).
spend_score ={
    'Low':0,
    'Average':1,
    'High':2
}
# NOTE(review): same contents as spend_score; Var_1 is encoded with cat_var
# below and this dict is not referenced in the visible cells.
var1 ={
    'Low':0,
    'Average':1,
    'High':2
}
# Arbitrary integer id per profession.
proff ={'Artist': 0,
 'Healthcare': 1,
 'Entertainment': 2,
 'Engineer':3,
 'Doctor': 4,
 'Lawyer': 5,
 'Executive': 6,
 'Marketing': 7,
 'Homemaker': 8}
# 'Cat_k' -> k - 1.
cat_var = {'Cat_6': 5,
 'Cat_4': 3,
 'Cat_3': 2,
 'Cat_2': 1,
 'Cat_7': 6,
 'Cat_1': 0,
 'Cat_5': 4}

# Target classes as integers for the classifiers.
seg ={
    'A':0,
    'B':1,
    'C':2,
    'D':3
}
    
# Inverse of seg: maps predicted class ids back to segment letters.
rev_seg ={
    0:'A',
    1:'B',
    2:'C',
    3:'D'
}
# Ordinal code for the age-bracket labels produced by get_age_groups.
map_bins = {'35-45': 3,
 '25-35': 2,
 '45-55': 4,
 '65-100': 6,
 '0-25': 1,
 '55-65': 5}

In [None]:
def get_age_groups(x):
    """Return the age-bracket label that contains ``x``.

    Brackets are closed on the right: 25 maps to '0-25', 26 to '25-35'.
    Anything that is not ``<=`` one of the bounds ends up in '65-100'; that
    covers ages above 65 and also NaN, for which every comparison is false.
    """
    upper_bounds = (
        (25, '0-25'),
        (35, '25-35'),
        (45, '35-45'),
        (55, '45-55'),
        (65, '55-65'),
    )
    # Bounds are ascending, so the first one that admits x is its bracket.
    for limit, label in upper_bounds:
        if x <= limit:
            return label
    return '65-100'

In [None]:
full_df = pd.concat([df_train,df_test])

In [None]:
le = LabelEncoder()
full_df["Prof+Grad"] = full_df["Profession"].fillna('-9999')+"_"+full_df["Graduated"].astype(str)
full_df["Prof+Grad"] = le.fit_transform(full_df["Prof+Grad"])

In [None]:
full_df['Gender'] = full_df['Gender'].map(gender_map)
full_df['Ever_Married'] = full_df['Ever_Married'].map(Ever_Married)
full_df['Graduated'] = full_df['Graduated'].map(grad)
full_df['Profession']= full_df['Profession'].map(proff)
full_df['Spending_Score'] = full_df['Spending_Score'].map(spend_score)
full_df['Var_1'] = full_df['Var_1'].map(cat_var)
full_df['Segmentation'] = full_df['Segmentation'].map(seg)

# Features:

In [None]:
full_df["Var1_Family"] = full_df["Var_1"] + full_df["Prof+Grad"]

In [None]:
temp = full_df.groupby(['Age']).agg({'Spending_Score':['count','mean','sum'],
                                   'Work_Experience':['count','sum','min','max','mean'],
                                   'Profession':['min','max','count'],
                                           'Graduated':['count'],
                                   'Ever_Married':['count'],
                                    'Gender':['count'], 
                                       'Family_Size':['count','sum','min','max'],
                                       'Age':['count'],
                                    'Var_1':['count','max','min']})
temp.columns = ['_'.join(x) for x in temp.columns]
full_df = pd.merge(full_df,temp,on=['Age'],how='left')

In [None]:
temp = full_df.groupby(['Profession']).agg({
                                       'Age':['count','sum','min','max']})
temp.columns = ['_Prof_'.join(x) for x in temp.columns]
full_df = pd.merge(full_df,temp,on=['Profession'],how='left')

In [None]:
full_df['Age_Bins'] = full_df['Age'].apply(lambda x:get_age_groups(x))
full_df['Age_Bins'] = full_df['Age_Bins'].map(map_bins)

In [None]:
# Neighbour-target features: order the rows by ID, back-fill rows without a
# Segmentation from the next row that has one, then store the label found
# i rows further along in ID order as prev_target_i (the last i rows get NaN).
# The sort and back-fill do not depend on i, so they are done once up front;
# Series.bfill() replaces the deprecated fillna(method='bfill').
seg_by_id = full_df.sort_values(by='ID')['Segmentation'].bfill()
for i in tqdm(range(1,10)):
#     full_df[f'next_target_{i}'] = full_df.sort_values(by='ID')['Segmentation'].fillna(method='ffill').shift(i).sort_index()
    full_df[f'prev_target_{i}'] = seg_by_id.shift(-i).sort_index()
#     full_df[f'prev_age_{i}'] = full_df.sort_values(by='ID')
#     full_df[f'next_{i}_insects'] = full_df.sort_values(by='ID').groupby(['Age'])['Segmentation'].apply(lambda x: x.shift(i).ffill())
#     full_df[f'next_{i}_insects'] = full_df.sort_values(by='ID').groupby(['Age'])['Segmentation'].apply(lambda x: x.shift(-i).bfill())

100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 33.70it/s]


In [None]:
# feats_new = []
# for i in tqdm(range(1, 10)):
#     full_df[f'magic_{i}'] = full_df.sort_values(by='ID')['Segmentation'].shift(-i).expanding().mean().fillna(method='bfill').sort_index()
#     feats_new.append(f'magic_{i}')

In [None]:
# full_df = pd.get_dummies(full_df,columns=['Profession','Var_1'])

In [None]:
full_df

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,...,Age_Bins,prev_target_1,prev_target_2,prev_target_3,prev_target_4,prev_target_5,prev_target_6,prev_target_7,prev_target_8,prev_target_9
0,462809,0,0.0,22,0.0,1.0,1.0,0,4.0,3.0,...,1,2.0,1.0,1.0,2.0,0.0,2.0,0.0,0.0,3.0
1,462643,1,1.0,38,1.0,3.0,,1,3.0,3.0,...,3,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,1.0
2,466315,1,1.0,67,1.0,3.0,1.0,0,1.0,5.0,...,6,1.0,1.0,2.0,2.0,1.0,1.0,2.0,2.0,1.0
3,461735,0,1.0,67,1.0,5.0,0.0,2,2.0,5.0,...,6,2.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0,2.0
4,462669,1,1.0,40,1.0,2.0,,2,6.0,5.0,...,3,0.0,0.0,3.0,3.0,1.0,0.0,0.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10690,467954,0,0.0,29,0.0,1.0,9.0,0,4.0,5.0,...,2,3.0,3.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0
10691,467958,1,0.0,35,1.0,4.0,1.0,0,1.0,5.0,...,2,0.0,0.0,1.0,1.0,1.0,3.0,3.0,0.0,3.0
10692,467960,1,0.0,53,1.0,2.0,,0,2.0,5.0,...,4,1.0,1.0,3.0,3.0,0.0,3.0,0.0,2.0,2.0
10693,467961,0,1.0,47,1.0,6.0,1.0,2,5.0,3.0,...,4,1.0,3.0,3.0,0.0,3.0,0.0,2.0,2.0,3.0


In [None]:
# full_df['Gend_Marr'] = full_df['Gender'] + full_df['Ever_Married']
# full_df['Gend_Grad'] = full_df['Gender'] + full_df['Graduated']
# full_df['Ever_Grad'] = full_df['Ever_Married'] + full_df['Graduated']
# full_df['Grad_Spend'] = full_df['Graduated'] + full_df['Spending_Score']
# full_df['marr_Spend'] = full_df['Ever_Married'] + full_df['Spending_Score']
# full_df['Family_Size_Mul'] = full_df['Family_Size'] * full_df['Spending_Score']
# full_df['Family_Size_Add'] = full_df['Family_Size'] + full_df['Spending_Score']

In [None]:
df_train = full_df[full_df['Segmentation'].notnull()]
df_test = full_df[full_df['Segmentation'].isnull()]

In [None]:
def expanding_mean(df1,df2,df_test=None,target=None,cols=None):
    """Target-encode the column ``cols`` with the mean of ``target``.

    Parameters
    ----------
    df1 : DataFrame
        Frame that receives the *expanding* encoding: each row gets the mean
        of ``target`` over the earlier rows of ``df1`` with the same ``cols``
        value (the row's own target is excluded).
    df2 : DataFrame
        Frame that receives the plain per-group mean of ``target`` computed
        on ``df1``.
    df_test : unused
        Kept only so existing calls keep working.
    target : str
        Name of the target column in ``df1``.
    cols : str
        Name of the single column to encode; the new feature is called
        ``cols + "_mean_target"``.

    Returns
    -------
    (df_1, df_2, vals)
        Copies of ``df1`` and ``df2`` with the new feature added and ``cols``
        dropped, plus ``vals``: a one-column frame indexed by ``cols`` that
        holds the per-group means (for merging onto other frames).

    Missing values of the new feature (first row of each group in ``df_1``,
    unseen groups in ``df_2``) are replaced by that frame's mean of the
    feature. Only the new feature is filled: the previous implementation
    called ``DataFrame.fillna`` on the whole frame and therefore overwrote
    NaNs in every other column with the same number.
    """
    feature = cols + "_mean_target"
    df_1 = df1.copy()
    df_2 = df2.copy()

    # Sum and count of the *earlier* rows in the same group. For the first
    # row of a group this is 0/0, which pandas evaluates to NaN.
    prior_sum = df_1.groupby(cols)[target].cumsum() - df_1[target]
    prior_count = df_1.groupby(cols).cumcount()
    df_1[feature] = prior_sum / prior_count

    # Plain per-group mean, indexed by `cols`, for frames without a target.
    vals = df_1.groupby(cols)[target].mean().to_frame(feature)
    df_2 = pd.merge(df_2,vals,on=cols,how='left')

    df_1[feature] = df_1[feature].fillna(df_1[feature].mean())
    df_2[feature] = df_2[feature].fillna(df_2[feature].mean())

    df_1 = df_1.drop(columns=[cols])
    df_2 = df_2.drop(columns=[cols])
    return df_1,df_2,vals

In [None]:
df_train,_,vals = expanding_mean(df_train,df_train,target='Segmentation',cols='Age')
df_test = pd.merge(df_test,vals,on='Age',how='left')

In [None]:
df_train_final = df_train.drop(['ID'],axis=1).drop_duplicates()

#### Making x,y

In [None]:
X_train = df_train_final.drop(['Segmentation'],axis=1)
y_train = df_train_final['Segmentation']

In [None]:
X_train

Unnamed: 0,Gender,Ever_Married,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Prof+Grad,Var1_Family,...,prev_target_1,prev_target_2,prev_target_3,prev_target_4,prev_target_5,prev_target_6,prev_target_7,prev_target_8,prev_target_9,Age_mean_target
0,0,0.0,0.0,1.000000,1.000000,0,4.0,3.0,18,21.0,...,2.0,1.0,1.0,2.0,0.0,2.0,0.0,0.0,3.0,1.542096
1,1,1.0,1.0,3.000000,1.542096,1,3.0,3.0,10,13.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,1.0,1.542096
2,1,1.0,1.0,3.000000,1.000000,0,1.0,5.0,10,15.0,...,1.0,1.0,2.0,2.0,1.0,1.0,2.0,2.0,1.0,1.542096
3,0,1.0,1.0,5.000000,0.000000,2,2.0,5.0,25,30.0,...,2.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0,2.0,1.000000
4,1,1.0,1.0,2.000000,1.542096,2,6.0,5.0,13,18.0,...,0.0,0.0,3.0,3.0,1.0,0.0,0.0,1.0,2.0,1.542096
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8063,0,0.0,0.0,1.542096,0.000000,0,7.0,0.0,0,0.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.773050
8064,0,0.0,0.0,6.000000,3.000000,0,4.0,3.0,15,18.0,...,3.0,3.0,3.0,3.0,0.0,0.0,3.0,3.0,3.0,1.228916
8065,1,0.0,1.0,1.000000,1.000000,0,1.0,5.0,19,24.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,0.0,1.0,1.779221
8066,1,0.0,1.0,1.000000,1.000000,0,4.0,5.0,19,24.0,...,1.0,1.0,1.0,1.0,3.0,2.0,2.0,2.0,1.0,1.846491


In [None]:
df_test

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,...,prev_target_1,prev_target_2,prev_target_3,prev_target_4,prev_target_5,prev_target_6,prev_target_7,prev_target_8,prev_target_9,Age_mean_target
0,458989,1,1.0,36,1.0,3.0,0.0,0,1.0,5.0,...,3.0,0.0,3.0,0.0,2.0,2.0,1.0,0.0,0.0,1.320574
1,458994,0,1.0,37,1.0,1.0,8.0,1,4.0,5.0,...,2.0,1.0,0.0,0.0,1.0,3.0,2.0,2.0,2.0,1.235043
2,458996,1,1.0,69,0.0,-9999.0,0.0,0,1.0,5.0,...,0.0,1.0,3.0,2.0,2.0,2.0,2.0,2.0,3.0,1.238806
3,459000,0,1.0,59,0.0,6.0,11.0,2,2.0,5.0,...,2.0,2.0,2.0,3.0,2.0,2.0,2.0,3.0,3.0,1.329897
4,459001,1,0.0,19,0.0,7.0,-9999.0,0,4.0,5.0,...,2.0,3.0,2.0,2.0,2.0,3.0,3.0,2.0,1.0,2.768657
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2622,467954,0,0.0,29,0.0,1.0,9.0,0,4.0,5.0,...,3.0,3.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.726776
2623,467958,1,0.0,35,1.0,4.0,1.0,0,1.0,5.0,...,0.0,0.0,1.0,1.0,1.0,3.0,3.0,0.0,3.0,1.236000
2624,467960,1,0.0,53,1.0,2.0,-9999.0,0,2.0,5.0,...,1.0,1.0,3.0,3.0,0.0,3.0,0.0,2.0,2.0,1.297468
2625,467961,0,1.0,47,1.0,6.0,1.0,2,5.0,3.0,...,1.0,3.0,3.0,0.0,3.0,0.0,2.0,2.0,3.0,1.317365


In [None]:
X_train

Unnamed: 0,Gender,Ever_Married,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Prof+Grad,Var1_Family,...,prev_target_1,prev_target_2,prev_target_3,prev_target_4,prev_target_5,prev_target_6,prev_target_7,prev_target_8,prev_target_9,Age_mean_target
0,0,0.0,0.0,1.000000,1.000000,0,4.0,3.0,18,21.0,...,2.0,1.0,1.0,2.0,0.0,2.0,0.0,0.0,3.0,1.542096
1,1,1.0,1.0,3.000000,1.542096,1,3.0,3.0,10,13.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,1.0,1.542096
2,1,1.0,1.0,3.000000,1.000000,0,1.0,5.0,10,15.0,...,1.0,1.0,2.0,2.0,1.0,1.0,2.0,2.0,1.0,1.542096
3,0,1.0,1.0,5.000000,0.000000,2,2.0,5.0,25,30.0,...,2.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0,2.0,1.000000
4,1,1.0,1.0,2.000000,1.542096,2,6.0,5.0,13,18.0,...,0.0,0.0,3.0,3.0,1.0,0.0,0.0,1.0,2.0,1.542096
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8063,0,0.0,0.0,1.542096,0.000000,0,7.0,0.0,0,0.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.773050
8064,0,0.0,0.0,6.000000,3.000000,0,4.0,3.0,15,18.0,...,3.0,3.0,3.0,3.0,0.0,0.0,3.0,3.0,3.0,1.228916
8065,1,0.0,1.0,1.000000,1.000000,0,1.0,5.0,19,24.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,0.0,1.0,1.779221
8066,1,0.0,1.0,1.000000,1.000000,0,4.0,5.0,19,24.0,...,1.0,1.0,1.0,1.0,3.0,2.0,2.0,2.0,1.0,1.846491


In [None]:
df_test

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,...,prev_target_2,prev_target_3,prev_target_4,prev_target_5,prev_target_6,prev_target_7,prev_target_8,prev_target_9,Age_mean_target,Segmentation_pred
0,458989,1,1.0,36,1.0,3.0,0.0,0,1.0,5.0,...,0.0,3.0,0.0,2.0,2.0,1.0,0.0,0.0,1.320574,0
1,458994,0,1.0,37,1.0,1.0,8.0,1,4.0,5.0,...,1.0,0.0,0.0,1.0,3.0,2.0,2.0,2.0,1.235043,2
2,458996,1,1.0,69,0.0,-9999.0,0.0,0,1.0,5.0,...,1.0,3.0,2.0,2.0,2.0,2.0,2.0,3.0,1.238806,1
3,459000,0,1.0,59,0.0,6.0,11.0,2,2.0,5.0,...,2.0,2.0,3.0,2.0,2.0,2.0,3.0,3.0,1.329897,2
4,459001,1,0.0,19,0.0,7.0,-9999.0,0,4.0,5.0,...,3.0,2.0,2.0,2.0,3.0,3.0,2.0,1.0,2.768657,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2622,467954,0,0.0,29,0.0,1.0,9.0,0,4.0,5.0,...,3.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.726776,3
2623,467958,1,0.0,35,1.0,4.0,1.0,0,1.0,5.0,...,0.0,1.0,1.0,1.0,3.0,3.0,0.0,3.0,1.236000,0
2624,467960,1,0.0,53,1.0,2.0,-9999.0,0,2.0,5.0,...,1.0,3.0,3.0,0.0,3.0,0.0,2.0,2.0,1.297468,0
2625,467961,0,1.0,47,1.0,6.0,1.0,2,5.0,3.0,...,3.0,3.0,0.0,3.0,0.0,2.0,2.0,3.0,1.317365,1


In [None]:
X_train.columns

Index(['Gender', 'Ever_Married', 'Graduated', 'Profession', 'Work_Experience',
       'Spending_Score', 'Family_Size', 'Var_1', 'Prof+Grad', 'Var1_Family',
       'Spending_Score_count', 'Spending_Score_mean', 'Spending_Score_sum',
       'Work_Experience_count', 'Work_Experience_sum', 'Work_Experience_min',
       'Work_Experience_max', 'Work_Experience_mean', 'Profession_min',
       'Profession_max', 'Profession_count', 'Graduated_count',
       'Ever_Married_count', 'Gender_count', 'Family_Size_count',
       'Family_Size_sum', 'Family_Size_min', 'Family_Size_max', 'Age_count',
       'Var_1_count', 'Var_1_max', 'Var_1_min', 'Age_Prof_count',
       'Age_Prof_sum', 'Age_Prof_min', 'Age_Prof_max', 'Age_Bins',
       'prev_target_1', 'prev_target_2', 'prev_target_3', 'prev_target_4',
       'prev_target_5', 'prev_target_6', 'prev_target_7', 'prev_target_8',
       'prev_target_9', 'Age_mean_target'],
      dtype='object')

In [None]:
X_train.fillna(-9999,inplace=True)
df_test.fillna(-9999,inplace=True)

### Selected features using permutation importance

In [None]:
feats =['Spending_Score',
 'Spending_Score_mean',
 'Prof+Grad',
 'prev_target_1',
 'Family_Size',
 'Profession',
 'Age_Prof_sum',
 'Graduated',
 'prev_target_3',
 'Spending_Score_sum',
 'Var_1',
 'Work_Experience_mean',
 'Gender',
 'Work_Experience_sum',
 'Spending_Score_count',
 'Age_Bins',
 'Age_Prof_max',
 'prev_target_2',
 'prev_target_5',
 'Ever_Married',
 'prev_target_9',
 'Ever_Married_count']

## Cross Validation

In [None]:
# Stratified 10-fold cross-validation of a LightGBM classifier on the columns
# listed in `feats`; every fold's model also predicts the test rows.
splits = 10
folds =StratifiedKFold(n_splits=splits, random_state=22,shuffle=True)
# predictions_lgb = np.zeros((len(X_valid), 2))
# Despite the name, this accumulates class probabilities for the rows of
# df_test (4 classes), summed over the folds and averaged after the loop.
oof_preds = np.zeros((len(df_test), 4))
# One row per feature; each fold merges in its own importance column.
feature_importance_df = pd.DataFrame()
feature_importance_df['Feature'] = X_train.columns
# Per-fold validation accuracy (not predictions).
final_preds = []
# `random_state` and `counter` are not used anywhere in this cell.
random_state = [22,44,66,77,88,99,101]
counter = 0

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train.values,y_train)):
        print("Fold {}".format(fold_))
        X_trn,y_trn = X_train[feats].iloc[trn_idx],y_train.iloc[trn_idx]
        X_val,y_val = X_train[feats].iloc[val_idx],y_train.iloc[val_idx]
#         clf =KNeighborsClassifier(n_neighbors=4)
#         clf.fit(X_trn,y_trn)
        clf = lgb.LGBMClassifier(n_estimators=1000,max_depth=4,random_state=22)
#         clf = RandomForestClassifier()
#         clf = XGBClassifier(n_estimators=1000,max_depth=4,random_state=22)
        # NOTE(review): `verbose` and `early_stopping_rounds` as fit() keyword
        # arguments were removed in LightGBM 4.0 in favour of callbacks —
        # confirm the installed version before re-running.
        clf.fit(X_trn, y_trn,eval_set=[(X_trn, y_trn), (X_val, y_val)],verbose=0,
                eval_metric='multi_error',early_stopping_rounds=100)
        
        # `importances` comes from the star import of rfpimp (presumably its
        # permutation importance on the validation fold — confirm).
        imp = importances(clf,X_val,y_val)
        imp.rename(columns={
            'Importance':f'Importance_{fold_}'
        },inplace=True)
        # pd.merge defaults to an inner join, so after the first fold only the
        # features that were scored (those in `feats`) remain in the table.
        feature_importance_df = pd.merge(feature_importance_df,imp,on='Feature')
#         predictions += np.abs(clf.predict(X_valid)).reshape(-1,1)
        final_preds.append(accuracy_score(y_pred=clf.predict(X_val),y_true=y_val))
        
#         predictions_lgb += clf.predict_proba(X_valid[rf_k])
        oof_preds += clf.predict_proba(df_test[feats])
#         counter = counter + 1
# Average the test probabilities over the folds and report mean CV accuracy.
oof_preds = oof_preds/splits
print(sum(final_preds)/splits)

Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Fold 8
Fold 9
0.555032270363845


In [None]:
#KNN = 94.76
#LGB = 95.61
#XGB = 95.14
#RF = 95.33
#Ensemble ALL = 95.81

## Make predictions with each model and store them

In [None]:
oof_rf = oof_preds

In [None]:
oof_knn = oof_preds

In [None]:
oof_lgb = oof_preds

In [None]:
oof_xgb = oof_preds

## Checking importance

In [None]:
feature_importance_df['Final'] = 0

In [None]:
for i in feature_importance_df.columns[1:]:
    feature_importance_df['Final']+=feature_importance_df[i] 

In [None]:
imp = feature_importance_df.sort_values(by='Final',ascending=False)[['Feature','Final']]

In [None]:
imp[imp['Final'] > 0.01]

Unnamed: 0,Feature,Final
4,Spending_Score,1.207244
9,Spending_Score_mean,0.788304
3,Profession,0.723865
7,Prof+Grad,0.56021
5,Family_Size,0.416415
17,prev_target_1,0.404076
2,Graduated,0.275222
12,Work_Experience_mean,0.272642
6,Var_1,0.198317
19,prev_target_3,0.190898


In [None]:
feats = imp[imp['Final'] > 0.01]['Feature'].to_list()

## Ensemble

### 4 models(RF,KNN,XGB,LGBM)

In [None]:
oof_preds_fin=oof_lgb*0.7 + oof_xgb*0.10 + oof_rf*0.10 + oof_knn*0.10

In [None]:
import pickle
with open('oof_fin.pkl','rb') as f:
    oof_final = pickle.load(f)
with open('preds_lgb2.pkl','rb') as f:
    preds_ids_best = pickle.load(f)

In [None]:
# Equal-weight blend of the two pickled prediction arrays. The second pickle is
# loaded under the name `preds_ids_best`; `preds_ids` is never defined, so the
# original expression raised NameError.
oof_preds_fin = oof_final*0.5+preds_ids_best*0.5

In [None]:
# with open('96.19fin.pkl','wb') as f:
#     pickle.dump(oof_preds_fin,f)

In [None]:
# oof_preds_fin = oof_preds_fin*0.2 + preds_ids_best*0.8

In [None]:
preds = [np.argmax(x) for x in oof_final]

In [None]:
df_sub = pd.read_csv('sample_submission_wyi0h0z.csv')

## USING LEAKAGE

In [None]:
# Attach the training label of every submission ID that also occurs in the
# training data. Both frames carry a 'Segmentation' column, so the merge
# yields 'Segmentation_x' (submission file) and 'Segmentation_y' (train label).
df_sub = pd.merge(df_sub,df_train[['ID','Segmentation']],on='ID',how='left')
# -999 marks IDs without a training label. Assign the result back instead of
# calling fillna(inplace=True) on the selected column: that chained form does
# not update df_sub when pandas copy-on-write is active.
df_sub['Segmentation_y'] = df_sub['Segmentation_y'].fillna(-999)
df_sub['Segmentation_pred'] = preds

In [None]:
def get_final_preds(seg_pred,seg_leak):
    """Choose between the model's prediction and a known label.

    ``seg_leak`` equal to the sentinel -999 means that no known label is
    available, in which case ``seg_pred`` is returned; any other value of
    ``seg_leak`` takes precedence over the prediction.
    """
    return seg_pred if seg_leak == -999 else seg_leak

In [None]:
df_sub['Segmentation'] = df_sub.apply(lambda x: get_final_preds(x['Segmentation_pred'],x['Segmentation_y']),axis=1)

In [None]:
df_sub.drop(['Segmentation_x', 'Segmentation_y', 'Segmentation_pred'],axis=1,inplace=True)
df_sub['Segmentation'] = df_sub['Segmentation'].map(rev_seg)

In [None]:
df_sub['Segmentation'].value_counts(normalize=True)

D    0.280548
A    0.254663
C    0.236011
B    0.228778
Name: Segmentation, dtype: float64

In [None]:
df_train['Segmentation'].value_counts(normalize=True)

3.0    0.281111
0.0    0.244422
2.0    0.244175
1.0    0.230293
Name: Segmentation, dtype: float64

In [None]:
df_sub.to_csv('bestmodelensemble.csv',index=False)

# Solution 2:

In [None]:
!pip install rfpimp
!pip install catboost

In [None]:
import pandas as pd
from sklearn.cluster import KMeans,FeatureAgglomeration
pd.set_option('display.max_rows',200)
pd.set_option('display.max_columns',100)
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from rfpimp import *



In [None]:
df_train = pd.read_csv("Train_aBjfeNk.csv")
df_test = pd.read_csv("Test_LqhgPWU.csv")

In [None]:
display(df_test.isnull().sum())
print("-----------")
df_train.isnull().sum()

ID                   0
Gender               0
Ever_Married        50
Age                  0
Graduated           24
Profession          38
Work_Experience    269
Spending_Score       0
Family_Size        113
Var_1               32
dtype: int64

-----------


ID                   0
Gender               0
Ever_Married       140
Age                  0
Graduated           78
Profession         124
Work_Experience    829
Spending_Score       0
Family_Size        335
Var_1               76
Segmentation         0
dtype: int64

In [None]:
df_train.Segmentation.value_counts(normalize = True)

D    0.281111
A    0.244422
C    0.244175
B    0.230293
Name: Segmentation, dtype: float64

In [None]:
catcols = []
for i in df_train.columns:
  if df_train[i].dtype == "object":
      catcols.append(i)

In [None]:
catcols

['Gender',
 'Ever_Married',
 'Graduated',
 'Profession',
 'Spending_Score',
 'Var_1',
 'Segmentation']

In [None]:
# Replace missing categorical values with the "NANO" placeholder in both frames.
df_train[catcols] = df_train[catcols].fillna("NANO")
# The test set has every categorical column except the target. Slicing with
# catcols[:-2] also dropped 'Var_1', which left its NaNs in df_test unfilled
# (and later mean-imputed) while the same gaps in df_train became "NANO".
test_catcols = [c for c in catcols if c != "Segmentation"]
df_test[test_catcols] = df_test[test_catcols].fillna("NANO")

In [None]:
df_train.isnull().sum()

ID                   0
Gender               0
Ever_Married         0
Age                  0
Graduated            0
Profession           0
Work_Experience    829
Spending_Score       0
Family_Size        335
Var_1                0
Segmentation         0
dtype: int64

In [None]:
gender_map = {'Female': 1, 'Male': 0}
marriage_map = {'NANO': 99, 'No': 0, 'Yes': 1}
graduate_map = {'NANO': 99, 'No': 0, 'Yes': 1}
profession_map = {'Artist': 0,'Doctor': 1,'Engineer': 2,'Entertainment': 3,'Executive': 4,'Healthcare': 5,
                   'Homemaker': 6,'Lawyer': 7,'Marketing': 8,'NANO': 99}
spending_map = {'Average': 1, 'High': 2, 'Low': 0}
var_map = {'Cat_1': 1,'Cat_2': 2,'Cat_3': 3,'Cat_4': 4,'Cat_5': 5, 'Cat_6': 6, 'Cat_7': 7,'NANO': 99}
target_map = {'A': 0, 'B': 1, 'C': 2, 'D': 3}

In [None]:
catcols

['Gender',
 'Ever_Married',
 'Graduated',
 'Profession',
 'Spending_Score',
 'Var_1',
 'Segmentation']

# MAP (Label Encode)

In [None]:
from sklearn.preprocessing import LabelEncoder

### Feature(Before LabelEncode)

In [None]:
# Interaction feature: "<Profession>_<Graduated>" as one categorical value,
# label-encoded to integers.
le = LabelEncoder()
df_train["Prof+Grad"] = df_train["Profession"]+"_"+df_train["Graduated"].astype(str)
df_test["Prof+Grad"] = df_test["Profession"]+"_"+df_test["Graduated"].astype(str)

# Fit on train and test together so that a combination occurring only in the
# test set cannot make transform() raise on an unseen label. When every test
# combination also occurs in train, the resulting codes are the same as with a
# train-only fit.
le.fit(pd.concat([df_train["Prof+Grad"], df_test["Prof+Grad"]]))
df_train["Prof+Grad"] = le.transform(df_train["Prof+Grad"])
df_test["Prof+Grad"] = le.transform(df_test["Prof+Grad"])

In [None]:
df_train["Gender"] =  df_train["Gender"].map(gender_map)
df_train["Ever_Married"] =  df_train["Ever_Married"].map(marriage_map)
df_train["Graduated"] =  df_train["Graduated"].map(graduate_map)
df_train["Profession"] =  df_train["Profession"].map(profession_map)
df_train["Spending_Score"] =  df_train["Spending_Score"].map(spending_map)
df_train["Var_1"] =  df_train["Var_1"].map(var_map)
df_train["Segmentation"] =  df_train["Segmentation"].map(target_map)


#---------------------------------------------------------------------


df_test["Gender"] =  df_test["Gender"].map(gender_map)
df_test["Ever_Married"] =  df_test["Ever_Married"].map(marriage_map)
df_test["Graduated"] =  df_test["Graduated"].map(graduate_map)
df_test["Profession"] =  df_test["Profession"].map(profession_map)
df_test["Spending_Score"] =  df_test["Spending_Score"].map(spending_map)
df_test["Var_1"] =  df_test["Var_1"].map(var_map)

### Feature (After LabelEncode)

In [None]:
df_train["Var1_Family"] = df_train["Var_1"] + df_train["Prof+Grad"]
df_test["Var1_Family"] = df_test["Var_1"] + df_test["Prof+Grad"]

In [None]:
# df_train.groupby(["Var_1","Segmentation"]).size().to_frame()[:50]

In [None]:
df_train = df_train.apply(lambda x: x.fillna(x.mean()),axis=0)
df_test = df_test.apply(lambda x: x.fillna(x.mean()),axis=0)

## Checking Train and Test uniques

In [None]:
cols =  df_test.columns.tolist()
for col in cols:
  print('Total unique '+col  +' values in Train are {}'.format(df_train[col].nunique()))
  print('Total unique '+col  +' values in Test are {}'.format(df_test[col].nunique()))
  print('Common'+col +' values are {}'.format(len(list(set(df_train[col]) & set(df_test[col])))))
  print('**************************')

Total unique ID values in Train are 8068
Total unique ID values in Test are 2627
CommonID values are 2332
**************************
Total unique Gender values in Train are 2
Total unique Gender values in Test are 2
CommonGender values are 2
**************************
Total unique Ever_Married values in Train are 3
Total unique Ever_Married values in Test are 3
CommonEver_Married values are 3
**************************
Total unique Age values in Train are 67
Total unique Age values in Test are 67
CommonAge values are 67
**************************
Total unique Graduated values in Train are 3
Total unique Graduated values in Test are 3
CommonGraduated values are 3
**************************
Total unique Profession values in Train are 10
Total unique Profession values in Test are 10
CommonProfession values are 10
**************************
Total unique Work_Experience values in Train are 16
Total unique Work_Experience values in Test are 16
CommonWork_Experience values are 15
************

In [None]:
df_train = df_train.sort_values(by = "ID")
df_train

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation,Prof+Grad,Var1_Family
7238,458982,0,1,61,1,4,1.0,2,3.0,6,2,14,20
5546,458983,1,1,63,1,4,0.0,2,5.0,6,2,14,20
4373,458984,0,1,39,1,0,0.0,1,3.0,6,2,2,8
4695,458985,0,0,23,0,5,1.0,0,4.0,6,3,16,22
3333,458986,0,0,18,0,5,7.0,0,4.0,6,3,16,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4791,467969,1,1,43,1,0,0.0,1,2.0,6,2,2,8
570,467971,1,0,31,1,0,1.0,0,4.0,6,3,2,8
5832,467972,0,0,22,0,0,1.0,0,3.0,6,3,1,7
4140,467973,1,1,66,1,2,0.0,1,3.0,6,0,8,14


## Features

In [None]:
full_df = pd.concat([df_train,df_test])
full_df = full_df.reset_index(drop=True)


In [None]:
full_df

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation,Prof+Grad,Var1_Family
0,458982,0,1,61,1,4,1.000000,2,3.0,6.0,2.0,14,20.0
1,458983,1,1,63,1,4,0.000000,2,5.0,6.0,2.0,14,20.0
2,458984,0,1,39,1,0,0.000000,1,3.0,6.0,2.0,2,8.0
3,458985,0,0,23,0,5,1.000000,0,4.0,6.0,3.0,16,22.0
4,458986,0,0,18,0,5,7.000000,0,4.0,6.0,3.0,16,22.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10690,467954,0,0,29,0,5,9.000000,0,4.0,6.0,,16,22.0
10691,467958,1,0,35,1,1,1.000000,0,1.0,6.0,,5,11.0
10692,467960,1,0,53,1,3,2.552587,0,2.0,6.0,,11,17.0
10693,467961,0,1,47,1,4,1.000000,2,5.0,4.0,,14,18.0


In [None]:
# Neighbour-target features: order the rows by ID, back-fill rows without a
# Segmentation from the next row that has one, then store the label found
# i rows further along in ID order as prev_target_i (the last i rows get NaN).
# The sort and back-fill do not depend on i, so they are done once up front;
# Series.bfill() replaces the deprecated fillna(method='bfill').
seg_by_id = full_df.sort_values(by='ID')['Segmentation'].bfill()
for i in range(1,10):
#     full_df[f'prev_target_{i}'] = full_df.sort_values(by='ID')['Segmentation'].fillna(method='ffill').shift(i).sort_index()
    full_df[f'prev_target_{i}'] = seg_by_id.shift(-i).sort_index()
    # full_df[f'magic_{i}'] = full_df.sort_values(by='ID')['Segmentation'].shift(-i).expanding().mean().fillna(method='bfill').sort_index()
    # full_df[f'next_{i}_insects'] = full_df.sort_values(by='ID').groupby(['Age'])['Segmentation'].apply(lambda x: x.shift(i).ffill())

In [None]:
temp = full_df.groupby(['Age']).agg({'Spending_Score':['count','mean','sum'],
                                   'Work_Experience':['count','sum','min','max','mean'],
                                   'Profession':['min','max'],
                                       'Family_Size':['sum','min','max'],
                                       'Age':['count'],
                                    'Var_1':['count','max','min']})
temp.columns = ['_'.join(x) for x in temp.columns]
full_df = pd.merge(full_df,temp,on=['Age'],how='left')

In [None]:
temp = full_df.groupby(['Profession']).agg({
                                       'Age':['count','sum','min','max']})
temp.columns = ['_Prof_'.join(x) for x in temp.columns]
full_df = pd.merge(full_df,temp,on=['Profession'],how='left')

In [None]:
def get_age_groups(x):
    """Return the age-bracket label that contains ``x``.

    Brackets are closed on the right: 25 maps to '0-25', 26 to '25-35'.
    Anything that is not ``<=`` one of the bounds ends up in '65-100'; that
    covers ages above 65 and also NaN, for which every comparison is false.
    """
    upper_bounds = (
        (25, '0-25'),
        (35, '25-35'),
        (45, '35-45'),
        (55, '45-55'),
        (65, '55-65'),
    )
    # Bounds are ascending, so the first one that admits x is its bracket.
    for limit, label in upper_bounds:
        if x <= limit:
            return label
    return '65-100'

In [None]:
map_bins = {'35-45': 3,
 '25-35': 2,
 '45-55': 4,
 '65-100': 6,
 '0-25': 1,
 '55-65': 5}
full_df['Age_Bins'] = full_df['Age'].apply(lambda x:get_age_groups(x))
full_df['Age_Bins'] = full_df['Age_Bins'].map(map_bins)

In [None]:
# import pickle
# with open('/content/drive/My Drive/Colab Notebooks/JanathaHack/Customer Segmentation/feats.pkl','wb') as f:
#     pickle.dump(full_df,f)

In [None]:
full_df

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation,Prof+Grad,Var1_Family,prev_target_1,prev_target_2,prev_target_3,prev_target_4,prev_target_5,prev_target_6,prev_target_7,prev_target_8,prev_target_9,Spending_Score_count,Spending_Score_mean,Spending_Score_sum,Work_Experience_count,Work_Experience_sum,Work_Experience_min,Work_Experience_max,Work_Experience_mean,Profession_min,Profession_max,Family_Size_sum,Family_Size_min,Family_Size_max,Age_count,Var_1_count,Var_1_max,Var_1_min,Age_Prof_count,Age_Prof_sum,Age_Prof_min,Age_Prof_max,Age_Bins
0,458982,0,1,61,1,4,1.000000,2,3.0,6.0,2.0,14,20.0,2.0,2.0,3.0,3.0,2.0,3.0,1.0,1.0,3.0,126,0.817460,103,126,229.133306,0.0,11.0,1.818518,0,99,347.876625,1.0,7.0,126,126,7.0,1.0,775,39617,18,89,5
1,458983,1,1,63,1,4,0.000000,2,5.0,6.0,2.0,14,20.0,2.0,3.0,3.0,2.0,3.0,1.0,1.0,3.0,0.0,110,0.836364,92,110,215.165501,0.0,14.0,1.956050,0,99,291.525624,1.0,8.0,110,110,7.0,1.0,775,39617,18,89,5
2,458984,0,1,39,1,0,0.000000,1,3.0,6.0,2.0,2,8.0,3.0,3.0,2.0,3.0,1.0,1.0,3.0,0.0,3.0,280,0.592857,166,280,978.970633,0.0,14.0,3.496324,0,99,715.426994,1.0,9.0,280,280,99.0,1.0,3318,153583,18,89,3
3,458985,0,0,23,0,5,1.000000,0,4.0,6.0,3.0,16,22.0,3.0,2.0,3.0,1.0,1.0,3.0,0.0,3.0,0.0,164,0.036585,6,164,304.074393,0.0,11.0,1.854112,0,99,659.400491,1.0,9.0,164,164,99.0,1.0,1750,46781,18,86,1
4,458986,0,0,18,0,5,7.000000,0,4.0,6.0,3.0,16,22.0,2.0,3.0,1.0,1.0,3.0,0.0,3.0,0.0,2.0,174,0.051724,9,174,407.118622,0.0,14.0,2.339762,0,99,687.728505,1.0,9.0,174,174,7.0,1.0,1750,46781,18,86,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10690,467954,0,0,29,0,5,9.000000,0,4.0,6.0,,16,22.0,3.0,3.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,244,0.217213,53,244,780.790449,0.0,14.0,3.199961,0,8,786.252371,1.0,9.0,244,244,99.0,1.0,1750,46781,18,86,2
10691,467958,1,0,35,1,1,1.000000,0,1.0,6.0,,5,11.0,0.0,1.0,1.0,3.0,3.0,3.0,0.0,3.0,0.0,321,0.514019,165,321,1160.836710,0.0,14.0,3.616314,0,99,817.127239,1.0,7.0,321,321,99.0,1.0,930,34894,18,89,2
10692,467960,1,0,53,1,3,2.552587,0,2.0,6.0,,11,17.0,1.0,3.0,3.0,3.0,0.0,3.0,0.0,2.0,2.0,199,0.788945,157,199,476.626980,0.0,14.0,2.395110,0,8,551.402249,1.0,7.0,199,199,99.0,1.0,1250,53679,18,89,4
10693,467961,0,1,47,1,4,1.000000,2,5.0,4.0,,14,18.0,3.0,3.0,0.0,3.0,0.0,2.0,2.0,3.0,3.0,217,0.737327,160,217,419.641046,0.0,13.0,1.933830,0,99,633.402249,1.0,7.0,217,217,99.0,1.0,775,39617,18,89,4


In [None]:
# Split the combined frame back apart: rows without a Segmentation are the
# test set, the rest are training rows. drop() returns a new frame here
# instead of mutating a slice of full_df in place, which is what triggered
# pandas' SettingWithCopyWarning.
is_unlabeled = full_df["Segmentation"].isnull()
df_test = full_df[is_unlabeled].drop("Segmentation",axis=1)
df_train = full_df[~is_unlabeled]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [None]:
df_test

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Prof+Grad,Var1_Family,prev_target_1,prev_target_2,prev_target_3,prev_target_4,prev_target_5,prev_target_6,prev_target_7,prev_target_8,prev_target_9,Spending_Score_count,Spending_Score_mean,Spending_Score_sum,Work_Experience_count,Work_Experience_sum,Work_Experience_min,Work_Experience_max,Work_Experience_mean,Profession_min,Profession_max,Family_Size_sum,Family_Size_min,Family_Size_max,Age_count,Var_1_count,Var_1_max,Var_1_min,Age_Prof_count,Age_Prof_sum,Age_Prof_min,Age_Prof_max,Age_Bins
8068,458989,1,1,36,1,2,0.000000,0,1.0,6.0,8,14.0,1.0,3.0,0.0,3.0,0.0,2.0,2.0,1.0,0.0,277,0.472924,131,277,1049.268026,0.0,14.0,3.787971,0,99,668.800983,1.0,7.0,277,277,99.0,1.0,935,38847,18,81,3
8069,458994,0,1,37,1,5,8.000000,1,4.0,6.0,17,23.0,2.0,1.0,0.0,1.0,1.0,3.0,2.0,2.0,2.0,304,0.549342,167,304,991.745602,0.0,14.0,3.262321,0,99,740.077749,1.0,9.0,304,304,99.0,1.0,1750,46781,18,86,3
8070,458996,1,1,69,0,99,0.000000,0,1.0,6.0,28,34.0,1.0,3.0,2.0,2.0,2.0,2.0,2.0,3.0,2.0,92,0.858696,79,92,116.612914,0.0,9.0,1.267532,0,99,224.201124,1.0,9.0,92,92,99.0,1.0,162,6952,18,88,6
8071,459000,0,1,59,0,4,11.000000,2,2.0,6.0,13,19.0,2.0,2.0,3.0,2.0,2.0,2.0,3.0,3.0,2.0,128,0.789062,101,128,271.163469,0.0,11.0,2.118465,0,99,336.225869,1.0,6.0,128,128,7.0,1.0,775,39617,18,89,5
8072,459001,1,0,19,0,8,2.552587,0,4.0,6.0,25,31.0,2.0,3.0,2.0,2.0,2.0,3.0,3.0,2.0,1.0,184,0.043478,8,184,452.493057,0.0,13.0,2.459201,0,99,737.326606,1.0,9.0,184,184,99.0,1.0,403,14985,18,89,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10690,467954,0,0,29,0,5,9.000000,0,4.0,6.0,16,22.0,3.0,3.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,244,0.217213,53,244,780.790449,0.0,14.0,3.199961,0,8,786.252371,1.0,9.0,244,244,99.0,1.0,1750,46781,18,86,2
10691,467958,1,0,35,1,1,1.000000,0,1.0,6.0,5,11.0,0.0,1.0,1.0,3.0,3.0,3.0,0.0,3.0,0.0,321,0.514019,165,321,1160.836710,0.0,14.0,3.616314,0,99,817.127239,1.0,7.0,321,321,99.0,1.0,930,34894,18,89,2
10692,467960,1,0,53,1,3,2.552587,0,2.0,6.0,11,17.0,1.0,3.0,3.0,3.0,0.0,3.0,0.0,2.0,2.0,199,0.788945,157,199,476.626980,0.0,14.0,2.395110,0,8,551.402249,1.0,7.0,199,199,99.0,1.0,1250,53679,18,89,4
10693,467961,0,1,47,1,4,1.000000,2,5.0,4.0,14,18.0,3.0,3.0,0.0,3.0,0.0,2.0,2.0,3.0,3.0,217,0.737327,160,217,419.641046,0.0,13.0,1.933830,0,99,633.402249,1.0,7.0,217,217,99.0,1.0,775,39617,18,89,4


In [None]:
x = df_train.drop(["Segmentation","ID"],axis=1)
df_test = df_test.drop(["ID"],axis=1)
y = df_train["Segmentation"]

### Train and predict(Baseline)

In [None]:
from lightgbm import LGBMClassifier
# Baseline: a LightGBM classifier with only random_state set, fit on all
# training rows (the commented-out call below is a tuned alternative).
# NOTE(review): the name `lgb` rebinds the `import lightgbm as lgb` module alias
# from the first cell, so the module is no longer reachable as `lgb` afterwards.
# lgb = LGBMClassifier(n_estimators = 300,class_weight = 'balanced',learning_rate=0.3,
#                     max_features = .6,max_depth = 20,random_state=27)
lgb = LGBMClassifier(random_state=27)
lgb.fit(x,y)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=27, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [None]:
# Class predictions of the baseline model for the test rows.
preds= lgb.predict(df_test)

In [None]:
# Display the contents of `catcols` (defined earlier in the notebook).
catcols

['Gender',
 'Ever_Married',
 'Graduated',
 'Profession',
 'Spending_Score',
 'Var_1',
 'Segmentation']

In [None]:
# 10-fold stratified cross-validation. Each fold trains a LightGBM model on the
# training part, scores it on the held-out part, records feature importances
# measured on the held-out part, and predicts class probabilities for df_test.
kfold, scores = StratifiedKFold(n_splits=10, shuffle=True, random_state=27), []

# NOTE: despite its name, `oof` collects each fold model's predict_proba output
# for df_test, not out-of-fold predictions on the training rows.
oof = []
# One row per feature; each fold merges in its own `Importance_<fold>` column.
feature_importance_df = pd.DataFrame()
feature_importance_df['Feature'] = x.columns
# NOTE(review): `final_preds` is not used anywhere in this cell.
final_preds = []

for n_fold,(train_, test_) in enumerate(kfold.split(x,y)):
    # train_ / test_ are positional row indices for this fold.
    x_train, x_test = x.iloc[train_], x.iloc[test_]
    y_train, y_test = y.iloc[train_], y.iloc[test_]
    # NOTE(review): `max_features` is a scikit-learn parameter name; LightGBM's
    # documented column-subsampling parameter is `colsample_bytree`
    # (`feature_fraction`), so this setting is probably ignored - confirm.
    model = LGBMClassifier(n_estimators = 300,class_weight = 'balanced',learning_rate=0.3,
                    max_features = .6,max_depth = 30,random_state=27)
    # model = CatBoostClassifier(random_state=27,verbose = 0)
    model.fit(x_train,y_train)
    # `importances` comes from rfpimp (star-imported in the first cell);
    # presumably permutation importance on the held-out fold - confirm.
    imp_ = importances(model,x_test,y_test)
    # Give the column a per-fold name so the merges below do not collide.
    imp_.rename(columns={
        'Importance':f'Importance_{n_fold}'
    },inplace=True)
    feature_importance_df = pd.merge(feature_importance_df,imp_,on='Feature')
    # Accuracy on the held-out fold.
    preds = model.predict(x_test)
    score = accuracy_score(y_test, preds)
    scores.append(score)
    # Class probabilities for the test set from this fold's model.
    oof.append(model.predict_proba(df_test))
    print(score)
# Model class name followed by the mean accuracy over all folds.
print(str(model).split("(")[0], sum(scores)/len(scores))

0.5142503097893433
0.516728624535316
0.5080545229244114
0.5105328376703842
0.5254027261462205
0.5229244114002478
0.5117719950433705
0.49566294919454773
0.5421836228287841
0.5397022332506204
LGBMClassifier 0.5187214232783246


In [None]:
# CatBoost
#----------------------
# stratified kfold -  0.5353273620092185 LB:95.5
#                     0.5580108603072986 LB:95.33
#                     0.5559026016155169 LB:95.33
# only kfold -  0.55391948859391 LB:95.5
#               0.5400361600265665 - no early stop LB:95.80
# LGBM
#----------------------
# only kfold - 0.5349273720092172 LB:95.809
# Stratified kfold - 0.5269021680641779 LB:96.28

In [None]:
# Total importance of each feature across all CV folds.
# Only the per-fold importance columns are summed: 'Feature' holds the feature
# name, and 'Final' is excluded so the total is never added to itself (which
# would double it) and re-running the cell gives the same result.
fold_cols = [c for c in feature_importance_df.columns if c not in ('Feature', 'Final')]
feature_importance_df['Final'] = feature_importance_df[fold_cols].sum(axis=1)
# Features ranked from most to least important.
imp = feature_importance_df.sort_values(by='Final',ascending=False)[['Feature','Final']]

In [None]:
# Show the features whose summed importance is negative.
imp[imp['Final']<0]

Unnamed: 0,Feature,Final
20,Spending_Score_count,-0.002472
28,Profession_min,-0.002478
29,Profession_max,-0.002506
16,prev_target_6,-0.017339
14,prev_target_4,-0.01737
39,Age_Prof_min,-0.01982
17,prev_target_7,-0.057026


In [None]:
# Names of the features whose summed importance is not negative.
non_negative = imp['Final'] >= 0
feats = imp.loc[non_negative, 'Feature'].to_list()

In [None]:
# Average the per-fold test-set probabilities element-wise across folds.
finalpreds = np.mean(oof, axis=0)

In [None]:
# import pickle
# # with open('preds.pkl','wb') as f:
# #     pickle.dump(finalpreds,f)
# with open('preds.pkl', 'rb') as f:
#     finalpreds = pickle.load(f)

In [None]:
# Turn each row of averaged probabilities into the index of its largest entry.
finalpreds = [np.argmax(row_probs) for row_probs in finalpreds]

In [None]:
# Load the sample submission file, used as the template for the submission,
# and display it.
sub = pd.read_csv("sample_submission_wyi0h0z.csv")
sub

Unnamed: 0,ID,Segmentation
0,458989,A
1,458994,A
2,458996,A
3,459000,A
4,459001,A
...,...,...
2622,467954,A
2623,467958,A
2624,467960,A
2625,467961,A


In [None]:
# Write the predicted class codes into the submission
# (the commented-out line would use `preds` instead of `finalpreds`).
# sub["Segmentation"] = preds
sub["Segmentation"] = finalpreds

In [None]:
# Compare the relative class frequencies of the training labels (first output)
# with those of the submission's predicted labels (second output).
display(df_train["Segmentation"].value_counts(normalize=True))
print("--------------------")
sub["Segmentation"].value_counts(normalize=True)

3.0    0.281111
0.0    0.244422
2.0    0.244175
1.0    0.230293
Name: Segmentation, dtype: float64

--------------------


3    0.301485
0    0.262657
2    0.237153
1    0.198706
Name: Segmentation, dtype: float64

## USING LEAKAGE

In [None]:
# Leakage: some test IDs also occur in the training data, so their label is
# already known. For those rows the prediction in `sub` is replaced by the
# training label; every other row keeps its model prediction.
df_test2 = pd.read_csv('Test_LqhgPWU.csv')
commonIDlist = list(set(df_train["ID"]) & set(df_test2["ID"]))
commonIDlist = pd.DataFrame(commonIDlist,columns=["ID"])
knowntest = commonIDlist.merge(df_train[["ID","Segmentation"]],on="ID",how="left")

# Look the label up by ID instead of by row position, so the result does not
# depend on the row order or row count that a merge happens to produce.
# Duplicate IDs are collapsed to their first label to keep the lookup unique.
known_labels = knowntest.drop_duplicates(subset="ID").set_index("ID")["Segmentation"]
leaked = sub["ID"].map(known_labels)  # NaN where the ID has no training label
# Fall back to the model prediction where no label is known, and keep integer
# class codes so the later code -> letter mapping still applies.
sub["Segmentation"] = leaked.fillna(sub["Segmentation"]).astype(int)

In [None]:
# Compare the training label frequencies (first output) with the submission's
# label frequencies (second output) after the known labels were applied.
display(df_train["Segmentation"].value_counts(normalize=True))
print("--------------------")
sub["Segmentation"].value_counts(normalize=True)

3.0    0.281111
0.0    0.244422
2.0    0.244175
1.0    0.230293
Name: Segmentation, dtype: float64

--------------------


3    0.281690
0    0.254663
2    0.235630
1    0.228017
Name: Segmentation, dtype: float64

In [None]:
# Inverse of target_map: each value of target_map becomes a key that points
# back to its original key. The last line displays the result.
reversed_dictionary = {code: label for label, code in target_map.items()}
reversed_dictionary

{0: 'A', 1: 'B', 2: 'C', 3: 'D'}

In [None]:
# Translate the class codes in the submission through reversed_dictionary.
sub["Segmentation"] = sub["Segmentation"].map(reversed_dictionary)

In [None]:
# Save the submission as submit.csv without writing the DataFrame index.
sub.to_csv("submit.csv",index=False)

In [None]:
## Final Ensemble

In [None]:
import pickle
import numpy as np
import pandas as pd


def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# LGBM predictions, LB(0.9628)
preds_id_lgb_best = _load_pickle('preds_lgb.pkl')
# CB predictions, LB(0.958)
preds_id_cb_best = _load_pickle('preds_cb.pkl')
# Ensemble of Krithik's LGBM (kfold - 0.958) and Karan's Ensemble (0.958); it gave 0.96000
preds_96 = _load_pickle('96fin.pkl')
# Ensemble of Krithik's LGBM (Stratkfold - 0.9628) and Karan's Ensemble (0.958); it gave 0.9619
preds_9619 = _load_pickle('96.19fin.pkl')
# Karan's Ensemble (lgb, rf, xgb, knn) = 0.958
preds_9581 = _load_pickle('oof_fin.pkl')

In [None]:
# Two-stage weighted blend of the saved predictions
# (presumably per-row class-probability arrays - confirm):
#   stage 1: 80% best LGBM + 20% the 0.96 ensemble
#   stage 2: 90% stage 1   + 10% best CatBoost
result1 = preds_id_lgb_best*0.8 + preds_96*0.2
result2 = result1*0.9 + preds_id_cb_best*0.1
# Predicted class code = position of the largest blended value in each row.
finalpreds = [np.argmax(row) for row in result2]

## Using Leakage (Final)

In [None]:
# Reload the raw competition files for the final submission.
# Segment letters are encoded as integer class codes via target_map.
target_map = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
df_train = pd.read_csv("Train_aBjfeNk.csv")
df_train["Segmentation"] = df_train["Segmentation"].map(target_map)

df_test = pd.read_csv("Test_LqhgPWU.csv")

# Start from the sample submission and fill in the blended predictions.
sub = pd.read_csv("sample_submission_wyi0h0z.csv")
sub["Segmentation"] = finalpreds

In [None]:
# Compare the relative class frequencies of the training labels ("TRAIN")
# with those of the submission's predicted labels ("TEST").
print("TRAIN")
display(df_train["Segmentation"].value_counts(normalize=True))
print("--------------------")
print("TEST")
sub["Segmentation"].value_counts(normalize=True)

TRAIN


3    0.281111
0    0.244422
2    0.244175
1    0.230293
Name: Segmentation, dtype: float64

--------------------
TEST


3    0.301865
0    0.265702
2    0.235630
1    0.196802
Name: Segmentation, dtype: float64

In [None]:
# Leakage: some test IDs also occur in the training data, so their label is
# already known. For those rows the prediction in `sub` is replaced by the
# training label; every other row keeps its model prediction.
# The test IDs are taken from `df_test`, which the setup cell of this section
# reads from the raw test file, so this section runs without earlier cells.
commonIDlist = list(set(df_train["ID"]) & set(df_test["ID"]))
commonIDlist = pd.DataFrame(commonIDlist,columns=["ID"])
knowntest = commonIDlist.merge(df_train[["ID","Segmentation"]],on="ID",how="left")

# Look the label up by ID instead of by row position, so the result does not
# depend on the row order or row count that a merge happens to produce.
# Duplicate IDs are collapsed to their first label to keep the lookup unique.
known_labels = knowntest.drop_duplicates(subset="ID").set_index("ID")["Segmentation"]
leaked = sub["ID"].map(known_labels)  # NaN where the ID has no training label
# Assign the whole column at once: element-wise chained assignment
# (sub['Segmentation'][i] = ...) is not guaranteed to modify `sub`.
# Integer class codes are kept so the later code -> letter mapping applies.
sub["Segmentation"] = leaked.fillna(sub["Segmentation"]).astype(int)

### Checking Distribution

In [None]:
# Compare the training label frequencies ("TRAIN") with the submission's label
# frequencies ("TEST") after the known labels were applied.
print("TRAIN")
display(df_train["Segmentation"].value_counts(normalize=True))
print("--------------------")
print("TEST")
sub["Segmentation"].value_counts(normalize=True)

TRAIN


3    0.281111
0    0.244422
2    0.244175
1    0.230293
Name: Segmentation, dtype: float64

--------------------
TEST


3    0.282071
0    0.256566
2    0.234488
1    0.226875
Name: Segmentation, dtype: float64

In [None]:
# Invert target_map (letter -> code) and use the inverse to turn the class
# codes in the submission back into segment letters.
reversed_dictionary = {code: letter for letter, code in target_map.items()}
sub["Segmentation"] = sub["Segmentation"].map(reversed_dictionary)

In [None]:
# Save the final submission as submit.csv without writing the DataFrame index.
sub.to_csv("submit.csv",index=False)