In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, KBinsDiscretizer
from lightgbm import LGBMClassifier, LGBMRegressor
#from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.linear_model import Ridge
from sklearn.metrics import accuracy_score
from IPython.display import display
import matplotlib.pyplot as plt
from collections import Counter
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings("ignore")

In [2]:
def gen_typical_prof(df_train, prof_ind):
    profession = df_train["Profession"].values
    prof_inds = np.where(profession==prof_ind)[0]
    
    # Create aggregated results for each feature
    gender = Counter(df_train["Gender"].values[prof_inds]).most_common(1)[0][0]
    ever_married = Counter(df_train["Ever_Married"].values[prof_inds]).most_common(1)[0][0]
    age = [i for i in df_train["Age"].values[prof_inds] if i!=-999]
    graduated = Counter(df_train["Graduated"].values[prof_inds]).most_common(1)[0][0]
    work_exp = [i for i in df_train["Work_Experience"].values[prof_inds] if i!=-999]
    spend_score = Counter(df_train["Spending_Score"].values[prof_inds]).most_common(1)[0][0]
    family_size = [i for i in df_train["Family_Size"].values[prof_inds] if i!=-999]
    var1 = Counter(df_train["Var_1"].values[prof_inds]).most_common(1)[0][0]

    # Finally return the vector
    return np.array([gender, ever_married, sum(age)/len(age), graduated, sum(work_exp)/len(work_exp), spend_score, sum(family_size)/len(family_size), var1])

In [3]:
# Loads the data
df_train = pd.read_csv("Train.csv")
df_test = pd.read_csv("Test.csv")

# Some global variables
fill_na_val = "JAJA"
seg_mapping = dict()

# Convert Numeric NaNs to -999
features = ["Age", "Work_Experience", "Family_Size"]
df_train[features] = df_train[features].fillna(-999)
df_test[features] = df_test[features].fillna(-999)

# NaN in gender could mean that person is a transgender and does not want to disclose
gender_mapping = {"Male": 0, "Female": 1}
df_train["Gender"] = df_train["Gender"].fillna(2)
df_test["Gender"] = df_test["Gender"].fillna(2)
for gender, label in gender_mapping.items():
    df_train["Gender"] = df_train["Gender"].replace(gender, label)
    df_test["Gender"] = df_test["Gender"].replace(gender, label)

# Nan in ever_married could mean that the person does not want to reveal
# In that case it gives us extra info about the person type
married_mapping = {"Yes": 0, "No": 1}
df_train["Ever_Married"] = df_train["Ever_Married"].fillna(2)
df_test["Ever_Married"] = df_test["Ever_Married"].fillna(2)
for married, label in married_mapping.items():
    df_train["Ever_Married"] = df_train["Ever_Married"].replace(married, label)
    df_test["Ever_Married"] = df_test["Ever_Married"].replace(married, label)
    
# Nan in graduated could mean that the person has not graduate and does not want to reveal
grad_mapping = {"Yes": 0, "No": 1}
df_train["Graduated"] = df_train["Graduated"].fillna(1)
df_test["Graduated"] = df_test["Graduated"].fillna(1)
for grad, label in grad_mapping.items():
    df_train["Graduated"] = df_train["Graduated"].replace(grad, label)
    df_test["Graduated"] = df_test["Graduated"].replace(grad, label)

# Nan in profession could mean that person is unemployed
prof_mapping = {"Artist": 0, "Doctor": 1, "Engineer": 2, "Entertainment": 3, "Executive": 4, "Healthcare": 5, "Homemaker": 6, "Lawyer": 7, "Marketing": 8}
df_train["Profession"] = df_train["Profession"].fillna(9)
df_test["Profession"] = df_test["Profession"].fillna(9)
for prof, label in prof_mapping.items():
    df_train["Profession"] = df_train["Profession"].replace(prof, label)
    df_test["Profession"] = df_test["Profession"].replace(prof, label)

# Spending score is nan means it could be low
ss_mapping = {"Low": 0, "Average": 1, "High": 2}
df_train["Spending_Score"] = df_train["Spending_Score"].fillna(ss_mapping["Low"])
df_test["Spending_Score"] = df_test["Spending_Score"].fillna(ss_mapping["Low"])
for ss, label in ss_mapping.items():
    df_train["Spending_Score"] = df_train["Spending_Score"].replace(ss, label)
    df_test["Spending_Score"] = df_test["Spending_Score"].replace(ss, label)

# NaN in Var1 is just another category
var1_mapping = {"Cat_1": 0, "Cat_2": 1, "Cat_3": 2, "Cat_4": 3, "Cat_5": 4, "Cat_6": 5, "Cat_7": 6}
df_train["Var_1"] = df_train["Var_1"].fillna(7)
df_test["Var_1"] = df_test["Var_1"].fillna(7)
for var1, label in var1_mapping.items():
    df_train["Var_1"] = df_train["Var_1"].replace(var1, label)
    df_test["Var_1"] = df_test["Var_1"].replace(var1, label)

# Finally label encode segmentation
seg_mapping = {"A": 0, "B": 1, "C": 2, "D": 3}
seg_mapping_rev = {0: "A", 1: "B", 2: "C", 3: "D"}
for seg, label in seg_mapping.items():
    df_train["Segmentation"] = df_train["Segmentation"].replace(seg, label)

# Based on EDA, bin the Age feature (Didn't help)
# 1 -> young, 2 -> middle-aged, 3 -> old, 4 -> retired and old
df_train["Age_group"] = [1 if i<=33 else 2 if i>33 and i<65 else 3 if i>=65 and i<74 else 4 for i in df_train["Age"].values]
df_test["Age_group"] = [1 if i<=33 else 2 if i>33 and i<65 else 3 if i>=65 and i<74 else 4 for i in df_test["Age"].values]

# Based on EDA, Create Lonely feature :P
df_train["Lonely"] = [1 if members<2 and age >= 60 and gender == 1 else 0 for members, age, gender in zip(df_train["Family_Size"].values, df_train["Age"].values, df_train["Gender"].values)]
df_test["Lonely"] = [1 if members<2 and age >= 60 and gender == 1 else 0 for members, age, gender in zip(df_test["Family_Size"].values, df_test["Age"].values, df_test["Gender"].values)]

# Bin the ID column and add as feature
est = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='kmeans')
df_train["Binned_ID"] = est.fit_transform(np.reshape(df_train["ID"].values, (-1,1)))
df_test["Binned_ID"] = est.transform(np.reshape(df_test["ID"].values, (-1,1)))

df_train.head()

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation,Age_group,Lonely,Binned_ID
0,462809,0,1,22,1,5,1.0,0,4.0,3,3,1,0,4.0
1,462643,1,0,38,0,2,-999.0,1,3.0,3,0,2,0,4.0
2,466315,1,0,67,0,2,1.0,0,1.0,5,1,3,1,8.0
3,461735,0,0,67,0,7,0.0,2,2.0,5,1,3,0,3.0
4,462669,1,0,40,0,3,-999.0,2,6.0,5,0,2,0,4.0


In [4]:
# Drop the columns specified
cols_to_remove = ["ID", "Segmentation"]
df_train_dum, Y = df_train.drop(cols_to_remove, axis=1), df_train[cols_to_remove[-1]].values
df_test_dum = df_test.drop(cols_to_remove[:-1], axis=1)

# Specify which features are to be treated as Categorical features
cat_feats = ['Gender', 'Ever_Married', 'Graduated', "Var_1", 'Profession', 'Age_group', 'Lonely', "Binned_ID"]
cat_feats_inds = [df_train_dum.columns.get_loc(c) for c in cat_feats]

In [5]:
from sklearn.linear_model import LogisticRegression
# The usual
outer_kfold = KFold(n_splits=5, random_state=27, shuffle=True)
final_scores = list()
for train, test in outer_kfold.split(df_train_dum):
    x_train, x_test = df_train_dum.iloc[train].values, df_train_dum.iloc[test].values
    y_train, y_test = Y[train], Y[test]
    
    model=LogisticRegression()
    model.fit(x_train, y_train)
    
    #model = LGBMClassifier(random_state=100, max_depth=3, n_estimators=200, learning_rate=0.1)
    #model.fit(x_train, y_train, categorical_feature=cat_feats_inds)
    preds = model.predict(x_test)
    
    final_scores.append(accuracy_score(y_test, preds))
    print("Score:", final_scores[-1])
print("\nAverage Score:", np.average(final_scores))

Score: 0.4684014869888476
Score: 0.4628252788104089
Score: 0.4634448574969021
Score: 0.4513329200247985
Score: 0.4463732176069436

Average Score: 0.45847555218558017


In [6]:
from sklearn.neighbors import KNeighborsClassifier  
from sklearn.svm import SVC # "Support vector classifier" 
outer_kfold = KFold(n_splits=5, random_state=27, shuffle=True)
final_scores = list()
for train, test in outer_kfold.split(df_train_dum):
    x_train, x_test = df_train_dum.iloc[train].values, df_train_dum.iloc[test].values
    y_train, y_test = Y[train], Y[test]
    
    model= KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2 )  
    model.fit(x_train, y_train)  
     
   
    
   # model = XGBClassifier(random_state=27, eval_metric="logloss", max_depth=3, n_estimators=82)
   # model.fit(x_train, y_train)
    preds = model.predict(x_test)
    
    final_scores.append(accuracy_score(y_test, preds))
    print("Score:", final_scores[-1])
print("\nAverage Score:", np.average(final_scores))

Score: 0.4473358116480793
Score: 0.4392812887236679
Score: 0.4355638166047088
Score: 0.4426534407935524
Score: 0.44699318040917546

Average Score: 0.44236550763583676


In [7]:
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

outer_kfold = KFold(n_splits=5, random_state=27, shuffle=True)
final_scores = list()

# Generate the folds
for train, test in outer_kfold.split(df_train_dum):
    x_train, x_test = df_train_dum.iloc[train].values, df_train_dum.iloc[test].values
    y_train, y_test = Y[train], Y[test]
    
    # Fit a decision tree classifier
    # The point of this classifier is not to get best results but only to partition and map the data
    # So the only hyper-parameter you should touch is max_leaf_nodes
    # max_leaf_nodes not only defines the maximum leaf nodes in the decision tree but also the maximum partitions in the dataset
    dec_tree = DecisionTreeClassifier(random_state=27, max_leaf_nodes=2)
    dec_tree.fit(x_train, y_train)
    # Now we know which training sample belongs to which partition
    sample_indices = np.array(dec_tree.apply(x_train))
    
    # We now need to find the best weights for lightgbm and xgboost in every partition
    # We will do this by using a KFold CV strategy
    
    # This dictionary will hold the weight corresponding to each partition
    dynamic_weights = dict()
    # For every single partition in the sample space, perform the following steps
    for index in tqdm(list(Counter(sample_indices))):
        indices = np.where(sample_indices==index)[0]
        x, y = x_train[indices], y_train[indices]
        print("\nPartition =", index, "\n==================")
        
        # This variable consists of real numbers between 0 and 1 with a constant difference of 0.01
        # These values correspond to weights within the average
        weights = np.arange(0, 1, 0.01)
        # This variable will be used to find the weight corresponding to the best score
        scores_corr_wts = np.zeros(len(weights))

        # Perform CV to get the best weights for current partition
        kfold = KFold(n_splits=5, random_state=27, shuffle=True)
        for inner_train, inner_test in kfold.split(x):
            xx_train, xx_test = x[inner_train], x[inner_test]
            yy_train, yy_test = y[inner_train], y[inner_test]
            
            
            model = LGBMClassifier(random_state=100, max_depth=3, n_estimators=200, learning_rate=0.1)
            model.fit(xx_train, yy_train)
           #preds = model.predict(x_test)
            preds1=model.predict_proba(xx_test)

            base_model = [ (('lr',LogisticRegression())) ,('knn',KNeighborsClassifier(n_neighbors=5,metric='minkowski' ,p=2 ))]
            meta_model=LogisticRegression()
            stacking_model = StackingClassifier(estimators=base_model,final_estimator=meta_model, passthrough=True)
            stacking_model.fit(xx_train,yy_train)
            preds2=stacking_model.predict_proba(xx_test)
            
            # Each weight is evaluated by calculating the corresponding score
            for i in range(len(weights)):
                final_inner_preds = np.argmax(preds1*weights[i] + preds2*(1-weights[i]), axis=1)
                scores_corr_wts[i]+= accuracy_score(yy_test, final_inner_preds)
                
        # The best weight is the one with the highest CV score
        final_weight = weights[np.argmax(scores_corr_wts)]
        print("Final Weight", final_weight)
        # This weight is assigned to the current partition
        dynamic_weights[index] = final_weight
    
    # Once all partitions have been assigned weights, continue to create your best individual models
    model1 = LogisticRegression()
    model1.fit(x_train, y_train)

    model2 = KNeighborsClassifier()
    model2.fit(x_train, y_train)
    
    # Use the "apply" function offered by sklearn, again, to find out the partition id for each test sample
    test_map = np.array(dec_tree.apply(x_test))
    # This variable will contain all our predictions
    final_preds = np.zeros(len(x_test))
    for index in list(Counter(test_map)):
        # Take all test samples that belong to current partition
        indices = np.where(test_map == index)[0]
        xx_test = x_test[indices]
        # Get the weights corresponding to the current partition
        model_weights = dynamic_weights[index]
        # Blend the predictions using the correct dynamic weights
        out = model1.predict_proba(xx_test)*dynamic_weights[index] + model2.predict_proba(xx_test)*(1-dynamic_weights[index])
        # Save the prediction at the correct indices
        final_preds[indices] = np.argmax(out, axis=1)
    
    final_scores.append(accuracy_score(y_test, final_preds))
    print("\nScore:", final_scores[-1])
print("\nAverage Score:", np.average(final_scores))

  0%|          | 0/2 [00:00<?, ?it/s]


Partition = 2 
Final Weight 0.88

Partition = 1 
Final Weight 0.35000000000000003

Score: 0.4826517967781908


  0%|          | 0/2 [00:00<?, ?it/s]


Partition = 1 
Final Weight 0.34

Partition = 2 
Final Weight 0.9

Score: 0.4739776951672863


  0%|          | 0/2 [00:00<?, ?it/s]


Partition = 1 
Final Weight 0.34

Partition = 2 
Final Weight 0.62

Score: 0.46592317224287483


  0%|          | 0/2 [00:00<?, ?it/s]


Partition = 1 
Final Weight 0.77

Partition = 2 
Final Weight 0.71

Score: 0.4649721016738996


  0%|          | 0/2 [00:00<?, ?it/s]


Partition = 1 
Final Weight 0.48

Partition = 2 
Final Weight 0.65

Score: 0.46869187848729077

Average Score: 0.4712433288699084
