# Load required libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

# SEED Everything : for reproducibility 

In [2]:
SEED = 42

def seed_everything(seed = 42):
    import random, os
    import numpy as np

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(SEED)

# Load dataset 

In [3]:
train_df = pd.read_csv('../input/kagglex-bipoc-2022-2023-ml-foundation/Train.csv')
test_df = pd.read_csv('../input/kagglex-bipoc-2022-2023-ml-foundation/Test.csv')
sample_sub = pd.read_csv('../input/kagglex-bipoc-2022-2023-ml-foundation/Sample_submission.csv')

## inspect the dataset 

In [4]:
# train_df.info()

# Missing value analysis

In [5]:
missing_values_info = train_df.isnull().sum() / len(train_df)

In [6]:
missing_values_info_df = pd.DataFrame()
missing_values_info_df['features'] = missing_values_info.index
missing_values_info_df['missing_values'] = missing_values_info.values

In [7]:
px.bar(x='missing_values', y='features', data_frame=missing_values_info_df, title='Missing values in %', color='features')

# Missing Values Imputation

In [8]:
missing_features = missing_values_info_df[missing_values_info_df['missing_values']>0]['features'].values

In [9]:
missing_features

array(['GRINST', 'MIGMTR1', 'MIGMTR3', 'MIGMTR4', 'MIGSUN', 'PEFNTVTY',
       'PEMNTVTY', 'PENATVTY'], dtype=object)

In [10]:
for col in missing_features:
    train_df[col] = train_df[col].fillna('unknown')
    test_df[col] = test_df[col].fillna('unknown')

In [11]:
# train_df.info()
# test_df.info()

# Analysing the output variable

In [12]:
train_df['TARGET'].describe()

count    157509.000000
mean          0.082408
std           0.274986
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: TARGET, dtype: float64

In [13]:
income_greater_equal_to_50k = train_df['TARGET'].sum()
income_less_than_50k = train_df.shape[0] - income_greater_equal_to_50k
print(income_greater_equal_to_50k, income_less_than_50k)
print("% of people with salary greater than or equal to 50k", (100*income_greater_equal_to_50k/train_df.shape[0]).round(2), "%")

12980 144529
% of people with salary greater than or equal to 50k 8.24 %


# LabelEncoder 

In [14]:
from sklearn.preprocessing import LabelEncoder

In [15]:
for col in train_df.select_dtypes('object'):
    if col != 'ID':
        le = LabelEncoder()
        train_df[col] = le.fit_transform(train_df[col])
        test_df[col] = le.transform(test_df[col])

In [16]:
# train_df.info()
# test_df.info()

# Prepare Train and Validation dataset

* Keeping 80% data for training and 20% for validation. 

* Using stratified approach to split the data. Stratified helps to keep distribution of target variable same for training and validation dataset. 

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X = train_df.drop(columns=['ID','TARGET'])
print(X.shape)
y = train_df.TARGET
print(y.shape)

(157509, 40)
(157509,)


In [19]:
X_train, X_valid, y_train, y_valid = train_test_split(X,y,
                                                        test_size=0.2, 
                                                        stratify=y, 
                                                        random_state=SEED)

In [20]:
X_train.shape, y_train.shape

((126007, 40), (126007,))

### target distribution in training 

In [21]:
y_train.value_counts()/len(y_train)

0    0.917592
1    0.082408
Name: TARGET, dtype: float64

In [22]:
X_valid.shape, y_valid.shape

((31502, 40), (31502,))

### target distribution in validation 

In [23]:
y_valid.value_counts()/len(y_valid)

0    0.917593
1    0.082407
Name: TARGET, dtype: float64

# Helper Function

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm

import lightgbm as lgb
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
import catboost as cb

from sklearn.metrics import fbeta_score

In [25]:
def train_model(classifier, input_x, input_y):
    clf = classifier.fit(input_x, input_y)
    return clf

def evaluate_model(classifier, validation_x, validation_y, eval_metrics=fbeta_score):
    ypred = classifier.predict(validation_x)
    return ypred, eval_metrics(validation_y, ypred, beta=0.5)


# Comparing different ML algorithms

In [26]:
classifiers = {
#     'logistic_regression' : LogisticRegression(solver='liblinear', random_state=SEED),
#     'decision_tree' : DecisionTreeClassifier(random_state=SEED),
#     'random_forest': RandomForestClassifier(random_state=SEED),
#     'linear_svm': svm.SVC(kernel='linear',random_state=SEED),
#     'naive_bayes': GaussianNB(),
#     'k_neighbors': KNeighborsClassifier(),
    'LGBM': lgb.LGBMClassifier(objective="binary", random_state=SEED, n_estimators=250),
    'Gradient_Boosting_Classifier': GradientBoostingClassifier(random_state=SEED),
    'XGBoost' : XGBClassifier(random_state=SEED),
    'CatBoost' : cb.CatBoostClassifier(random_state=SEED)
    }

In [27]:
from collections import defaultdict

In [28]:
trained_models = {}
for classifier_name, classifier in classifiers.items():
#     print("Started for: ", classifier)
    model = train_model(classifier, X_train, y_train)
    ypred, validation_score = evaluate_model(model, X_valid, y_valid)
#     print("Done for: ", classifier)
    trained_models[classifier_name] = {'model': model, 'f1_score': validation_score}

Learning rate set to 0.081246
0:	learn: 0.5711160	total: 93.7ms	remaining: 1m 33s
1:	learn: 0.4879350	total: 124ms	remaining: 1m 1s
2:	learn: 0.4164204	total: 156ms	remaining: 51.7s
3:	learn: 0.3756840	total: 183ms	remaining: 45.5s
4:	learn: 0.3385133	total: 211ms	remaining: 42s
5:	learn: 0.3020995	total: 241ms	remaining: 39.9s
6:	learn: 0.2738372	total: 270ms	remaining: 38.3s
7:	learn: 0.2562095	total: 301ms	remaining: 37.3s
8:	learn: 0.2426776	total: 326ms	remaining: 35.9s
9:	learn: 0.2299725	total: 354ms	remaining: 35.1s
10:	learn: 0.2229403	total: 379ms	remaining: 34.1s
11:	learn: 0.2135409	total: 409ms	remaining: 33.7s
12:	learn: 0.2061069	total: 441ms	remaining: 33.5s
13:	learn: 0.2020128	total: 467ms	remaining: 32.9s
14:	learn: 0.1975049	total: 493ms	remaining: 32.4s
15:	learn: 0.1938144	total: 523ms	remaining: 32.1s
16:	learn: 0.1904087	total: 548ms	remaining: 31.7s
17:	learn: 0.1868059	total: 582ms	remaining: 31.7s
18:	learn: 0.1838788	total: 618ms	remaining: 31.9s
19:	learn: 

## Validation Results

In [29]:
validation_results = defaultdict(list)
for k,v in trained_models.items():
    validation_results['classifier_name'].append(k)
    validation_results['f1_score'].append(v['f1_score'])
validation_results = pd.DataFrame(validation_results)

In [30]:
validation_results

Unnamed: 0,classifier_name,f1_score
0,LGBM,0.684717
1,Gradient_Boosting_Classifier,0.642107
2,XGBoost,0.680881
3,CatBoost,0.69698


In [31]:
px.bar(x='f1_score', y='classifier_name', data_frame=validation_results, color="classifier_name", title='Algorithm Performance Comparison')

# Inference :

* Piciking the best model for inference which is CatBoost

In [32]:
XTest = test_df.drop(columns=['ID'])

In [33]:
ytest_pred = trained_models["CatBoost"]["model"].predict(XTest)

In [34]:
output = sample_sub.copy()
output['TARGET'] = ytest_pred

In [36]:
output.head()

Unnamed: 0,ID,TARGET
0,ai1kagv30p8v,0
1,9s9e3x6a8f7u,0
2,qlvd7mszxd2z,0
3,uwhbqcnx5a5z,0
4,27c5sqbrzdwf,1


In [35]:
output.to_csv('./solution.csv', index=False)