# Problem Understanding

### Census KDD dataset, a popular dataset for binary classification. The dataset contains information on individuals from the 1994 US Census database, and the task is to predict whether a given individual has a salary greater than or less than $50K per year.

# Load required libraries

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

### This code imports necessary libraries for data analysis, such as pandas for data manipulation, numpy for numerical operations, and matplotlib and plotly for data visualization. 

# SEED Everything : for reproducibility 

In [4]:
SEED = 42

def seed_everything(seed = 42):
    import random, os
    import numpy as np

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(SEED)

### This code sets a seed value of 42 for the random number generator, which ensures that the same random values are generated every time the code is run.

# Load dataset 

In [5]:
train_df = pd.read_csv('../input/kagglex-bipoc-2022-2023-ml-foundation/Train.csv')
test_df = pd.read_csv('../input/kagglex-bipoc-2022-2023-ml-foundation/Test.csv')
sample_sub = pd.read_csv('../input/kagglex-bipoc-2022-2023-ml-foundation/Sample_submission.csv')

## inspect the dataset 

In [6]:
train_df.info()
train_df.describe().T

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157509 entries, 0 to 157508
Data columns (total 42 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   ID        157509 non-null  object
 1   AAGE      157509 non-null  int64 
 2   ACLSWKR   157509 non-null  object
 3   ADTIND    157509 non-null  int64 
 4   ADTOCC    157509 non-null  int64 
 5   AHGA      157509 non-null  object
 6   AHRSPAY   157509 non-null  int64 
 7   AHSCOL    157509 non-null  object
 8   AMARITL   157509 non-null  object
 9   AMJIND    157509 non-null  object
 10  AMJOCC    157509 non-null  object
 11  ARACE     157509 non-null  object
 12  AREORGN   157509 non-null  object
 13  ASEX      157509 non-null  object
 14  AUNMEM    157509 non-null  object
 15  AUNTYPE   157509 non-null  object
 16  AWKSTAT   157509 non-null  object
 17  CAPGAIN   157509 non-null  int64 
 18  CAPLOSS   157509 non-null  int64 
 19  DIVVAL    157509 non-null  int64 
 20  FILESTAT  157509 non-null 

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
AAGE,157509.0,39.816112,19.404058,0.0,26.0,38.0,52.0,90.0
ADTIND,157509.0,20.30844,18.18385,0.0,0.0,24.0,37.0,51.0
ADTOCC,157509.0,14.97183,14.890825,0.0,0.0,12.0,29.0,46.0
AHRSPAY,157509.0,72.889797,311.546534,0.0,0.0,0.0,0.0,9916.0
CAPGAIN,157509.0,577.585992,5403.024319,0.0,0.0,0.0,0.0,99999.0
CAPLOSS,157509.0,49.300015,311.199032,0.0,0.0,0.0,0.0,4608.0
DIVVAL,157509.0,264.806386,2282.967898,0.0,0.0,0.0,0.0,99999.0
NOEMP,157509.0,2.580913,2.400063,0.0,0.0,2.0,5.0,6.0
SEOTR,157509.0,0.234825,0.62965,0.0,0.0,0.0,0.0,2.0
VETYN,157509.0,1.834517,0.538978,0.0,2.0,2.0,2.0,2.0


### The train_df.info() function provides information about the DataFrame, such as the number of rows, columns, data types, and missing values.

### The train_df.describe().T function computes summary statistics for the numerical columns in the DataFrame and transposes the output for readability. This includes the count, mean, standard deviation, minimum, maximum, and quartile values for each column.

# Missing value analysis

In [39]:
missing_values_info = train_df.isnull().sum() / len(train_df)
missing_values_info_df = pd.DataFrame()
missing_values_info_df['features'] = missing_values_info.index
missing_values_info_df['missing_values'] = missing_values_info.values

### This code calculates the percentage of missing values in each column of a dataframe called "train_df". 

### It then creates a new dataframe called "missing_values_info_df" with two columns: "features" which contains the names of the columns in "train_df", and "missing_values" which contains the corresponding percentage of missing values. 

In [9]:
px.bar(x='missing_values', y='features', data_frame=missing_values_info_df, title='Missing values in %', color='features')

# Missing Values Imputation

In [10]:
missing_features = missing_values_info_df[missing_values_info_df['missing_values']>0]['features'].values

### This line of code selects the names of the features that have missing values from a DataFrame called "missing_values_info_df" and stores them in an array called "missing_features"
### It then selects the "features" column from these filtered rows and converts it into a NumPy array using the "values" method.

In [11]:
missing_features

array(['GRINST', 'MIGMTR1', 'MIGMTR3', 'MIGMTR4', 'MIGSUN', 'PEFNTVTY',
       'PEMNTVTY', 'PENATVTY'], dtype=object)

In [12]:
for col in missing_features:
    train_df[col] = train_df[col].fillna('unknown')
    test_df[col] = test_df[col].fillna('unknown')

### This code fills the missing values in columns specified by the "missing_features" variable with the string value "unknown" in both the "train_df" and "test_df" data frames.

In [40]:
# train_df.info()
# test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157509 entries, 0 to 157508
Data columns (total 42 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   ID        157509 non-null  object
 1   AAGE      157509 non-null  int64 
 2   ACLSWKR   157509 non-null  int64 
 3   ADTIND    157509 non-null  int64 
 4   ADTOCC    157509 non-null  int64 
 5   AHGA      157509 non-null  int64 
 6   AHRSPAY   157509 non-null  int64 
 7   AHSCOL    157509 non-null  int64 
 8   AMARITL   157509 non-null  int64 
 9   AMJIND    157509 non-null  int64 
 10  AMJOCC    157509 non-null  int64 
 11  ARACE     157509 non-null  int64 
 12  AREORGN   157509 non-null  int64 
 13  ASEX      157509 non-null  int64 
 14  AUNMEM    157509 non-null  int64 
 15  AUNTYPE   157509 non-null  int64 
 16  AWKSTAT   157509 non-null  int64 
 17  CAPGAIN   157509 non-null  int64 
 18  CAPLOSS   157509 non-null  int64 
 19  DIVVAL    157509 non-null  int64 
 20  FILESTAT  157509 non-null 

# Analysing the output variable

In [14]:
train_df['TARGET'].describe()

count    157509.000000
mean          0.082408
std           0.274986
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: TARGET, dtype: float64

 The code computes descriptive statistics for the numerical column 'TARGET' in the DataFrame 'train_df'. 

> 1. The output shows that there are 157,509 non-null values in the 'TARGET' column. 
> 
> 1. The mean value of the column is 0.0824, indicating that on average, approximately 8% of the observations have a value of 1. 
>  
> 1. The standard deviation is 0.275, which means that the values in the column are spread out over a wide range. 
>  
> 1. The minimum value is 0, indicating that the majority of observations have a value of 0. 
>  
> 1. The 25th percentile value is also 0, which means that the majority of observations lie below this point. The 50th and 75th percentile values are also 0, indicating that the majority of observations have a value of 0. 
>  
> 1. The maximum value is 1, indicating that there are some observations with a value of 1 in the 'TARGET' column.

# LabelEncoder 

In [41]:
from sklearn.preprocessing import LabelEncoder
for col in train_df.select_dtypes('object'):
    if col != 'ID':
        le = LabelEncoder()
        train_df[col] = le.fit_transform(train_df[col])
        test_df[col] = le.transform(test_df[col])

### Label encoding is a technique for encoding categorical variables as integers. Each unique value in a categorical column is assigned a unique integer value.

In [18]:
# train_df.info()
# test_df.info()

# Prepare Train and Validation dataset

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
X = train_df.drop(columns=['ID','TARGET'])
print(X.shape)
y = train_df.TARGET
print(y.shape)

(157509, 40)
(157509,)


### Keeping 80% data for training and 20% for validation. 

In [21]:
X_train, X_valid, y_train, y_valid = train_test_split(X,y,
                                                        test_size=0.2, 
                                                        stratify=y, 
                                                        random_state=SEED)

In [22]:
X_train.shape, y_train.shape

((126007, 40), (126007,))

### target distribution in training 

In [23]:
y_train.value_counts()/len(y_train)

0    0.917592
1    0.082408
Name: TARGET, dtype: float64

In [24]:
X_valid.shape, y_valid.shape

((31502, 40), (31502,))

### target distribution in validation 

In [25]:
y_valid.value_counts()/len(y_valid)

0    0.917593
1    0.082407
Name: TARGET, dtype: float64

# Helper Function

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm

import lightgbm as lgb
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
import catboost as cb

from sklearn.metrics import fbeta_score

In [27]:
def train_model(classifier, input_x, input_y):
    clf = classifier.fit(input_x, input_y)
    return clf

def evaluate_model(classifier, validation_x, validation_y, eval_metrics=fbeta_score):
    ypred = classifier.predict(validation_x)
    return ypred, eval_metrics(validation_y, ypred, beta=0.5)


# Comparing different ML algorithms

In [28]:
classifiers = {
#     'logistic_regression' : LogisticRegression(solver='liblinear', random_state=SEED),
#     'decision_tree' : DecisionTreeClassifier(random_state=SEED),
#     'random_forest': RandomForestClassifier(random_state=SEED),
#     'linear_svm': svm.SVC(kernel='linear',random_state=SEED),
#     'naive_bayes': GaussianNB(),
#     'k_neighbors': KNeighborsClassifier(),
    'LGBM': lgb.LGBMClassifier(objective="binary", random_state=SEED, n_estimators=250),
    'Gradient_Boosting_Classifier': GradientBoostingClassifier(random_state=SEED),
    'XGBoost' : XGBClassifier(random_state=SEED),
    'CatBoost' : cb.CatBoostClassifier(random_state=SEED)
    }

In [30]:
from collections import defaultdict
trained_models = {}
for classifier_name, classifier in classifiers.items():
#     print("Started for: ", classifier)
    model = train_model(classifier, X_train, y_train)
    ypred, validation_score = evaluate_model(model, X_valid, y_valid)
#     print("Done for: ", classifier)
    trained_models[classifier_name] = {'model': model, 'f1_score': validation_score}

Learning rate set to 0.081246
0:	learn: 0.5711160	total: 98ms	remaining: 1m 37s
1:	learn: 0.4879350	total: 131ms	remaining: 1m 5s
2:	learn: 0.4164204	total: 170ms	remaining: 56.4s
3:	learn: 0.3756840	total: 194ms	remaining: 48.4s
4:	learn: 0.3385133	total: 225ms	remaining: 44.7s
5:	learn: 0.3020995	total: 256ms	remaining: 42.4s
6:	learn: 0.2738372	total: 285ms	remaining: 40.5s
7:	learn: 0.2562095	total: 320ms	remaining: 39.7s
8:	learn: 0.2426776	total: 348ms	remaining: 38.3s
9:	learn: 0.2299725	total: 379ms	remaining: 37.5s
10:	learn: 0.2229403	total: 403ms	remaining: 36.3s
11:	learn: 0.2135409	total: 433ms	remaining: 35.6s
12:	learn: 0.2061069	total: 469ms	remaining: 35.6s
13:	learn: 0.2020128	total: 495ms	remaining: 34.9s
14:	learn: 0.1975049	total: 522ms	remaining: 34.3s
15:	learn: 0.1938144	total: 553ms	remaining: 34s
16:	learn: 0.1904087	total: 580ms	remaining: 33.5s
17:	learn: 0.1868059	total: 615ms	remaining: 33.6s
18:	learn: 0.1838788	total: 651ms	remaining: 33.6s
19:	learn: 0.

## Validation Results

In [31]:
validation_results = defaultdict(list)
for k,v in trained_models.items():
    validation_results['classifier_name'].append(k)
    validation_results['f1_score'].append(v['f1_score'])
validation_results = pd.DataFrame(validation_results)

In [32]:
validation_results

Unnamed: 0,classifier_name,f1_score
0,LGBM,0.684717
1,Gradient_Boosting_Classifier,0.642107
2,XGBoost,0.680881
3,CatBoost,0.69698


In [33]:
px.bar(x='f1_score', y='classifier_name', data_frame=validation_results, color="classifier_name", title='Algorithm Performance Comparison')

# Inference

### CatBoost is the best performing classifier among the four based on the f1_score.

In [34]:
XTest = test_df.drop(columns=['ID'])

In [35]:
ytest_pred = trained_models["CatBoost"]["model"].predict(XTest)

In [36]:
output = sample_sub.copy()
output['TARGET'] = ytest_pred

In [37]:
output.head()

Unnamed: 0,ID,TARGET
0,ai1kagv30p8v,0
1,9s9e3x6a8f7u,0
2,qlvd7mszxd2z,0
3,uwhbqcnx5a5z,0
4,27c5sqbrzdwf,1


In [38]:
output.to_csv('./solution.csv', index=False)