In [1]:
# Uncomment following lines to install plotly if it's not installed 

!pip install plotly
!pip install --upgrade nbformat

Collecting nbformat
  Downloading nbformat-5.7.1-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.2/77.2 kB[0m [31m616.2 kB/s[0m eta [36m0:00:00[0m
Installing collected packages: nbformat
  Attempting uninstall: nbformat
    Found existing installation: nbformat 5.4.0
    Uninstalling nbformat-5.4.0:
      Successfully uninstalled nbformat-5.4.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
beatrix-jupyterlab 3.1.7 requires google-cloud-bigquery-storage, which is not installed.[0m[31m
[0mSuccessfully installed nbformat-5.7.1
[0m

# Load required libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import fbeta_score
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from collections import defaultdict
import plotly.express as px

# SEED Everything : for reproducibility 

In [3]:
SEED = 42


def seed_everything(seed = 42):
    """Seed the random number generators used in this notebook.

    Seeds Python's built-in ``random`` module and NumPy's global generator
    with ``seed``, and stores ``str(seed)`` in the ``PYTHONHASHSEED``
    environment variable.

    Parameters
    ----------
    seed : int, default 42
        Value handed to every seeding call.
    """
    import os
    import random
    import numpy as np

    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(SEED)

# Load dataset 

In [4]:
# Read the training set, the test set and the sample submission, in that order.
train_df, test_df, sample_sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/kagglex-bipoc-2022-2023-ml-foundation/Train.csv',
        '../input/kagglex-bipoc-2022-2023-ml-foundation/Test.csv',
        '../input/kagglex-bipoc-2022-2023-ml-foundation/Sample_submission.csv',
    )
)

In [5]:
train_df.head()

Unnamed: 0,ID,AAGE,ACLSWKR,ADTIND,ADTOCC,AHGA,AHRSPAY,AHSCOL,AMARITL,AMJIND,...,PEFNTVTY,PEMNTVTY,PENATVTY,PRCITSHP,SEOTR,VETQVA,VETYN,WKSWORK,YEAR,TARGET
0,shydl6zxik4o,22,Not in universe,0,0,High school graduate,0,Not in universe,Divorced,Not in universe or children,...,United-States,United-States,United-States,Native- Born in the United States,2,Not in universe,2,36,1995,0
1,2dhm421r62rr,37,Private,35,33,Some college but no degree,0,Not in universe,Divorced,Finance insurance and real estate,...,United-States,United-States,United-States,Native- Born in the United States,2,Not in universe,2,47,1995,0
2,ys7nzza2hffj,83,Not in universe,0,0,1st 2nd 3rd or 4th grade,0,Not in universe,Widowed,Not in universe or children,...,Puerto-Rico,Puerto-Rico,Puerto-Rico,Native- Born in Puerto Rico or U S Outlying,0,Not in universe,2,0,1995,0
3,iqrpxh3sr7n1,46,Private,32,35,High school graduate,0,Not in universe,Married-civilian spouse present,Wholesale trade,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,1995,0
4,rr8vlzk9iwyn,35,Private,34,26,Associates degree-academic program,0,Not in universe,Married-civilian spouse present,Finance insurance and real estate,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,45,1995,0


In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157509 entries, 0 to 157508
Data columns (total 42 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   ID        157509 non-null  object
 1   AAGE      157509 non-null  int64 
 2   ACLSWKR   157509 non-null  object
 3   ADTIND    157509 non-null  int64 
 4   ADTOCC    157509 non-null  int64 
 5   AHGA      157509 non-null  object
 6   AHRSPAY   157509 non-null  int64 
 7   AHSCOL    157509 non-null  object
 8   AMARITL   157509 non-null  object
 9   AMJIND    157509 non-null  object
 10  AMJOCC    157509 non-null  object
 11  ARACE     157509 non-null  object
 12  AREORGN   157509 non-null  object
 13  ASEX      157509 non-null  object
 14  AUNMEM    157509 non-null  object
 15  AUNTYPE   157509 non-null  object
 16  AWKSTAT   157509 non-null  object
 17  CAPGAIN   157509 non-null  int64 
 18  CAPLOSS   157509 non-null  int64 
 19  DIVVAL    157509 non-null  int64 
 20  FILESTAT  157509 non-null 

# Missing value analysis

In [7]:
# Share of missing values per column, expressed in percent (0-100) so that it
# matches the 'Missing values in %' title used when these values are plotted.
missing_values_info = train_df.isnull().mean() * 100

In [8]:
# Tabulate the per-column missing-value figures: one row per feature.
missing_values_info_df = pd.DataFrame(
    {
        'features': missing_values_info.index,
        'missing_values': missing_values_info.values,
    }
)

### Using plotly-express for interactive graphs. 
[https://plotly.com/python/plotly-express/](https://plotly.com/python/plotly-express/)

In [9]:
# Bar chart of the missing-value figure, one bar (and colour) per feature.
px.bar(
    missing_values_info_df,
    x='missing_values',
    y='features',
    color='features',
    title='Missing values in %',
)

# Missing Values Imputation
* imputing with new category 'unknown'

In [10]:
# Names of the features that have at least one missing value in the training set.
has_missing = missing_values_info_df['missing_values'] > 0
selected_features = missing_values_info_df.loc[has_missing, 'features'].values

In [11]:
# Replace missing entries with the placeholder category 'unknown' in both the
# training and the test frame, for every feature selected above.
for frame in (train_df, test_df):
    for col in selected_features:
        frame[col] = frame[col].fillna('unknown')

# LabelEncoder
* Converting categorical data to numerical form. LabelEncoder encodes labels with values between 0 and n_classes-1.

* Using select_dtypes('object') to retrieve the string columns. select_dtypes returns a subset of the DataFrame’s columns based on the column dtypes.

* [Reference](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html) 

In [12]:
# Integer-encode every string column except the row identifier 'ID'.
# The fitted encoders are kept so the codes can be mapped back to the original
# categories later (e.g. with label_encoders[col].inverse_transform).
label_encoders = {}
categorical_cols = [col for col in train_df.select_dtypes('object') if col != 'ID']
for col in categorical_cols:
    le = LabelEncoder()
    # Fit on the union of train and test values: an encoder fitted on the
    # training column alone raises ValueError in transform() as soon as the test
    # column contains a category it has not seen. When every test category also
    # occurs in train, the resulting codes are the same as with a train-only fit.
    le.fit(pd.concat([train_df[col], test_df[col]], ignore_index=True))
    train_df[col] = le.transform(train_df[col])
    test_df[col] = le.transform(test_df[col])
    label_encoders[col] = le

# Prepare Train and Validation dataset

* Keeping 80% data for training and 20% for validation. 

* Using a stratified split. Stratification keeps the distribution of the target variable the same in the training and validation datasets.

In [13]:
# Feature matrix: every column except the row identifier and the label.
X = train_df.drop(columns=['ID', 'TARGET'])
# Label vector.
y = train_df['TARGET']

print(X.shape)
print(y.shape)

(157509, 40)
(157509,)


In [14]:
# Hold out 20% of the rows for validation; stratifying on y keeps the class
# proportions the same in both parts, and SEED makes the split repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=SEED,
)
X_train, X_valid, y_train, y_valid = split_parts

In [15]:
X_train.shape, y_train.shape

((126007, 40), (126007,))

### target distribution in training 

In [16]:
y_train.value_counts()/len(y_train)

0    0.917592
1    0.082408
Name: TARGET, dtype: float64

In [17]:
X_valid.shape, y_valid.shape

((31502, 40), (31502,))

### target distribution in validation 

In [18]:
y_valid.value_counts()/len(y_valid)

0    0.917593
1    0.082407
Name: TARGET, dtype: float64

# Helper Function

In [19]:
def train_model(classifier, input_x, input_y):
    """Fit ``classifier`` on the given data and return the result of ``fit``.

    Parameters
    ----------
    classifier : object exposing ``fit(X, y)``
    input_x : training features
    input_y : training labels

    Returns
    -------
    Whatever ``classifier.fit(input_x, input_y)`` returns.
    """
    return classifier.fit(input_x, input_y)

def evaluate_model(classifier, validation_x, validation_y, eval_metrics=fbeta_score, beta=0.5):
    """Predict on the validation data and score the predictions.

    Parameters
    ----------
    classifier : fitted object exposing ``predict(X)``
    validation_x : validation features
    validation_y : true validation labels
    eval_metrics : callable, default ``fbeta_score``
        Called as ``eval_metrics(validation_y, predictions, ...)``.
    beta : float or None, default 0.5
        Forwarded to ``eval_metrics`` as the ``beta`` keyword. Pass ``None`` to
        call a metric that does not accept ``beta``.

    Returns
    -------
    tuple
        ``(predictions, score)``.
    """
    ypred = classifier.predict(validation_x)
    # Only forward beta when one is given, so metrics without that keyword work too.
    metric_kwargs = {} if beta is None else {'beta': beta}
    return ypred, eval_metrics(validation_y, ypred, **metric_kwargs)


# Comparing different ML algorithms

In [20]:
# Candidate models, keyed by a short name; all share the same random seed.
classifiers = dict(
    logistic_regression=LogisticRegression(solver='liblinear', random_state=SEED),
    decision_tree=DecisionTreeClassifier(random_state=SEED),
    random_forest=RandomForestClassifier(random_state=SEED),
)

In [21]:
# Fit every candidate on the training split and score it on the validation split.
# NOTE: the value stored under 'f1_score' is whatever evaluate_model returns with
# its defaults, i.e. fbeta_score with beta=0.5 rather than a plain F1 score.
trained_models = {}

for name, estimator in classifiers.items():
    fitted = train_model(estimator, X_train, y_train)
    _, score = evaluate_model(fitted, X_valid, y_valid)
    trained_models[name] = dict(model=fitted, f1_score=score)

In [22]:
# One row per trained classifier with its validation score.
# Built in a single step so validation_results is always a DataFrame (never a
# temporary dict) and keeps both columns even when trained_models is empty.
validation_results = pd.DataFrame(
    {
        'classifier_name': list(trained_models),
        'f1_score': [entry['f1_score'] for entry in trained_models.values()],
    }
)

In [23]:
validation_results

Unnamed: 0,classifier_name,f1_score
0,logistic_regression,0.53732
1,decision_tree,0.464469
2,random_forest,0.647754


In [24]:
# Bar chart comparing the validation score of each classifier.
px.bar(
    validation_results,
    x='f1_score',
    y='classifier_name',
    color="classifier_name",
    title='Algorithm Performance Comparison',
)

# Inference :

* Picking the best model for inference, which is RandomForest

In [25]:
# Test features: the test frame without the row identifier column.
XTest = test_df.drop('ID', axis='columns')

In [26]:
# Pick the classifier with the highest validation score instead of hard-coding
# its name, so the choice stays correct if the comparison above changes.
best_model_name = max(trained_models, key=lambda name: trained_models[name]['f1_score'])
ytest_pred = trained_models[best_model_name]['model'].predict(XTest)

In [27]:
# Submission frame: a copy of the sample submission with TARGET replaced by the
# test predictions (assigned positionally, row by row).
output = sample_sub.assign(TARGET=ytest_pred)
output.head()

Unnamed: 0,ID,TARGET
0,ai1kagv30p8v,0
1,9s9e3x6a8f7u,0
2,qlvd7mszxd2z,0
3,uwhbqcnx5a5z,0
4,27c5sqbrzdwf,1


In [28]:
output['TARGET'].value_counts()/len(output)

0    0.952863
1    0.047137
Name: TARGET, dtype: float64

In [29]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
output.to_csv('./solution.csv', index=False)