In [96]:
# Uncomment following lines to install plotly if it's not installed 

!pip install plotly
!pip install --upgrade nbformat

[0m

# Load required libraries

In [97]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import fbeta_score
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from collections import defaultdict
import plotly.express as px

# SEED Everything : for reproducibility 

In [98]:
SEED = 42

def seed_everything(seed = 42):
    import random, os
    import numpy as np
    
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
seed_everything(SEED)

# Load dataset 

In [99]:
train_df = pd.read_csv('../input/kagglex-bipoc-2022-2023-ml-foundation/Train.csv')
test_df = pd.read_csv('../input/kagglex-bipoc-2022-2023-ml-foundation/Test.csv')
sample_sub = pd.read_csv('../input/kagglex-bipoc-2022-2023-ml-foundation/Sample_submission.csv')

In [100]:
train_df.rename(columns = {
#             'ID': , 
            'AAGE': 'age', 
            'ACLSWKR': 'class of worker', 
            'ADTIND': 'industry code', 
            'ADTOCC': 'occupation code', 
            'AHGA':'education', 
            'AHRSPAY':'wage per hour',
            'AHSCOL':'enrolled in edu inst last wk', 
            'AMARITL':'marital status', 
            'AMJIND':'major industry code', 
            'AMJOCC':'major occupation code', 
            'ARACE':'race', 
            'AREORGN':'hispanic origin', 
            'ASEX':'gender',
            'AUNMEM':'member of a labor union', 
            'AUNTYPE':'reason for unemployment', 
            'AWKSTAT':'full or part time employment stat', 
            'CAPGAIN':'capital gains', 
            'CAPLOSS':'capital losses', 
            'DIVVAL':'divdends from stocks',
            'FILESTAT': 'tax filer status', 
            'GRINREG':'region of previous residence', 
            'GRINST':'state of previous residence', 
            'HHDFMX':'detailed household and family stat', 
            'HHDREL':'detailed household summary in household', 
            'MIGMTR1':'migration code-change in msa',
            'MIGMTR3':'migration code-change in reg', 
            'MIGMTR4':'migration code-move within reg', 
            'MIGSAME':'live in this house 1 year ago', 
            'MIGSUN':'migration prev res in sunbelt', 
            'NOEMP':'num persons worked for employer', 
            'PARENT':'family members under 18',
            'PEFNTVTY':'country of birth father', 
            'PEMNTVTY':'country of birth mother', 
            'PENATVTY':'country of birth self', 
            'PRCITSHP':'citizenship', 
            'SEOTR':'own business or self employed', 
            'VETQVA':'fill inc questionnaire for veteran\'s admin',
            'VETYN':'veterans benefits', 
            'WKSWORK':'weeks worked in year', 
#             'YEAR', 
#             'TARGET'    
}, inplace=True)

In [101]:
test_df.rename(columns = {
#             'ID': , 
            'AAGE': 'age', 
            'ACLSWKR': 'class of worker', 
            'ADTIND': 'industry code', 
            'ADTOCC': 'occupation code', 
            'AHGA':'education', 
            'AHRSPAY':'wage per hour',
            'AHSCOL':'enrolled in edu inst last wk', 
            'AMARITL':'marital status', 
            'AMJIND':'major industry code', 
            'AMJOCC':'major occupation code', 
            'ARACE':'race', 
            'AREORGN':'hispanic origin', 
            'ASEX':'gender',
            'AUNMEM':'member of a labor union', 
            'AUNTYPE':'reason for unemployment', 
            'AWKSTAT':'full or part time employment stat', 
            'CAPGAIN':'capital gains', 
            'CAPLOSS':'capital losses', 
            'DIVVAL':'divdends from stocks',
            'FILESTAT': 'tax filer status', 
            'GRINREG':'region of previous residence', 
            'GRINST':'state of previous residence', 
            'HHDFMX':'detailed household and family stat', 
            'HHDREL':'detailed household summary in household', 
            'MIGMTR1':'migration code-change in msa',
            'MIGMTR3':'migration code-change in reg', 
            'MIGMTR4':'migration code-move within reg', 
            'MIGSAME':'live in this house 1 year ago', 
            'MIGSUN':'migration prev res in sunbelt', 
            'NOEMP':'num persons worked for employer', 
            'PARENT':'family members under 18',
            'PEFNTVTY':'country of birth father', 
            'PEMNTVTY':'country of birth mother', 
            'PENATVTY':'country of birth self', 
            'PRCITSHP':'citizenship', 
            'SEOTR':'own business or self employed', 
            'VETQVA':'fill inc questionnaire for veteran\'s admin',
            'VETYN':'veterans benefits', 
            'WKSWORK':'weeks worked in year', 
#             'YEAR', 
#             'TARGET'    
}, inplace=True)

In [102]:
train_df.head()

Unnamed: 0,ID,age,class of worker,industry code,occupation code,education,wage per hour,enrolled in edu inst last wk,marital status,major industry code,...,country of birth father,country of birth mother,country of birth self,citizenship,own business or self employed,fill inc questionnaire for veteran's admin,veterans benefits,weeks worked in year,YEAR,TARGET
0,shydl6zxik4o,22,Not in universe,0,0,High school graduate,0,Not in universe,Divorced,Not in universe or children,...,United-States,United-States,United-States,Native- Born in the United States,2,Not in universe,2,36,1995,0
1,2dhm421r62rr,37,Private,35,33,Some college but no degree,0,Not in universe,Divorced,Finance insurance and real estate,...,United-States,United-States,United-States,Native- Born in the United States,2,Not in universe,2,47,1995,0
2,ys7nzza2hffj,83,Not in universe,0,0,1st 2nd 3rd or 4th grade,0,Not in universe,Widowed,Not in universe or children,...,Puerto-Rico,Puerto-Rico,Puerto-Rico,Native- Born in Puerto Rico or U S Outlying,0,Not in universe,2,0,1995,0
3,iqrpxh3sr7n1,46,Private,32,35,High school graduate,0,Not in universe,Married-civilian spouse present,Wholesale trade,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,1995,0
4,rr8vlzk9iwyn,35,Private,34,26,Associates degree-academic program,0,Not in universe,Married-civilian spouse present,Finance insurance and real estate,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,45,1995,0


In [103]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157509 entries, 0 to 157508
Data columns (total 42 columns):
 #   Column                                      Non-Null Count   Dtype 
---  ------                                      --------------   ----- 
 0   ID                                          157509 non-null  object
 1   age                                         157509 non-null  int64 
 2   class of worker                             157509 non-null  object
 3   industry code                               157509 non-null  int64 
 4   occupation code                             157509 non-null  int64 
 5   education                                   157509 non-null  object
 6   wage per hour                               157509 non-null  int64 
 7   enrolled in edu inst last wk                157509 non-null  object
 8   marital status                              157509 non-null  object
 9   major industry code                         157509 non-null  object
 10  major oc

# Missing value ayalysis

In [104]:
missing_values_info = train_df.isnull().sum() / len(train_df)

In [105]:
missing_values_info_df = pd.DataFrame()
missing_values_info_df['features'] = missing_values_info.index
missing_values_info_df['missing_values'] = missing_values_info.values

### Using plotly-express for interactive graphs. 
[https://plotly.com/python/plotly-express/](https://plotly.com/python/plotly-express/)

In [106]:
px.bar(x='missing_values', y='features', data_frame=missing_values_info_df, title='Missing values in %', color='features')

# Missing Values Imputation
* imputing with new category 'unknown'

In [107]:
selected_features = missing_values_info_df[missing_values_info_df['missing_values']>0]['features'].values

In [108]:
for col in selected_features:
    train_df[col] = train_df[col].fillna('unknown')
    test_df[col] = test_df[col].fillna('unknown')

# LabelEncoder
* Converting categorical data to numerical form. LabelEncoder encode target labels with value between 0 and n_classes-1.

* using select_dtypes('object') for retreving string columns. select_dtypes returns a subset of the DataFrame’s columns based on the column dtypes

* [Reference](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html) 

In [109]:
for col in train_df.select_dtypes('object'):
    if col != 'ID':
        le = LabelEncoder()
        train_df[col] = le.fit_transform(train_df[col])
        test_df[col] = le.transform(test_df[col])

# Prepare Train and Validation dataset

* Keeping 80% data for training and 20% for validation. 

* Using stratified approach to split the data. Stratified helps to keep distribution of target variable same for training and validation dataset. 

In [110]:
train_df = train_df[['ID', 
         'age', 
         #'class of worker', 'industry code', 'occupation code',
       'education', 
         #'wage per hour', 'enrolled in edu inst last wk',
       #'marital status', 'major industry code', 'major occupation code',
       'race', 
         #'hispanic origin', 
         'gender', 
#          'member of a labor union',
#        'reason for unemployment', 'full or part time employment stat',
#        'capital gains', 'capital losses', 'divdends from stocks',
#        'tax filer status', 'region of previous residence',
#        'state of previous residence', 'detailed household and family stat',
#        'detailed household summary in household',
#        'migration code-change in msa', 'migration code-change in reg',
#        'migration code-move within reg', 'live in this house 1 year ago',
#        'migration prev res in sunbelt', 'num persons worked for employer',
#        'family members under 18', 'country of birth father',
#        'country of birth mother', 'country of birth self', 
         'citizenship',
#        'own business or self employed',
#        'fill inc questionnaire for veteran\'s admin', 'veterans benefits',
#        'weeks worked in year', 'YEAR', 
         'TARGET'
                    ]].copy()

In [None]:
XTest = test_df[['ID', 
         'age', 
         #'class of worker', 'industry code', 'occupation code',
       'education', 
         #'wage per hour', 'enrolled in edu inst last wk',
       #'marital status', 'major industry code', 'major occupation code',
       'race', 
         #'hispanic origin', 
         'gender', 
#          'member of a labor union',
#        'reason for unemployment', 'full or part time employment stat',
#        'capital gains', 'capital losses', 'divdends from stocks',
#        'tax filer status', 'region of previous residence',
#        'state of previous residence', 'detailed household and family stat',
#        'detailed household summary in household',
#        'migration code-change in msa', 'migration code-change in reg',
#        'migration code-move within reg', 'live in this house 1 year ago',
#        'migration prev res in sunbelt', 'num persons worked for employer',
#        'family members under 18', 'country of birth father',
#        'country of birth mother', 'country of birth self', 
         'citizenship',
#        'own business or self employed',
#        'fill inc questionnaire for veteran\'s admin', 'veterans benefits',
#        'weeks worked in year', 'YEAR' 
                ]].copy()

In [111]:
X = train_df.drop(columns=['ID','TARGET'])
print(X.shape)
y = train_df.TARGET
print(y.shape)

(157509, 5)
(157509,)


In [112]:
X_train, X_valid, y_train, y_valid = train_test_split(X,y,
                                                        test_size=0.2, 
                                                        stratify=y, 
                                                        random_state=SEED)

In [113]:
X_train.shape, y_train.shape

((126007, 5), (126007,))

### target distribution in training 

In [114]:
y_train.value_counts()/len(y_train)

0    0.917592
1    0.082408
Name: TARGET, dtype: float64

In [115]:
X_valid.shape, y_valid.shape

((31502, 5), (31502,))

### target distribution in validation 

In [116]:
y_valid.value_counts()/len(y_valid)

0    0.917593
1    0.082407
Name: TARGET, dtype: float64

# Helper Function

In [117]:
def train_model(classifier, input_x, input_y):
    clf = classifier.fit(input_x, input_y)
    return clf

def evaluate_model(classifier, validation_x, validation_y, eval_metrics=fbeta_score):
    ypred = classifier.predict(validation_x)
    return ypred, eval_metrics(validation_y, ypred, beta=0.5)


# Comparing different ML algorithms

In [118]:
classifiers = {
    'logistic_regression' : LogisticRegression(solver='liblinear', random_state=SEED),
    'decision_tree' : DecisionTreeClassifier(random_state=SEED),
    'random_forest': RandomForestClassifier(random_state=SEED)
    }

In [119]:
trained_models = {}

for classifier_name, classifier in classifiers.items():
    
    model = train_model(classifier, X_train, y_train)
    ypred, validation_score = evaluate_model(model, X_valid, y_valid)
    trained_models[classifier_name] = {'model': model, 'f1_score': validation_score}

In [120]:
validation_results = defaultdict(list)
for k,v in trained_models.items():
    validation_results['classifier_name'].append(k)
    validation_results['f1_score'].append(v['f1_score'])
validation_results = pd.DataFrame(validation_results)

In [121]:
validation_results

Unnamed: 0,classifier_name,f1_score
0,logistic_regression,0.0
1,decision_tree,0.37669
2,random_forest,0.395511


In [122]:
px.bar(x='f1_score', y='classifier_name', data_frame=validation_results, color="classifier_name", title='Algorithm Performance Comparison')

# Inference :

* Piciking the best model for inference which is RandomForest

In [123]:
XTest = test_df[['ID', 
         'age', 
         #'class of worker', 'industry code', 'occupation code',
       'education', 
         #'wage per hour', 'enrolled in edu inst last wk',
       #'marital status', 'major industry code', 'major occupation code',
       'race', 
         #'hispanic origin', 
         'gender', 
#          'member of a labor union',
#        'reason for unemployment', 'full or part time employment stat',
#        'capital gains', 'capital losses', 'divdends from stocks',
#        'tax filer status', 'region of previous residence',
#        'state of previous residence', 'detailed household and family stat',
#        'detailed household summary in household',
#        'migration code-change in msa', 'migration code-change in reg',
#        'migration code-move within reg', 'live in this house 1 year ago',
#        'migration prev res in sunbelt', 'num persons worked for employer',
#        'family members under 18', 'country of birth father',
#        'country of birth mother', 'country of birth self', 
         'citizenship',
#        'own business or self employed',
#        'fill inc questionnaire for veteran\'s admin', 'veterans benefits',
#        'weeks worked in year', 'YEAR' 
                ]].copy()

In [124]:
XTest = test_df.drop(columns=['ID'])

In [125]:
ytest_pred = trained_models['random_forest']['model'].predict(XTest)


The feature names should match those that were passed during fit. Starting version 1.2, an error will be raised.
Feature names unseen at fit time:
- YEAR
- capital gains
- capital losses
- class of worker
- country of birth father
- ...
Feature names must be in the same order as they were in fit.




ValueError: X has 40 features, but RandomForestClassifier is expecting 5 features as input.

In [None]:
output = sample_sub.copy()
output['TARGET'] = ytest_pred
output.head()

In [None]:
output['TARGET'].value_counts()/len(output)

In [None]:
output.to_csv('./solution.csv', index=False)