## TOC:
* [Imports](#1st)
* [Config](#2nd)
* [Data Import](#3rd)
* [2nd visualization](#4th)
* [Train Test Split](#5th)
* [Classifiers](#6th)
* [Results](#7th)


## Imports <a class="anchor" id="1st"></a>

In [131]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import sklearn
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.metrics import accuracy_score,f1_score
from sklearn.svm import SVC
from bayes_opt import BayesianOptimization
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn import model_selection
from sklearn.utils import resample

## Config <a class="anchor" id="2nd"></a>

In [160]:
TRAIN_TEST_SPLIT = True

KNN_CLASSIFIER = True

GP_CLASSIFIER = True

RF_CLASSIFIER = True

SVM_CLASSIFIER = True

MLP_CLASSIFIER = True

OPTIMIZATION = False

## Data Import <a class="anchor" id="3rd"></a>

In [81]:
# Data Read
df_train = pd.read_csv('../../data/processed/employment_train_processed.csv')
df_test = pd.read_csv('../../data/processed/employment_test_processed.csv')

# Check Columns for preprocessing 
print(df_test.columns)
print(df_train.columns)

Index(['Unnamed: 0', 'age', 'race', 'earnwke', 'married', 'union', 'ne_states',
       'so_states', 'ce_states', 'we_states', 'government', 'private', 'self',
       'educ_lths', 'educ_hs', 'educ_somecol', 'educ_aa', 'educ_bac',
       'educ_adv', 'female'],
      dtype='object')
Index(['Id', 'age', 'race', 'earnwke', 'employed', 'unemployed', 'married',
       'union', 'ne_states', 'so_states', 'ce_states', 'we_states',
       'government', 'private', 'self', 'educ_lths', 'educ_hs', 'educ_somecol',
       'educ_aa', 'educ_bac', 'educ_adv', 'female'],
      dtype='object')


In [101]:
#DEBUG print(len(df_train))  

9276


## 2nd visual <a class="anchor" id="4th"></a>

## Partial Data Preprocesssing <a class="anchor" id="5th"></a>

In [109]:
train = df_train.values

# Shuffle for later crossval
idx = np.arange(train.shape[0])
np.random.seed(7)
np.random.shuffle(idx)
train = train[idx]

df_train = pd.DataFrame(train, columns = df_train.columns)

In [110]:
continuous_cols = ['age', 'race', 'earnwke']
categorical_cols = ['employed', 'unemployed', 'married', 'union', 'ne_states', 'so_states', 'ce_states', 'we_states', 'government', 'private', 'self', 'educ_lths', 'educ_hs',  'educ_somecol', 'educ_aa', 'educ_bac', 'educ_adv', 'female']

snv = sklearn.preprocessing.StandardScaler()

df_train[continuous_cols] = snv.fit_transform(df_train[continuous_cols])
df_test[continuous_cols] = snv.transform(df_test[continuous_cols])

## Train Test Split <a class="anchor" id="6th"></a>

In [111]:
train = df_train.values
train_ids = train[:, 0]
train_employed = train[:, 4]
train_unemployed = train[:, 5]
train_data = train[:,np.array([not i == 4 and  not i == 5  for i in range(1,23)])]
test = df_test.values 

In [112]:
if TRAIN_TEST_SPLIT:
    temp_train_ids, temp_test_ids, temp_train_data, temp_test_data, temp_train_employed, temp_test_employed = train_test_split(train_ids, train_data, train_employed, test_size=200, random_state=7)
else: temp_train_ids, temp_train_data, temp_test_employed = train_ids, train_data, train_employed

## Classifiers <a class="anchor" id="7th"></a>

In [132]:
if KNN_CLASSIFIER:
    knn_classifier = KNeighborsClassifier()
    knn_classifier.fit(temp_train_data, temp_train_employed.astype(bool))

In [138]:
if KNN_CLASSIFIER:
    temp_test_employed_pred = knn_classifier.predict(temp_test_data)

    accuracy = model_selection.cross_val_score(knn_classifier, X=train_data, y=train_employed, cv=5, scoring='accuracy')
    knn_min_acc = accuracy.min()
    print(accuracy)
    
    #print("Accuracy",sklearn.metrics.accuracy_score(temp_test_employed.astype(bool), temp_test_employed_pred))
    #print("F1-Score",sklearn.metrics.f1_score (temp_test_employed.astype(bool), temp_test_employed_pred))

[0.83351293 0.82264151 0.81940701 0.83557951 0.82371968]
0.8194070080862533


In [179]:
if GP_CLASSIFIER:
    gaussian_process_classifier = GaussianProcessClassifier()
    gaussian_process_classifier.fit(temp_train_data, temp_train_employed.astype(bool))

MemoryError: Unable to allocate 628. MiB for an array with shape (9076, 9076) and data type float64

In [None]:
if GP_CLASSIFIER:
    temp_test_employed_pred = gaussian_process_classifier.predict(temp_test_data)

    accuracy = model_selection.cross_val_score(gaussian_process_classifier, X=train_data, y=train_employed, cv=5, scoring='accuracy')
    gp_min_acc = accuracy.min()
    print(accuracy)    

    #print("Accuracy",sklearn.metrics.accuracy_score(temp_test_employed.astype(bool), temp_test_employed_pred))
    #print("F1-Score",sklearn.metrics.f1_score (temp_test_employed.astype(bool), temp_test_employed_pred))

In [161]:
if SVM_CLASSIFIER:
    svc = SVC()#kernel='linear'
    svc.fit(temp_train_data, temp_train_employed.astype(bool))

In [162]:
if SVM_CLASSIFIER:
    temp_test_employed_pred = svc.predict(temp_test_data)

    accuracy = model_selection.cross_val_score(svc, X=train_data, y=train_employed, cv=5, scoring='accuracy')
    svc_min_acc = accuracy.min()
    print(accuracy)
    
    #print("Accuracy",sklearn.metrics.accuracy_score(temp_test_employed.astype(bool), temp_test_employed_pred))
    #print("F1-Score",sklearn.metrics.f1_score (temp_test_employed.astype(bool), temp_test_employed_pred))

[0.54525862 0.53638814 0.50350404 0.52991914 0.52183288]


In [163]:
if RF_CLASSIFIER:
    random_forest_classifier = RandomForestClassifier(n_estimators=40, random_state=6)
    random_forest_classifier.fit(temp_train_data, temp_train_employed.astype(bool))

In [173]:
if RF_CLASSIFIER:
    temp_test_employed_pred = random_forest_classifier.predict(temp_test_data)

    accuracy = model_selection.cross_val_score(random_forest_classifier, X=train_data, y=train_employed, cv=5, scoring='accuracy')
    rf_min_acc = accuracy.min()
    print(accuracy)
    
    #print(sklearn.metrics.accuracy_score(temp_test_employed.astype(bool), temp_test_employed_pred))
    #print("F1-Score",sklearn.metrics.f1_score (temp_test_employed.astype(bool), temp_test_employed_pred))

[0.98006466 0.98274933 0.97951482 0.97681941 0.97789757]


In [174]:
if MLP_CLASSIFIER:
    mlp_classifier = MLPClassifier()
    mlp_classifier.fit(temp_train_data, temp_train_employed.astype(bool))    

In [175]:
if MLP_CLASSIFIER:
    temp_test_employed_pred = mlp_classifier.predict(temp_test_data)

    accuracy = model_selection.cross_val_score(mlp_classifier, X=train_data, y=train_employed, cv=5, scoring='accuracy')
    mlp_min_acc = accuracy.min()
    print(accuracy)

    #print("Accuracy",sklearn.metrics.accuracy_score(temp_test_employed.astype(bool), temp_test_employed_pred))
    #print("F1-Score",sklearn.metrics.f1_score (temp_test_employed.astype(bool), temp_test_employed_pred))

[0.72252155 0.53746631 0.68733154 0.68463612 0.55849057]


In [178]:
print("KNN:" , knn_min_acc)
print("GP:" , gp_min_acc)
print("SVM:" , svc_min_acc)
print("RF:" , rf_min_acc)
print("MLP:" , mlp_min_acc)



KNN: 0.537466307277628


NameError: name 'gp_min_acc' is not defined

## Hyperparameter Optimization

In [167]:
if SVM_CLASSIFIER and OPTIMIZATION:
    def hyperparameter_function_svc(gamma, C):
        """ Function for hyperparameter optimization
        """
        svc = SVC(gamma=gamma, C=C)
        svc.fit(temp_train_data, temp_train_employed.astype(bool))
        temp_test_employed_pred = svc.predict(temp_test_data)
        return sklearn.metrics.accuracy_score(temp_test_employed.astype(bool), temp_test_employed_pred)

    # Bounded region of parameter space
    pbounds = {'gamma': (0, 1.5), 'C': (0.00001, 2)}

    optimizer = BayesianOptimization(
        f=hyperparameter_function_svc,
        pbounds=pbounds,
        random_state=6,
    )

In [168]:
if SVM_CLASSIFIER and OPTIMIZATION:
    optimizer.maximize(
        init_points=50,
        n_iter=100,
    )

In [169]:
if SVM_CLASSIFIER and OPTIMIZATION:
    svc = sklearn.svm.SVC(gamma=2, C=0.3817)
    svc.fit(temp_train_data, temp_train_employed)
    temp_test_employed_pred = svc.predict(temp_test_data)

    print(sklearn.metrics.accuracy_score(temp_test_employed, temp_test_employed_pred))

In [170]:
if MLP_CLASSIFIER and OPTIMIZATION:
    def hyperparameter_function_mlp(lr, hl_w,hl_d ):
        """ Function for hyperparameter optimization
        """
                
        mlp_c = MLPClassifier(learning_rate_init=lr,hidden_layer_sizes=(hl_d,hl_w))
        print(temp_train_data)
        print(temp_train_employed)
        
        mlp_c.fit(temp_train_data, temp_train_employed.astype(bool))        
        temp_test_employed_pred = mlp_c.predict(temp_test_data)      
        return sklearn.metrics.accuracy_score(temp_test_employed.astype(bool), temp_test_employed_pred)


    # Bounded region of parameter space
    pbounds = {'lr': (0.1, 1), 'hl_w': (1, 15),'hl_d':(1, 15)}

    optimizer = BayesianOptimization(
        f=hyperparameter_function_mlp,
        pbounds=pbounds,
        random_state=6,
    )    

In [171]:
if MLP_CLASSIFIER and OPTIMIZATION:    
    optimizer.maximize(
        init_points=50,
        n_iter=100,
    )

In [172]:
if MLP_CLASSIFIER and OPTIMIZATION:
    if MLP_CLASSIFIER:    
    mlp_c = MLPClassifier(learning_rate_init=0.1,hidden_layer_sizes=(1,1) )
    mlp_c.fit(temp_train_data, temp_train_employed)
    temp_test_employed_pred = mlp_c.predict(temp_test_data)

    print(sklearn.metrics.accuracy_score(temp_test_employed, temp_test_employed_pred))

IndentationError: expected an indented block (3223136079.py, line 3)

## Data results <a class="anchor" id="8th"></a>

In [None]:
best_classifier = knn_classifier


print(len(temp_test_data))
print(len(temp_test_employed_pred))

final_pred = best_classifier.predict(test)

print(final_pred)

In [None]:
df_final = pd.read_csv('../../data/raw/teco-psda-exercisesheet1-employment-2024/employment_test_sample.csv',decimal=",")

print(df_final)

In [None]:
final = df_final.values
for i in range(0,len(final)):
    final[i,1] = final_pred[i]
print(final)

In [None]:
df_fine = pd.DataFrame(final, columns = df_final.columns)


df_fine.to_csv('../../data/predictions/employment_test_final.csv',index=False)