In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

## EDA

In [3]:
df=pd.read_csv('../final-project/files/aug_train.csv')
df.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0


In [None]:
df.shape

In [None]:
df.info()

In [None]:
# This data set consists mainly of categorical columns. Some of these need to be transformed to numerical data
# The target variable is a categorical column, hence this will be a binary clustering problem. 

In [None]:
df.describe()

In [8]:
# There are a lot of missing values, which need to be filled
nulls = pd.DataFrame(df.isna().sum()*100/len(df), columns=['percentage'])
nulls.sort_values('percentage', ascending = False).head(80)

Unnamed: 0,percentage
last_new_job,2.207955
experience,0.339284
enrollee_id,0.0
city,0.0
city_development_index,0.0
gender,0.0
relevent_experience,0.0
enrolled_university,0.0
education_level,0.0
major_discipline,0.0


In [None]:
# There is a hudge inbalance in the target variable inside each column. 
# Most of candidates dont want to change their job after finishing the training
df.groupby('target').count()

## Data Cleaning

In [4]:
df.set_index('enrollee_id')

Unnamed: 0_level_0,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
enrollee_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
8949,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7386,city_173,0.878,Male,No relevent experience,no_enrollment,Graduate,Humanities,14,,,1,42,1.0
31398,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,,,4,52,1.0
24576,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,50-99,Pvt Ltd,4,44,0.0
5756,city_65,0.802,Male,Has relevent experience,no_enrollment,High School,,<1,500-999,Pvt Ltd,2,97,0.0


In [9]:
# Replacing NaNs with the mode of the columns
df['education_level'] = df['education_level'].fillna(df['education_level'].mode()[0])
df['company_type'] = df['company_type'].fillna(df['company_type'].mode()[0])
df['company_size'] = df['company_size'].fillna(df['company_size'].mode()[0])
df['gender'] = df['gender'].fillna(df['gender'].mode()[0])
df['major_discipline'] = df['major_discipline'].fillna(df['major_discipline'].mode()[0])
df['enrolled_university'] = df['enrolled_university'].fillna(df['enrolled_university'].mode()[0])
df['last_new_job'] = df['last_new_job'].fillna(df['last_new_job'].mode()[0])
df['experience'] = df['experience'].fillna(df['experience'].mode()[0])

In [10]:
# Removing the NaNs by filing them with a new category.
#df['relevent_experience'].unique() 

# Function to impute null value with new category
#def impute_nan_create_category(df,ColName):
     #df[ColName] = np.where(df[ColName].isnull(),"unknown",df[ColName])

# Call function to create new category for variables
#for Columns in ['education_level','company_type','company_size','gender','major_discipline','enrolled_university']:
    #impute_nan_create_category(df,Columns)

# Display result
#df[['education_level','company_type','company_size','gender','major_discipline','enrolled_university']].head(10)


In [11]:
# Removing the highest and lowest values of 'company_size' and 'last_new_job' 
df['last_new_job'].replace(['>4','never'],['4','0'],inplace=True)
df['last_new_job'].fillna(df['last_new_job'].value_counts().index[0],inplace=True)
df['last_new_job'] = [float(i) for i in df['last_new_job']]

df['experience'].replace(['>20','<1'],['20','1'],inplace=True)
df['experience'].fillna(df['experience'].value_counts().index[0],inplace=True)
df['experience'] = [float(i) for i in df['experience']]

In [12]:
df.isnull().values.any()

False

In [13]:
# Convert values of company_size to small, medium, large, very large
df['company_size'].replace(['<10','10/49', '50-99', '100-500', '500-999', '1000-4999', '5000-9999', '10000+',],
                             ['Small','Small','Small','Medium','Medium','Large','Large','Large'],inplace=True)

In [17]:
df['education_level'].unique()

array(['Graduate', 'Masters', 'High School', 'Phd', 'Primary School'],
      dtype=object)

In [24]:
education_level_ordinal=[]
for i in df['education_level']:
    if i =='Primary School':
        education_level_ordinal.append(0)
    elif i == 'High School':
        education_level_ordinal.append(1)
    elif i == 'Graduate':
        education_level_ordinal.append(2)
    elif i == 'Masters':
        education_level_ordinal.append(3)
    elif i == 'Phd':
        education_level_ordinal.append(4)
    else:
        education_level_ordinal.append(5)

df['education_level']=education_level_ordinal

df['education_level']

0        2
1        2
2        2
3        2
4        3
        ..
19153    2
19154    2
19155    2
19156    1
19157    0
Name: education_level, Length: 19158, dtype: int64

In [26]:
df['company_size'].unique()

array(['Small', 'Large', 'Medium'], dtype=object)

In [28]:
company_size_ordinal=[]
for i in df['company_size']:
    if i =='Small':
        company_size_ordinal.append(0)
    elif i == 'Medium':
        company_size_ordinal.append(1)
    elif i == 'Large':
        company_size_ordinal.append(3)
    else:
        company_size_ordinal.append(4)

df['company_size']=company_size_ordinal

df['company_size']

# Encoding

#Ordinal to education_level
#Ordinal to company_size
#Ordinal to 
#Ordinal

0        0
1        0
2        0
3        0
4        0
        ..
19153    0
19154    0
19155    0
19156    1
19157    0
Name: company_size, Length: 19158, dtype: int64

In [29]:
model_df = df.copy()

In [30]:
X = df.drop('target', axis=1)
y = df['target']

In [34]:
# Dealing with numerical and categorical data
X_num = X.select_dtypes(include = np.number)
X_cat = X.select_dtypes(include = np.object)
print (X.shape, X_num.shape, X_cat.shape)

(19158, 13) (19158, 7) (19158, 6)


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  X_cat = X.select_dtypes(include = np.object)


In [35]:
# One Hot/Label Encoding (categorical)
encoder = OneHotEncoder(handle_unknown='error', drop='first')
encoder.fit(X_cat)

OneHotEncoder(drop='first')

In [37]:
encoded = encoder.transform(X_cat).toarray()
encoded.shape

(19158, 137)

In [38]:
# Concat DataFrames
X = np.concatenate([X_num, encoded], axis=1)
X.shape

(19158, 144)


## Training the Benchmark Model/Logistic regression

In [39]:
#Splitting the data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [40]:
# using standard scaler
from sklearn.preprocessing import StandardScaler

transformer = StandardScaler() # StandardScaler and MinMax scaler works row wise,
# so make sure you do the train-test split first (optional)
# (train-test split comes first, 
#or else your trainning set will carry information from your test set)
transformer.fit(X_train)

x_standardized = transformer.transform(X_train)
x_standardized.shape
#pd.DataFrame(x_standardized)



(15326, 144)

In [41]:
x_test = transformer.transform(X_test)

In [42]:
classification = LogisticRegression(random_state=42, max_iter=500) # max_iter
classification.fit(X_train, y_train)

LogisticRegression(max_iter=500, random_state=42)

In [43]:
predictions = classification.predict(x_test)
confusion_matrix(y_test, predictions)

array([[2294,  586],
       [ 372,  580]])

In [44]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         0.0       0.86      0.80      0.83      2880
         1.0       0.50      0.61      0.55       952

    accuracy                           0.75      3832
   macro avg       0.68      0.70      0.69      3832
weighted avg       0.77      0.75      0.76      3832



## ???

In [None]:
# Downsampling - TomekLinks
from imblearn.under_sampling import TomekLinks

# watch out not to inflate your metrics, ideally:
# you do the train-test split first and fit_resample only on the training set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

tl = TomekLinks('majority')

X_tl, y_tl = tl.fit_resample(np.array(X_train), y_train)

y_tl.value_counts()

In [None]:
from imblearn.over_sampling import SMOTE

smote = SMOTE()

X_sm, y_sm = smote.fit_resample(np.array(X_train), y_train)
y_sm.value_counts()

In [None]:
# Can i deal with unbalanced data after encoding?

## Feature Engineering

In [None]:
from scipy import stats

def boxcox_transform(df):
    numeric_cols = df.select_dtypes(np.number).columns
    _ci = {column: None for column in numeric_cols}
    for column in numeric_cols:
        # since i know any columns should take negative numbers, to avoid -inf in df
        df[column] = np.where(df[column]<=0, np.NAN, df[column]) 
        df[column] = df[column].fillna(df[column].mean())
        transformed_data, ci = stats.boxcox(df[column])
        df[column] = transformed_data
        _ci[column] = [ci] 
    return df, _ci

df, _ci = boxcox_transform(data)
df

In [None]:
"""
tl = TomekLinks('majority')
X_tl, y_tl = tl.fit_resample(np.array(X_train), y_train)
print('1st TkLinks:')
print(y_tl.value_counts())

smote = SMOTE()
X_sm, y_sm = smote.fit_resample(X_tl, y_tl)
print('SMOTE:')
print(y_sm.value_counts())

tl = TomekLinks('all')
X_tl, y_tl = tl.fit_resample(X_sm, y_sm)
print('2nd TkLinks:')
print(y_tl.value_counts())
"""

## Classification Models

In [None]:
from imblearn.under_sampling import TomekLinks

X_train, X_test, y_train, y_test = train_test_split(X, y_class, test_size=0.3, random_state=42)

tl = TomekLinks('majority')

X_tl, y_tl = tl.fit_resample(np.array(X_train), y_train)

y_tl.value_counts()

In [None]:
# Logistic Regression

In [None]:
# KNN

In [None]:
# Random Forest

In [None]:
# Decision Tree