In [102]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

In [103]:
data_training = pd.read_csv("data/aug_train.csv")
data_testing = pd.read_csv("data/aug_test.csv")

In [104]:
data_training.head()

Unnamed: 0,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_type,last_new_job,training_hours,target
0,0.624,Male,No relevent experience,no_enrollment,High School,,5,,never,21,0
1,0.926,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,>4,12,0
2,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,Public Sector,>4,26,0
3,0.624,Male,No relevent experience,Full time course,High School,,1,,never,30,1
4,0.92,Female,Has relevent experience,no_enrollment,Masters,STEM,>20,,>4,46,0


In [105]:
def clean_columns(df):
    # Replace '>20' with 21 and '<1' with 1 in the experience column
    df['experience'] = df['experience'].replace({'>20': 21, '<1': 1})
    # Impute missing values with the median and convert to integer
    df['experience'] = df['experience'].fillna(df['experience'].median()).astype(int)
    
    # Replace '>4' with 5 and 'never' with 0 in the last_new_job column
    df['last_new_job'] = df['last_new_job'].replace({'>4': 5, 'never': 0})
    # Impute missing values with the median and convert to integer
    df['last_new_job'] = df['last_new_job'].fillna(df['last_new_job'].median()).astype(int)
    
    return df


In [106]:
# Apply the function to both training and testing data
data_training = clean_columns(data_training)
data_testing = clean_columns(data_testing)

# Display the first few rows to verify the changes
print(data_training[['experience', 'last_new_job']].head())
print(data_testing[['experience', 'last_new_job']].head())

   experience  last_new_job
0           5             0
1          21             5
2          21             5
3           1             0
4          21             5
   experience  last_new_job
0           3             1
1           5             1
2          10             2
3          10             0
4           3             1


In [107]:
# Handle missing values in categorical columns
categorical_columns = ['gender', 'relevent_experience', 'enrolled_university', 'education_level', 'major_discipline', 'company_type']
for col in categorical_columns:
    data_training[col].fillna('Unknown', inplace=True)
    data_testing[col].fillna('Unknown', inplace=True)

In [108]:
# Encode categorical variables
le = LabelEncoder()
for col in categorical_columns:
    data_training[col] = le.fit_transform(data_training[col])
    data_testing[col] = le.transform(data_testing[col])

In [109]:
# Split the training data into features and target
X = data_training.drop(columns=['target'])
y = data_training['target']

# Standardize numerical features
numerical_columns = ['city_development_index', 'experience', 'training_hours']
scaler = StandardScaler()
X[numerical_columns] = scaler.fit_transform(X[numerical_columns])


In [110]:
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [111]:
clf = RandomForestClassifier(n_estimators=1,max_features=7)

clf.fit(X_train, y_train)

# Predict on the training set
y_train_pred = clf.predict(X_train)

# Evaluate the model on the training set
train_accuracy = accuracy_score(y_train, y_train_pred)
train_precision = precision_score(y_train, y_train_pred)
train_recall = recall_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred)
train_confusion_matrix = confusion_matrix(y_train, y_train_pred)

print(f'Training Accuracy: {train_accuracy}')
print(f'Training Precision: {train_precision}')
print(f'Training Recall: {train_recall}')
print(f'Training F1-score: {train_f1}')
print('Training Confusion Matrix:')
print(train_confusion_matrix)

Training Accuracy: 0.8976190476190476
Training Precision: 0.7867298578199052
Training Recall: 0.8019323671497585
Training F1-score: 0.7942583732057417
Training Confusion Matrix:
[[1176   90]
 [  82  332]]


In [112]:
# Predict on the testing set
X_test = data_testing.drop(columns=['target']) 
y_test = data_testing['target']


# Make predictions
predictions = clf.predict(X_test)

# Calculate metrics
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.5700
Precision: 0.1111
Recall: 0.1364
F1-score: 0.1224
Confusion Matrix:
[[54 24]
 [19  3]]


#### Extra point

Think about what kind of the method can increase the performance

To potentially increase the performance of our classification model, we can consider several methods like 
Identify and select the most relevant features that contribute most to the prediction task.
Feature Transformation: Transform features using techniques like scaling (e.g., StandardScaler), normalization, or applying transformations like log or power transformations to better fit the assumptions of the model.