In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

data = pd.read_csv("p1_data.csv")

enc_data = pd.DataFrame()

label_enc_col = ("department","salary")\

for column in data.columns:
    if column in label_enc_col:
      enc = LabelEncoder()
      enc_col = enc.fit_transform(data[column])
      enc_data[column]=enc_col
    elif column != "left":
      enc = OneHotEncoder()
      enc_col = enc.fit_transform(data[column].values.reshape(-1,1)).toarray()
      col_name = [f"{column}_{i}" for i in range(enc_col.shape[1])]
      enc_data = pd.concat([enc_data, pd.DataFrame(enc_col, columns=col_name)], axis=1)
    else :
      enc_data[column] = data[column]

enc_data.to_csv("encodeddata.csv",index=False )

In [3]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_predict, train_test_split, KFold 
from sklearn.metrics import precision_score,  confusion_matrix, recall_score, f1_score

shuffled_data = enc_data.iloc[np.random.permutation(len(enc_data))]

# Separate features and target variable
X = shuffled_data.drop("left", axis=1).values
Y = shuffled_data["left"].values

classifier=SGDClassifier(random_state = 42)

kf=KFold(n_splits=5, shuffle=True, random_state=42)

train_test_percentage = [(0.85,0.15),(0.75,0.25),(0.65,0.35)]

for train_percentage, test_percentage in train_test_percentage:
  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_percentage, random_state=42)

  Y_pred = cross_val_predict(classifier, X_test, Y_test, cv=kf)

  precision = precision_score(Y_test, Y_pred)

  recall = recall_score(Y_test, Y_pred)

  f1 = f1_score(Y_test, Y_pred)

  confusion_mat = confusion_matrix(Y_test, Y_pred)

  print("Training: ", train_percentage)
  print("Testing: ", test_percentage)
  print("Precision:", precision)
  print("Recall:", recall)
  print("F1 Score:", f1)
  print("Confusion Matrix:")
  print(confusion_mat)

Training:  0.85
Testing:  0.15
Precision: 0.8905109489051095
Recall: 0.7305389221556886
F1 Score: 0.8026315789473684
Confusion Matrix:
[[1475   30]
 [  90  244]]
Training:  0.75
Testing:  0.25
Precision: 0.8523206751054853
Recall: 0.7579737335834896
F1 Score: 0.8023833167825224
Confusion Matrix:
[[2462   70]
 [ 129  404]]
Training:  0.65
Testing:  0.35
Precision: 0.821656050955414
Recall: 0.8634538152610441
F1 Score: 0.8420365535248042
Confusion Matrix:
[[3404  140]
 [ 102  645]]
