In [1]:
from sklearn import datasets, svm
from sklearn.model_selection import cross_val_score, KFold, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
import pandas as pd

In [2]:
def run_svm(data, label):
  clf = svm.SVC(kernel='linear')

  kf = KFold(n_splits=10, shuffle=True, random_state=42)

  svm_scores = cross_validate(clf, data, label, cv=kf, scoring=['accuracy','f1_weighted','recall_weighted'])

  print("SVM:")
  print("Accuracy: %0.4f (+/- %0.4f)" % (svm_scores['test_accuracy'].mean(), svm_scores['test_accuracy'].std() * 4))
  print("F1 Score: %0.4f (+/- %0.4f)" % (svm_scores['test_f1_weighted'].mean(), svm_scores['test_f1_weighted'].std() * 4))
  print("recall: %0.4f (+/- %0.4f)" % (svm_scores['test_recall_weighted'].mean(), svm_scores['test_recall_weighted'].std() * 4))

def run_rf(data, label):
  rf_classifier = RandomForestClassifier(n_estimators=100)

  kf = KFold(n_splits=10, shuffle=True, random_state=42)

  rf_scores = cross_validate(rf_classifier, data, label, cv=kf, scoring=['accuracy','f1_weighted','recall_weighted'])

  print("RF:")
  print("Accuracy: %0.4f (+/- %0.4f)" % (rf_scores['test_accuracy'].mean(), rf_scores['test_accuracy'].std() * 4))
  print("F1 Score: %0.4f (+/- %0.4f)" % (rf_scores['test_f1_weighted'].mean(), rf_scores['test_f1_weighted'].std() * 4))
  print("recall: %0.4f (+/- %0.4f)" % (rf_scores['test_recall_weighted'].mean(), rf_scores['test_recall_weighted'].std() * 4))

def run_nlp(data, label):
  mlp_classifier = MLPClassifier()

  kf = KFold(n_splits=10, shuffle=True, random_state=42)

  mlp_scores = cross_validate(mlp_classifier, data, label, cv=kf, scoring=['accuracy','f1_weighted','recall_weighted'])

  print("MLP:")
  print("Accuracy: %0.4f (+/- %0.4f)" % (mlp_scores['test_accuracy'].mean(), mlp_scores['test_accuracy'].std() * 4))
  print("F1 Score: %0.4f (+/- %0.4f)" % (mlp_scores['test_f1_weighted'].mean(), mlp_scores['test_f1_weighted'].std() * 4))
  print("recall: %0.4f (+/- %0.4f)" % (mlp_scores['test_recall_weighted'].mean(), mlp_scores['test_recall_weighted'].std() * 4))

def run(data, label):
  run_svm(data, label)
  print("\n")
  run_rf(data, label)
  print("\n")
  run_nlp(data, label)

In [3]:
def sort_score(df1, df2):
  df1 = df1.sort_values(by=['reading_score'])
  df1['reading_score'][:333] = 1
  df1['reading_score'][333:667] = 2
  df1['reading_score'][667:] = 3

  df1 = df1.sort_values(by=['writing_score'])
  df1['writing_score'][:333] = 1
  df1['writing_score'][333:667] = 2
  df1['writing_score'][667:] = 3

  df2 = df2.sort_values(by=['reading_score'])
  df2['reading_score'][:333] = 1
  df2['reading_score'][333:667] = 2
  df2['reading_score'][667:] = 3

  df2 = df2.sort_values(by=['writing_score'])
  df2['writing_score'][:333] = 1
  df2['writing_score'][333:667] = 2
  df2['writing_score'][667:] = 3
  return df1, df2

In [6]:
# drop test preparation course
def drop_test(path):
  df = pd.read_csv(path).drop("math_score",axis=1).drop("test_preparation_course", axis=1)

  df = pd.get_dummies(df,columns=["race_ethnicity"], prefix='', prefix_sep='')
  cols = ["group A", "group B", "group C", "group D", "group E", "other"]
  for col in cols:
    if col not in df.columns.tolist():
      df[col] = 0
  df.rename(columns={"group A": "group_A", "group B": "group_B", "group C": "group_C", "group D": "group_D", "group E": "group_E"})

  df["parental_level_of_education"] = df["parental_level_of_education"].map({"bachelor's and master's degree": 4, "master's degree": 4, "bachelor's degree": 4, "associate's degree": 3, "some college": 2, "before college": 1, "high school": 1, "some high school": 1})
  df = pd.get_dummies(df, columns=["gender"], prefix='', prefix_sep='')
  df["lunch"] = df["lunch"].map({"standard": 1, "free/reduced": 0})

  return df

def test():
  df1, df2 = sort_score(drop_test("StudentsPerformance.csv"), drop_test("k9_mark_test_preparation_course.csv"))
  print("df1:")
  run(df1.drop("writing_score",axis=1), df1["writing_score"])
  print("\ndf2:")
  run(df2.drop("writing_score",axis=1), df2["writing_score"])

In [9]:
test()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['reading_score'][:333] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['reading_score'][333:667] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['reading_score'][667:] = 3
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['writing_score'][:333] = 1
A value is trying to be set on a copy of 

df1:
SVM:
Accuracy: 0.8150 (+/- 0.1190)
F1 Score: 0.8152 (+/- 0.1172)
recall: 0.8150 (+/- 0.1190)


RF:
Accuracy: 0.7760 (+/- 0.1189)
F1 Score: 0.7766 (+/- 0.1233)
recall: 0.7760 (+/- 0.1189)






MLP:
Accuracy: 0.8130 (+/- 0.1303)
F1 Score: 0.8139 (+/- 0.1293)
recall: 0.8130 (+/- 0.1303)

df2:
SVM:
Accuracy: 0.8150 (+/- 0.1190)
F1 Score: 0.8152 (+/- 0.1172)
recall: 0.8150 (+/- 0.1190)


RF:
Accuracy: 0.7870 (+/- 0.1902)
F1 Score: 0.7869 (+/- 0.1913)
recall: 0.7870 (+/- 0.1902)






MLP:
Accuracy: 0.8130 (+/- 0.1118)
F1 Score: 0.8135 (+/- 0.1109)
recall: 0.8130 (+/- 0.1118)


