In [1]:
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split, cross_validate
from sklearn import preprocessing, metrics

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [4]:
import warnings
warnings.filterwarnings('ignore')

In [5]:

def bin_cat(df, cat):
    result = {}
    values = df[cat].unique()
    for val in values:
        new_cat = '{}_{}'.format(cat, val)
        result[new_cat] = (df[cat] == val).astype(int)
    return result

def bin_num1(df, cat):
    result = {}
    values = df[cat].unique()
    for val in values:
        if int(val) == 0:
            continue
        new_cat = '{}_{}'.format(cat, val)
        result[new_cat] = ((df[cat] >= 0) & (df[cat] < val)).astype(int)
    return result

def bin_num2(df, cat, ints):
    result = {}
    for i in ints:
        new_cat = '{}_{}-{}'.format(cat, i[0], i[1])
        result[new_cat] = ((df[cat] >= i[0]) & (df[cat] < i[1])).astype(int)
    return result

def bin_tgt(df, thresh):
    return {
        'class': (df[['G1', 'G2', 'G3']].mean(axis=1) / 20 >= thresh).astype(int),
    }

def get_dataset(datasetfname, sep=';', binarize=True, thresh=0.53, label_cat=True):
    data = {}
    
    df = pd.read_csv(datasetfname, sep=sep)
    
    cat = ['sex', 'address', 'famsize']
    num1 = ['Fedu', 'Medu', 'failures']
    num2 = {'absences': [[0, 1], [1, 2], [2, 3], [3, 6], [6, 999]]}
    
    if binarize:
        for c in cat:
            data.update(bin_cat(df, c))
        for c in num1:
            data.update(bin_num1(df, c))
        for c in num2:
            data.update(bin_num2(df, c, num2[c]))
    else:
        for c in cat:
            data.update({c: df[c]})
        for c in num1:
            data.update({c: df[c]})
        for c in num2:
            data.update({c: df[c]})
    data.update(bin_tgt(df, thresh))
    
    dataset = pd.DataFrame(data)
    
    if not binarize and label_cat:
        le = preprocessing.LabelEncoder()
        for c in cat:
            le.fit(dataset[c].unique())
            dataset[c] = le.transform(dataset[c])
    
    X = dataset[[d for d in dataset.head() if d != 'class']]
    y = dataset['class']
    
    return X, y

dataset = get_dataset('dataset/student-mat.csv', binarize=False, label_cat=True, thresh=0.8)

In [6]:
def get_cross_val_scores(clf_title, clf, X, y, cv=4):
    scoring = ['accuracy', 'precision', 'recall', 'f1']
    score = cross_validate(clf, X, y, cv=cv, scoring=scoring)
    print("""{} scores:
           accuracy: {:.3f}
           precision: {:.3f}
           recall: {:.3f}
           F1: {:.3f}""".format(
        clf_title,
        score['test_accuracy'].mean(), 
        score['test_precision'].mean(), 
        score['test_recall'].mean(), 
        score['test_f1'].mean())
    )

In [7]:
def cross_val_test(datasetfname, binarize=False, thresh=0.53):
    X, y = get_dataset(datasetfname, binarize=binarize, thresh=thresh)
    dcc_clf = DecisionTreeClassifier()
    get_cross_val_scores('Decision Tree', dcc_clf, X, y)
    print('')
    knn_clf = KNeighborsClassifier(n_neighbors=5, metric='euclidean')
    get_cross_val_scores('K Nearest Neighbors', knn_clf, X, y)
    print('')
    nb_clf = GaussianNB()
    get_cross_val_scores('Naive Bayes', nb_clf, X, y)
    print('')
    svc_clf = SVC(kernel='linear', C=1)
    get_cross_val_scores('SVC', svc_clf, X, y)

In [8]:
print('Cross validation test on non binarized data; thresh: 0.53\n')
cross_val_test('dataset/student-mat.csv', binarize=False)

Cross validation test on non binarized data; thresh: 0.53

Decision Tree scores:
           accuracy: 0.610
           precision: 0.646
           recall: 0.578
           F1: 0.605

K Nearest Neighbors scores:
           accuracy: 0.562
           precision: 0.577
           recall: 0.612
           F1: 0.589

Naive Bayes scores:
           accuracy: 0.638
           precision: 0.612
           recall: 0.844
           F1: 0.707

SVC scores:
           accuracy: 0.615
           precision: 0.593
           recall: 0.845
           F1: 0.695


In [9]:
print('Cross validation test on binarized data; thresh: 0.53\n')
cross_val_test('dataset/student-mat.csv')

Cross validation test on binarized data; thresh: 0.53

Decision Tree scores:
           accuracy: 0.593
           precision: 0.622
           recall: 0.588
           F1: 0.599

K Nearest Neighbors scores:
           accuracy: 0.562
           precision: 0.577
           recall: 0.612
           F1: 0.589

Naive Bayes scores:
           accuracy: 0.638
           precision: 0.612
           recall: 0.844
           F1: 0.707

SVC scores:
           accuracy: 0.615
           precision: 0.593
           recall: 0.845
           F1: 0.695


In [10]:
print('Cross validation test on non binarized data; thresh: 0.8\n')
cross_val_test('dataset/student-mat.csv', binarize=False, thresh=0.8)

Cross validation test on non binarized data; thresh: 0.8

Decision Tree scores:
           accuracy: 0.879
           precision: 0.000
           recall: 0.000
           F1: 0.000

K Nearest Neighbors scores:
           accuracy: 0.927
           precision: 0.000
           recall: 0.000
           F1: 0.000

Naive Bayes scores:
           accuracy: 0.689
           precision: 0.200
           recall: 0.402
           F1: 0.175

SVC scores:
           accuracy: 0.927
           precision: 0.000
           recall: 0.000
           F1: 0.000


In [11]:
print('Cross validation test on binarized data; thresh: 0.8\n')
cross_val_test('dataset/student-mat.csv', binarize=True, thresh=0.8)

Cross validation test on binarized data; thresh: 0.8

Decision Tree scores:
           accuracy: 0.896
           precision: 0.104
           recall: 0.071
           F1: 0.084

K Nearest Neighbors scores:
           accuracy: 0.927
           precision: 0.000
           recall: 0.000
           F1: 0.000

Naive Bayes scores:
           accuracy: 0.246
           precision: 0.084
           recall: 0.933
           F1: 0.154

SVC scores:
           accuracy: 0.927
           precision: 0.000
           recall: 0.000
           F1: 0.000


In [12]:
print('Cross validation test on non binarized data; thresh: 0.4\n')
cross_val_test('dataset/student-mat.csv', binarize=False, thresh=0.4)

Cross validation test on non binarized data; thresh: 0.4

Decision Tree scores:
           accuracy: 0.734
           precision: 0.852
           recall: 0.808
           F1: 0.827

K Nearest Neighbors scores:
           accuracy: 0.823
           precision: 0.844
           recall: 0.955
           F1: 0.895

Naive Bayes scores:
           accuracy: 0.805
           precision: 0.837
           recall: 0.939
           F1: 0.884

SVC scores:
           accuracy: 0.792
           precision: 0.803
           recall: 0.978
           F1: 0.882


In [13]:
print('Cross validation test on binarized data; thresh: 0.4\n')
cross_val_test('dataset/student-mat.csv', binarize=True, thresh=0.4)

Cross validation test on binarized data; thresh: 0.4

Decision Tree scores:
           accuracy: 0.757
           precision: 0.851
           recall: 0.843
           F1: 0.845

K Nearest Neighbors scores:
           accuracy: 0.805
           precision: 0.834
           recall: 0.942
           F1: 0.885

Naive Bayes scores:
           accuracy: 0.276
           precision: 0.989
           recall: 0.090
           F1: 0.148

SVC scores:
           accuracy: 0.810
           precision: 0.822
           recall: 0.971
           F1: 0.890
