In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.svm import LinearSVC
import pickle

In [21]:
def read_data():
    data = pd.read_csv('student-mat.csv', delimiter=';')
    print(data['famsup'])
    return data

In [3]:
def split_data(X, Y):
    return train_test_split(X, Y, test_size=0.2, random_state=42)

In [4]:
def confuse(y_t, y_p):
    confuse_matrix = confusion_matrix(y_t, y_p)
    fpr(confuse_matrix)
    ffr(confuse_matrix)

In [5]:
def fpr(confuse_matrix):
    fp = confuse_matrix[0][1]
    tf = confuse_matrix[0][0]
    rate = float(fp) / (fp + tf)
    print("False pass Rate :", rate)
    
def ffr(confuse_matrix):
    ff = confuse_matrix[1][0]
    tp = confuse_matrix[1][1]
    rate = float(ff) / (ff + tp)
    print("False fail Rate :", rate)

In [6]:
def train_and_score(X, y, name):
    X_train, X_test, y_train, y_test = split_data(X, y)
    
    clf = Pipeline([
        ('reduce_dim', SelectKBest(chi2, k=2)),
        ('train', LinearSVC(C=100)),
    ])
    
    scores = cross_val_score(clf, X_train, y_train, cv=5, n_jobs=2)
    print("Mean Model Accuracy", np.array(scores).mean())
    
    clf.fit(X_train, y_train)
    
    confuse(y_test, clf.predict(X_test))
    
    filename = "model" + name + ".pkl"
    with open(filename,"wb") as f:
        pickle.dump(clf,f)
    
    print()

In [22]:
def main():
    print("Student Performance Predictions")
    
    label_encoder = LabelEncoder()
    data = read_data()
    print(data.columns)
    final_data = data.drop(['school', 'nursery', 'guardian', 'reason', 'schoolsup','address'], axis=1)
    for column in final_data[['sex', 'famsize', 'Pstatus', 'Mjob', 
                        'Fjob', 'famsup', 'paid', 'activities', 'higher', 'internet', 'romantic']]:
        final_data[column] = label_encoder.fit_transform(final_data[column].values)
    print(final_data['famsup'])
    with open('label_encoder.pkl','wb') as f:
        pickle.dump(label_encoder, f)
        
    for i, row in final_data.iterrows():
        if row["G1"] >= 10:
            final_data["G1"][i] = 1
        else:
            final_data["G1"][i] = 0

        if row["G2"] >= 10:
            final_data["G2"][i] = 1
        else:
            final_data["G2"][i] = 0

        if row["G3"] >= 10:
            final_data["G3"][i] = 1
        else:
            final_data["G3"][i] = 0
            
    y = final_data.pop("G3")
    X = final_data
    
    print("\n\nModel Accuracy Knowing G1 & G2 Scores")
    print("=====================================")
    mod_1 = train_and_score(X, y, "G1G2")

    # Remove grade report 2
    X.drop(["G2"], axis = 1, inplace=True)
    print("\n\nModel Accuracy Knowing Only G1 Score")
    print("=====================================")
    train_and_score(X, y, "G1")

    # Remove grade report 1
    X.drop(["G1"], axis=1, inplace=True)
    print("\n\nModel Accuracy Without Knowing Scores")
    print("=====================================")
    train_and_score(X, y, "N")



main()
        

Student Performance Predictions
0       no
1      yes
2       no
3      yes
4      yes
      ... 
390    yes
391     no
392     no
393     no
394     no
Name: famsup, Length: 395, dtype: object
Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
       'Walc', 'health', 'absences', 'G1', 'G2', 'G3'],
      dtype='object')
0      0
1      1
2      0
3      1
4      1
      ..
390    1
391    0
392    0
393    0
394    0
Name: famsup, Length: 395, dtype: int32


Model Accuracy Knowing G1 & G2 Scores


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#r

Mean Model Accuracy 0.9207341269841269
False pass Rate : 0.037037037037037035
False fail Rate : 0.11538461538461539



Model Accuracy Knowing Only G1 Score
Mean Model Accuracy 0.8354662698412699
False pass Rate : 0.07407407407407407
False fail Rate : 0.19230769230769232



Model Accuracy Without Knowing Scores




Mean Model Accuracy 0.6199900793650793
False pass Rate : 0.2222222222222222
False fail Rate : 0.6730769230769231





In [23]:
with open("modelN.pkl","rb") as f:
    model = pickle.load(f)
    print(model)

Pipeline(memory=None,
         steps=[('reduce_dim',
                 SelectKBest(k=2,
                             score_func=<function chi2 at 0x0000019960787288>)),
                ('train',
                 LinearSVC(C=100, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
                           loss='squared_hinge', max_iter=1000,
                           multi_class='ovr', penalty='l2', random_state=None,
                           tol=0.0001, verbose=0))],
         verbose=False)


In [24]:
list1 = [ 0, 21,  0,  1,  4,  4,  4,  5,  4,  4,  2,  1,  1,  0,  1,  1,  1,  4,  3,  2,  1,  1,  4,  5]
cat_val = np.array(list1).reshape(1,-1)
model.predict(cat_val)


array([0], dtype=int64)