# Predicting if students have a romantic interest with *k*-NN


In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import sklearn as sk
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split #We need this to split the data
from sklearn.neighbors import KNeighborsClassifier

## Data set

In [2]:
# Read the raw student records; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer="student-por.csv")

# Preview the first 30 rows to get a feel for the columns.
df.head(n=30)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13
5,GP,M,16,U,LE3,T,4,3,services,other,...,5,4,2,1,2,5,6,12,12,13
6,GP,M,16,U,LE3,T,2,2,other,other,...,4,4,4,1,1,3,0,13,12,13
7,GP,F,17,U,GT3,A,4,4,other,teacher,...,4,1,4,1,1,1,2,10,13,13
8,GP,M,15,U,LE3,A,3,2,services,other,...,4,2,2,1,1,1,0,15,16,17
9,GP,M,15,U,GT3,T,3,4,other,other,...,5,5,1,1,1,5,0,12,12,13


In [3]:
# Class balance of the target: how many students answered "no" vs. "yes".
df.loc[:, "romantic"].value_counts()

no     410
yes    239
Name: romantic, dtype: int64

The dependent variable "romantic" is stored as yes/no text, so I first have to turn it into a 0/1 dummy variable that the model can actually predict.

In [4]:
# Encode the yes/no "romantic" column as a 0/1 target named "hasromantic".
# Building df_subset with .assign() leaves df itself untouched, so re-running
# this cell cannot pile up duplicate "no"/"yes" dummy columns, and the explicit
# int cast keeps the target 0/1 even on pandas versions where get_dummies
# returns booleans.
df_subset = df.assign(hasromantic=(df["romantic"] == "yes").astype(int))
df_subset.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3,hasromantic
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,3,4,1,1,3,4,0,11,11,0
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,3,3,1,1,3,2,9,11,11,0
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,3,2,2,3,3,6,12,13,12,0
3,GP,F,15,U,GT3,T,4,2,health,services,...,2,2,1,1,5,0,14,14,14,1
4,GP,F,16,U,GT3,T,3,3,other,other,...,3,2,1,2,5,0,11,13,13,0


To see which variables have strong predictive power, I'm using corr to find the ones that are most useful:

In [5]:
# Pairwise correlations between the numeric columns and the target.
# Restricting to numeric/boolean columns first keeps the text columns
# (school, sex, Mjob, ...) from raising an error on pandas >= 2.0, where
# DataFrame.corr no longer drops non-numeric columns silently.
df_subset.select_dtypes(include=["number", "bool"]).corr()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3,hasromantic
age,1.0,-0.107832,-0.12105,0.03449,-0.008415,0.319968,-0.020559,-0.00491,0.112805,0.134768,0.086357,-0.00875,0.149998,-0.174322,-0.107119,-0.106505,0.17881
Medu,-0.107832,1.0,0.647477,-0.265079,0.097006,-0.17221,0.024421,-0.019686,0.009536,-0.007018,-0.019766,0.004614,-0.008577,0.260472,0.264035,0.240151,-0.030992
Fedu,-0.12105,0.647477,1.0,-0.208288,0.0504,-0.165915,0.020256,0.006841,0.02769,6.1e-05,0.038445,0.04491,0.029859,0.217501,0.225139,0.2118,-0.067675
traveltime,0.03449,-0.265079,-0.208288,1.0,-0.063154,0.09773,-0.009521,0.000937,0.057454,0.092824,0.057007,-0.048261,-0.008149,-0.15412,-0.154489,-0.127173,0.004751
studytime,-0.008415,0.097006,0.0504,-0.063154,1.0,-0.147441,-0.004127,-0.068829,-0.075442,-0.137585,-0.214925,-0.056433,-0.118389,0.260875,0.240498,0.249789,0.033036
failures,0.319968,-0.17221,-0.165915,0.09773,-0.147441,1.0,-0.062645,0.108995,0.045078,0.105949,0.082266,0.035588,0.122779,-0.38421,-0.385782,-0.393316,0.069901
famrel,-0.020559,0.024421,0.020256,-0.009521,-0.004127,-0.062645,1.0,0.129216,0.089707,-0.075767,-0.093511,0.109559,-0.089534,0.048795,0.089588,0.063361,-0.04492
freetime,-0.00491,-0.019686,0.006841,0.000937,-0.068829,0.108995,0.129216,1.0,0.346352,0.109904,0.120244,0.084526,-0.018716,-0.094497,-0.106678,-0.122705,0.027112
goout,0.112805,0.009536,0.02769,0.057454,-0.075442,0.045078,0.089707,0.346352,1.0,0.245126,0.38868,-0.015741,0.085374,-0.074053,-0.079469,-0.087641,-0.00052
Dalc,0.134768,-0.007018,6.1e-05,0.092824,-0.137585,0.105949,-0.075767,0.109904,0.245126,1.0,0.616561,0.059067,0.172952,-0.195171,-0.18948,-0.204719,0.062042


The variables with the highest correlation, and therefore the most useful, are: 
    • age
    • famrel
    • studytime
    • failures
    • freetime
    • Dalc
    • absences
    
I find G1 to G3 very confusing and don't see their relevance, so I won't be using those variables.

## Building the model


In [9]:
# Feature matrix: the seven predictors picked from the correlation table.
X = df_subset.loc[:, ["age", "famrel", "failures", "studytime", "freetime", "Dalc", "absences"]]

# Target vector: the 0/1 "hasromantic" column.
y = df_subset["hasromantic"]

# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

# Peek at the first 20 rows of the training set.
X_train.head(n=20)

Unnamed: 0,age,famrel,failures,studytime,freetime,Dalc,absences
358,18,3,0,1,2,1,8
74,16,4,0,2,3,2,4
640,18,5,1,1,4,4,0
423,16,4,0,1,3,1,11
61,16,5,0,1,5,5,0
201,16,4,0,2,2,1,0
274,17,4,0,2,3,1,10
76,15,3,0,4,4,1,0
600,17,4,0,1,2,3,4
360,18,4,0,2,1,1,8


## Model evaluation

In [11]:
# First attempt: a k-NN classifier with k=3, fitted on the training split.
# KNeighborsClassifier is imported in the imports cell at the top of the
# notebook; fit() returns the fitted estimator, so it can be chained.
# NOTE(review): the features are on different scales (e.g. absences vs.
# famrel), so the distance is dominated by the wider-ranging columns.
# Consider standardising them -- not changed here so the recorded results
# stay valid.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Mean accuracy on the held-out test split.
knn.score(X_test, y_test)

0.6358974358974359

The accuracy is 63.6%. To see whether that is better than a guess, we can compare it with the value counts from earlier, where 410 students were not in a relationship and 239 were. 

In [12]:
# Majority-class baseline: the accuracy of always predicting the most common
# answer. Computed from the data instead of retyping 410 and 239 by hand, so
# it stays correct if the data set changes.
# NOTE(review): this is the share over the whole data set. On the test split
# itself the majority class is 128 of 195 rows (about 0.656, see the support
# column of the classification report), which is the fairer yardstick for a
# test-set accuracy.
float(df["romantic"].value_counts(normalize=True).max())

0.6317411402157165

The model is slightly better than the baseline guess. A different number of neighbors might do better, though, so I'm going to try a range of values for k.

In [29]:
# Compare k = 1..10: fit a fresh classifier for each k and print the
# per-class precision/recall/F1 on the test split.
# KNeighborsClassifier and classification_report are imported in the imports
# cell at the top of the notebook.
# NOTE(review): k is being chosen by looking at test-set scores, so the final
# test accuracy is optimistic; a validation split or cross-validation on the
# training data would be cleaner.
for k in range(1, 11):
    knn_new = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)  # new model with k neighbors
    y_test_pred_new = knn_new.predict(X_test)  # predictions of that model on the test data
    print(f"With {k} neighbors the result is:")
    print(classification_report(y_test, y_test_pred_new))

With 1 neighbors the result is:
              precision    recall  f1-score   support

           0       0.67      0.72      0.69       128
           1       0.38      0.33      0.35        67

    accuracy                           0.58       195
   macro avg       0.53      0.52      0.52       195
weighted avg       0.57      0.58      0.58       195

With 2 neighbors the result is:
              precision    recall  f1-score   support

           0       0.65      0.88      0.75       128
           1       0.32      0.10      0.16        67

    accuracy                           0.62       195
   macro avg       0.49      0.49      0.45       195
weighted avg       0.54      0.62      0.55       195

With 3 neighbors the result is:
              precision    recall  f1-score   support

           0       0.70      0.77      0.74       128
           1       0.46      0.37      0.41        67

    accuracy                           0.64       195
   macro avg       0.58      0.5

In [25]:
# Refit with k=7 on the training split; fit() returns the fitted estimator,
# so construction and fitting are chained.
knn = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)

# Mean accuracy of the k=7 model on the test split.
knn.score(X_test, y_test)

0.6512820512820513

k = 7 gives a better model: it beats both the previous value of k (3) and the baseline guess, so I'm going with 7. 

In [26]:
# Confusion matrix for the current knn model on the test split
# (rows = actual class, columns = predicted class).
# confusion_matrix is imported in the imports cell at the top of the notebook.
y_test_pred = knn.predict(X_test)  # predicted labels for the test rows
cm = confusion_matrix(y_test, y_test_pred)
cm

array([[103,  25],
       [ 43,  24]], dtype=int64)

In [27]:
# Wrap the raw confusion matrix in a labelled DataFrame so rows (actual class)
# and columns (predicted class) are readable. The labels previously read
# "Romanitic"; the spelling is fixed here, and the column labels are derived
# from the row labels so the two can never drift apart.
class_labels = ["No Romantic Interest", "Romantic Interest"]
conf_matrix = pd.DataFrame(
    cm,
    index=class_labels,
    columns=[f"{label} (predicted)" for label in class_labels],
)
conf_matrix

Unnamed: 0,No Romanitic Interest (predicted),Romanitic Interest (predicted)
No Romanitic Interest,103,25
Romanitic Interest,43,24


We have already calculated the accuracy of the model so now all we need to do is calculate the precision and recall. 

### Precision:

This is the number of correctly predicted students who have a romantic interest, divided by the total number of predicted students who have a romantic interest. Remember: how "precise" am I in saying students have romantic interest?

I'm going to use the "Romantic Interest" class (the second row and column of the confusion matrix) to do the calculations. 

In [33]:
# Precision for the "romantic interest" class, read straight from the
# confusion matrix instead of retyping its numbers by hand:
# cm[1, 1] = correctly predicted "yes", cm[:, 1] = everything predicted "yes".
float(cm[1, 1] / cm[:, 1].sum())

0.4897959183673469

The precision is pretty bad, at only 49.0%.

### Recall:

This is the number of correctly predicted students who have a romantic interest, divided by the total number of students who have a romantic interest. Remember: how many students who have a romantic interest do I "recall"?

In [34]:
# Recall for the "romantic interest" class, read straight from the
# confusion matrix instead of retyping its numbers by hand:
# cm[1, 1] = correctly predicted "yes", cm[1, :] = every student who is "yes".
float(cm[1, 1] / cm[1, :].sum())

0.3582089552238806

The recall is pretty bad, at only 35.8%.