# Group B15 - Kaggle Student Alcohol consumption

## Predicting students' final grades

 Otto Bruno Koobakene, Liivika Koobakene


## Imports

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error

## Processing data

Merging the Portuguese and Math students' data

In [2]:
# Read the Math and Portuguese course files and stack them row-wise into a
# single frame; ignore_index=True gives the result a fresh 0..n-1 index.
# NOTE(review): rows are simply stacked, so a student present in both files
# would appear twice - confirm whether that is intended.
df = pd.concat(
    [pd.read_csv(csv_name) for csv_name in ('student-mat.csv', 'student-por.csv')],
    ignore_index=True,
)


Adding a new column with the mean of the `Dalc` and `Walc` alcohol consumption scores

In [3]:
# Unweighted mean of the two alcohol columns (Dalc and Walc), stored as a new feature.
df['weeklyAlcohol'] = df['Dalc'].add(df['Walc']).div(2)

Deleting unnecessary columns

In [4]:
# Remove every column that is not used further on, then give the target
# column G3 a descriptive name. Dalc and Walc are dropped here because they
# have already been combined into weeklyAlcohol.
df = (
    df.drop(columns=[
        'school', 'sex', 'age', 'address', 'famsize', 'Pstatus',
        'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian',
        'traveltime', 'studytime', 'failures', 'schoolsup',
        'activities', 'nursery', 'higher', 'Dalc', 'Walc',
        'internet', 'romantic', 'freetime', 'goout', 'health',
        'G1', 'G2',
    ])
    .rename(columns={'G3': 'finalgrade'})
)

Generating indicator columns for the "yes or no" questions (`famsup` and `paid`)

In [5]:
# Replace each yes/no column with a pair of indicator columns appended at the
# end of the frame: 'no_<name>' for the "no" answers and '<name>' for "yes".
for question in ('famsup', 'paid'):
    indicators = pd.get_dummies(df[question]).rename(
        columns={'no': 'no_' + question, 'yes': question})
    df = pd.concat([df.drop(columns=question), indicators], axis=1)

Generating new merged csv file

In [6]:
# Persist the processed frame next to the notebook. The row index is written
# as the first (unnamed) column because the default index=True is used.
df.to_csv(path_or_buf='merged.csv')
# Show the processed frame as the cell output.
df

Unnamed: 0,famrel,absences,finalgrade,weeklyAlcohol,no_famsup,famsup,no_paid,paid
0,4,6,6,1.0,1,0,1,0
1,5,4,6,1.0,0,1,1,0
2,4,10,10,2.5,1,0,0,1
3,3,2,15,1.0,0,1,0,1
4,4,4,10,1.5,0,1,0,1
...,...,...,...,...,...,...,...,...
1039,5,4,10,1.5,1,0,1,0
1040,4,4,16,1.0,0,1,1,0
1041,1,6,9,1.0,1,0,1,0
1042,2,6,10,3.5,1,0,1,0


## Learning models

Training and test sets

In [7]:
# y is the target (final grade); X holds the predictors, i.e. every column
# except the target, weeklyAlcohol and famrel.
y = df['finalgrade']
X = df.drop(columns=['finalgrade', 'weeklyAlcohol', 'famrel'])

# Hold out 30% of the rows for testing; random_state is fixed so the split is
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X.values,
    y.values,
    test_size=0.3,
    random_state=0,
)

##### Lasso regression

In [8]:
# Lasso regression with scikit-learn's default settings.
lasso_reg = Lasso().fit(X_train, y_train)
lasso_pred = lasso_reg.predict(X_test)
# Raw predictions, printed before clipping.
# NOTE(review): the recorded output is one constant value for every test row,
# which suggests the default regularisation strength shrinks every coefficient
# to zero - consider tuning alpha and/or scaling the features.
print(lasso_pred)
# Bound predictions to the 0-20 grade range, as the linear regression and
# nearest-neighbours cells do, so that the three MAE values are comparable.
lasso_pred = lasso_pred.clip(0, 20)
lasso_MAE = mean_absolute_error(y_test, lasso_pred)
print("Mean absolute error for Lasso regression: " , round(lasso_MAE,5))

[11.29315068 11.29315068 11.29315068 11.29315068 11.29315068 11.29315068
 11.29315068 11.29315068 11.29315068 11.29315068 11.29315068 11.29315068
 11.29315068 11.29315068 11.29315068 11.29315068 11.29315068 11.29315068
 11.29315068 11.29315068 11.29315068 11.29315068 11.29315068 11.29315068
 11.29315068 11.29315068 11.29315068 11.29315068 11.29315068 11.29315068
 11.29315068 11.29315068 11.29315068 11.29315068 11.29315068 11.29315068
 11.29315068 11.29315068 11.29315068 11.29315068 11.29315068 11.29315068
 11.29315068 11.29315068 11.29315068 11.29315068 11.29315068 11.29315068
 11.29315068 11.29315068 11.29315068 11.29315068 11.29315068 11.29315068
 11.29315068 11.29315068 11.29315068 11.29315068 11.29315068 11.29315068
 11.29315068 11.29315068 11.29315068 11.29315068 11.29315068 11.29315068
 11.29315068 11.29315068 11.29315068 11.29315068 11.29315068 11.29315068
 11.29315068 11.29315068 11.29315068 11.29315068 11.29315068 11.29315068
 11.29315068 11.29315068 11.29315068 11.29315068 11

In [9]:
# Ordinary least-squares linear regression.
linear = LinearRegression()
linear.fit(X_train, y_train)
pred_linear = linear.predict(X_test)
# Raw predictions are printed before they are clipped.
print(pred_linear)
# Bound predictions to the 0-20 grade range before scoring.
pred_linear = pred_linear.clip(0, 20)

mae_linear = mean_absolute_error(y_test, pred_linear)
print ("Linear regression mean absolute error (MAE):", round(mae_linear,4))

[11.21878682 11.46724013 11.50279138 10.91801981 10.90024419 11.520567
 11.48501576 10.87358076 10.70511774 11.36099179 11.28948392 11.35210398
 11.26322587 11.36099179 11.46724013 10.68734212 11.41391327 11.46724013
 10.90024419 11.48501576 11.520567   11.50279138 10.91801981 11.36099179
 11.48501576 11.48501576 11.36099179 10.72289336 11.36099179 11.36099179
 11.32544055 11.50279138 11.50279138 11.23656244 11.34321617 10.90024419
 11.30766493 11.42280108 11.37836203 11.34321617 11.32544055 11.520567
 10.65179088 11.36099179 11.23656244 11.44946451 10.81136609 11.50279138
 11.28988931 11.29877712 11.520567   11.36099179 11.21878682 11.43168889
 11.48501576 11.27211368 11.520567   11.48501576 11.43168889 11.38724984
 10.68693674 10.68734212 11.520567   11.520567   11.520567   11.32544055
 11.46724013 10.83802952 11.520567   11.51167919 11.32544055 10.84691733
 11.520567   10.86469295 11.36099179 10.86469295 10.7584446  10.68734212
 11.520567   11.28988931 11.520567   11.32544055 11.325

In [12]:
# k-nearest-neighbours regression with scikit-learn's default settings.
neighbors = KNeighborsRegressor()
neighbors.fit(X_train, y_train)
# Predict and bound the result to the 0-20 grade range in one step; here the
# clipped values are the ones printed.
pred_neighbors = neighbors.predict(X_test).clip(0, 20)
print(pred_neighbors)
mae_neighbors = mean_absolute_error(y_test, pred_neighbors)
print ("Nearest neighbors mean absolute error (MAE):", round(mae_neighbors,4))

[10.8 11.6 13.   7.2 12.6 11.6 10.8 11.  13.   9.2 11.6 12.6 12.2  9.2
 11.6 13.2 10.8 11.6 12.6 10.8 11.6 13.   7.2  9.2 10.8 10.8  9.2 12.4
  9.2  9.2 11.6 13.  13.   9.2 12.  12.6 11.4 11.  10.8 12.  11.6 11.6
 11.   9.2  9.2 11.8 11.6 13.  10.8 14.4 11.6  9.2 10.8 12.  10.8 13.4
 11.6 10.8 12.  11.6 12.6 13.2 11.6 11.6 11.6 11.6 11.6 10.8 11.6 12.2
 11.6 11.4 11.6 12.6  9.2 12.6  7.  13.2 11.6 10.8 11.6 11.6 11.6 11.8
 13.  11.6 11.6 12.  12.6  7.  10.8 12.  12.  10.8 11.6 10.8 11.6 11.6
 11.6 11.6 13.  11.6 12.4 11.8 11.6  9.2 11.6 10.6  9.2 10.8 11.8 11.6
 11.6 11.6 10.8 10.8 11.8 11.6 11.6  9.2 11.8 12.6 12.  11.6 12.  10.8
 11.2 11.4 12.6 11.6 11.  10.8 11.6  9.2  9.2 12.8 11.   9.2 11.6 10.8
 11.  12.4 13.4 11.8 13.  13.8 10.8 13.  10.8 10.8 11.6  7.2  7.2 11.6
 11.6  9.6  8.8 13.   9.2  9.2  7.2 10.8 13.8 11.6 11.6 13.   7.2  9.2
 13.   7.2 11.6 12.6 11.6 10.8  9.2 11.8 13.  11.6 11.6 13.  13.  13.
 13.   7.2 14.4  7.2 11.6 12.  10.8 11.6 11.4  9.2 12.6  9.2  9.2  9.4
 12.  1