# Data Science and Machine Learning Capstone Project Notebook

In [38]:
# Import Statements
import numpy as np
import pandas as pd

from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score
from sklearn.metrics import accuracy_score

In [2]:
# Data Description
with open('student.txt') as f:
    print(f.read())

# Attributes for both student-mat.csv (Math course) and student-por.csv (Portuguese language course) datasets:
1 school - student's school (binary: "GP" - Gabriel Pereira or "MS" - Mousinho da Silveira)
2 sex - student's sex (binary: "F" - female or "M" - male)
3 age - student's age (numeric: from 15 to 22)
4 address - student's home address type (binary: "U" - urban or "R" - rural)
5 famsize - family size (binary: "LE3" - less or equal to 3 or "GT3" - greater than 3)
6 Pstatus - parent's cohabitation status (binary: "T" - living together or "A" - apart)
7 Medu - mother's education (numeric: 0 - none,  1 - primary education (4th grade), 2 â€“ 5th to 9th grade, 3 â€“ secondary education or 4 â€“ higher education)
8 Fedu - father's education (numeric: 0 - none,  1 - primary education (4th grade), 2 â€“ 5th to 9th grade, 3 â€“ secondary education or 4 â€“ higher education)
9 Mjob - mother's job (nominal: "teacher", "health" care related, civil "services" (e.g. administrative or police), "

In [3]:
data = pd.read_csv('student-por.csv', delimiter = ';')

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 649 entries, 0 to 648
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      649 non-null    object
 1   sex         649 non-null    object
 2   age         649 non-null    int64 
 3   address     649 non-null    object
 4   famsize     649 non-null    object
 5   Pstatus     649 non-null    object
 6   Medu        649 non-null    int64 
 7   Fedu        649 non-null    int64 
 8   Mjob        649 non-null    object
 9   Fjob        649 non-null    object
 10  reason      649 non-null    object
 11  guardian    649 non-null    object
 12  traveltime  649 non-null    int64 
 13  studytime   649 non-null    int64 
 14  failures    649 non-null    int64 
 15  schoolsup   649 non-null    object
 16  famsup      649 non-null    object
 17  paid        649 non-null    object
 18  activities  649 non-null    object
 19  nursery     649 non-null    object
 20  higher    

## 1. Data Cleaning/Preprocessing

(a) Drop the 'school' column as I want to find general trends of student performance across different schools.

(b) Drop 'Mjob', 'Fjob', 'reason' and 'guardian' as they would add many dimensions if encoded.

(c) Drop 'G1' and 'G2' as predicting them is not this project's aim.

(d) Convert the remaining binary objects to 0s and 1s.

(e) Save the target variable, G3, before dropping the column.

(f) Split the data into train/test.

In [5]:
X = data.drop('school', axis=1)
X = X.drop('Mjob', axis=1)
X = X.drop('Fjob', axis=1)
X = X.drop('reason', axis=1)
X = X.drop('guardian', axis=1)
X = X.drop('G1', axis=1)
X = X.drop('G2', axis=1)

In [6]:
X['sex'] = X['sex'].map({'M': 1, 'F': 0})
X['address'] = X['address'].map({'U': 1, 'R': 0})
X['famsize'] = X['famsize'].map({'LE3': 1, 'GT3': 0})
X['Pstatus'] = X['Pstatus'].map({'T': 1, 'A': 0})

encode_list = ['schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']

for string in encode_list:
    X[string] = X[string].map({'yes': 1, 'no': 0})

In [7]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 649 entries, 0 to 648
Data columns (total 26 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   sex         649 non-null    int64
 1   age         649 non-null    int64
 2   address     649 non-null    int64
 3   famsize     649 non-null    int64
 4   Pstatus     649 non-null    int64
 5   Medu        649 non-null    int64
 6   Fedu        649 non-null    int64
 7   traveltime  649 non-null    int64
 8   studytime   649 non-null    int64
 9   failures    649 non-null    int64
 10  schoolsup   649 non-null    int64
 11  famsup      649 non-null    int64
 12  paid        649 non-null    int64
 13  activities  649 non-null    int64
 14  nursery     649 non-null    int64
 15  higher      649 non-null    int64
 16  internet    649 non-null    int64
 17  romantic    649 non-null    int64
 18  famrel      649 non-null    int64
 19  freetime    649 non-null    int64
 20  goout       649 non-null    int6

In [8]:
X.head()

Unnamed: 0,sex,age,address,famsize,Pstatus,Medu,Fedu,traveltime,studytime,failures,...,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G3
0,0,18,1,0,0,4,4,2,2,0,...,0,0,4,3,4,1,1,3,4,11
1,0,17,1,0,1,1,1,1,2,0,...,1,0,5,3,3,1,1,3,2,11
2,0,15,1,1,1,1,1,1,2,0,...,1,0,4,3,2,2,3,3,6,12
3,0,15,1,0,1,4,2,1,3,0,...,1,1,3,2,2,1,1,5,0,14
4,0,16,1,0,1,3,3,1,2,0,...,0,0,4,3,2,1,2,5,0,13


In [9]:
y = X['G3']
X = X.drop('G3', axis=1)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

## 2. Data Scaler

I need to scale the data for the regression model to work properly. Otherwise, the 'absences' feature (ranging from 0 to 93) would contribute much more to the distance metric's calculation than the binary features (consisting of 0s and 1s). To prevent any infomation about the testing data from getting into the model, I will scale with respect to the training data only.

In [11]:
scaler = MinMaxScaler()

In [12]:
X_train = scaler.fit_transform(X_train)

In [13]:
X_test = scaler.transform(X_test)

## 3. Benchmark Model

My benchmark model is $5*$studytime. The data has been scaled, however. Therefore the code below multiplies by 20 instead.

In [14]:
benchmark_train = 20*X_train[:,8]
benchmark_test = 20*X_test[:,8]

## 4. Training the Model and Cross Validation

In [16]:
sgd = SGDRegressor()

In [17]:
sgd.fit(X_train, y_train)

SGDRegressor()

In [19]:
cross_val_score(sgd, X_train, y_train, scoring='max_error', cv=5)

array([-12.76761167, -10.17988211,  -8.93914599,  -7.42315375,
        -8.18402986])

In [20]:
cross_val_score(sgd, X_train, y_train, scoring='explained_variance')

array([0.13298086, 0.24435052, 0.29169696, 0.28185017, 0.2469609 ])

### Comment:

The max_error and explained_variance scores are roughly the same across different folds of the cross validation. This suggests that the model is not overfitting and can be applied to the testing data without any need for changes to the hyper-parameters.

## 5. Results

The results are not very close. The model seems to place most students in the middle. So lots of students do very well but don't have high predictions, and lots of students do very badly but with mediocre predictions. This suggests that the model is just 'averaging' through. So, this model isn't very good at actually predicting the grade. This is expected - academic preformance is likely to be linked to intelligence (which isn't measured here). As explained in the Capstone Proposal. Many of these features are self-reported (e.g. number of hours studied per week). This self-reporting is going to be less reliable than facts (e.g. number of past class failures or number of absences).

However, looking at the coefficients of this model does give insight into how the factors affect performance on average...

In [21]:
mean_squared_error(y_test, sgd.predict(X_test))

8.646222017539957

In [22]:
mean_squared_error(y_train, sgd.predict(X_train))

7.2042718392733756

In [23]:
mean_squared_error(y_test, benchmark_test)

65.37207977207977

In [24]:
mean_squared_error(y_train, benchmark_train)

64.19995105237396

### Comment:

The MSE of the test data is very similar to that of the train data. As expected, the model performs slightly better on the training set. There is a huge jump in MSE when going to the benchmark model's predictions. The benchmark model is also a linear function (5 x studytime) but performs much worse.

In [25]:
explained_variance_score(y_test, sgd.predict(X_test))

0.17047809704397532

In [26]:
explained_variance_score(y_train, sgd.predict(X_train))

0.30791767613103194

### Comment:

An explained variance score of 1 would mean that the student's attributes would perfectly determine their end of year score. This is highly unlikely to be the case; some of the data is self-reported, student intelligence and other important factors are not taken into account, and there will always be variation with how different students deal with exam pressure.

An explained variance score of 0 would mean that the given features have no influence on the student's grade. The above results suggest that the given data features do capture some infomation about student academic performance, but they are not enough to reasonably deploy this model for the purpose of predicting a student's grade between 0 and 20.

However, the regression coefficients may still provide insight as to how different factors influence a student's score on average.

In [29]:
sgd.coef_

array([-0.77669804,  0.97612399,  0.80947056,  0.57596717,  0.83471966,
        1.30198808,  1.14229681,  0.66009566,  1.41685581, -2.65331857,
       -0.66094773, -0.07927736, -0.59563171,  0.51609023, -0.01248447,
        3.07344865,  0.55126891, -0.27435992,  1.796828  , -0.10299755,
       -0.03726609, -1.22743639, -0.30775594, -0.18261804, -0.05906746])

***Negative Correlation***: sex (male), failures, schoolsup, famsup, paid, freetime, goout, Dalc, Walc, health, absences.

Female students on average outperform their male counterparts in Portuguese language. Students perform worse if they spend more time going out, drink more alcohol, and are absent from school more often. These are all expected. There is also a negative correlation for students who receive extra educational support from their family, school and paid classes. This is likely because students who receive this extra support are more likely to already be behind their peers.

Surprisingly, students who rated their health as better did worse on average. Perhaps this is because being in poor health means that the student is more likely to have extra time to study while their peers are more active. Alternatively, students who are feeling more exam pressure may rate their health poorly but perform well.

By far the most important statistic (with -2.65) was the number of previous class failures. The only other coefficient lower than -1 was Dalc - alcohol consumption on weekdays. While weekend alcohol consumption also negatively correlated with academic success, the weekday indicator seems to be much more important.


***Positive Correlation***: age, address (urban), famsize (LE3), Pstatus (together), Medu, Fedu, traveltime, studytime, activities, nursery, higher, internet, famrel.

Desire to pursue higher education gave the largest coefficient (with 3.07). This exceeded 'studytime' which was 1.42. Of course, study time is self-reported while aiming for higher education is not ambiguous. Quality of familial relations gave a coefficient of 1.80, making is appear more important than time spent studying. The final two coefficients above 1 were 'Medu' and 'Fedu' - level of education of the student's mother and father. This result is unsurprising.

Other unsurprising positive correlations include student age, whether their parents are still cohabiting, and whether or not the student attended nursery. Students living in urban environments did better, as did those with a home internet connection. Perhaps this is because these factors may indicate wealthier parents who are more likely to have a better education.

Surprisingly, the data suggests that students with a longer travel time perform better, though the coefficient is rather small at 0.66. Students who attended extra-ciricular activities also performed slightly better with a coefficient of 0.52. Lastly, students with a family size of less than or equal to 3 had more academic success. This could be because each child gets more parental attention, or it could be an indicator of the family's social class.

In [30]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 649 entries, 0 to 648
Data columns (total 25 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   sex         649 non-null    int64
 1   age         649 non-null    int64
 2   address     649 non-null    int64
 3   famsize     649 non-null    int64
 4   Pstatus     649 non-null    int64
 5   Medu        649 non-null    int64
 6   Fedu        649 non-null    int64
 7   traveltime  649 non-null    int64
 8   studytime   649 non-null    int64
 9   failures    649 non-null    int64
 10  schoolsup   649 non-null    int64
 11  famsup      649 non-null    int64
 12  paid        649 non-null    int64
 13  activities  649 non-null    int64
 14  nursery     649 non-null    int64
 15  higher      649 non-null    int64
 16  internet    649 non-null    int64
 17  romantic    649 non-null    int64
 18  famrel      649 non-null    int64
 19  freetime    649 non-null    int64
 20  goout       649 non-null    int6

### Final Task

While the model coefficients have given insight into which factors affect student attainment, the results so far suggest that the model is not very good at giving actually predicting the student's score between 0 and 20. This is expected - making an accurate score prediction from the data given seems to be impossible. However, the model may fair better when it comes to simply categorising the students into pass/fail.

For these purposes, I will assume that a score less than 10 is a fail, and see how accurate the model is here.

In [33]:
train_predictions = sgd.predict(X_train)
test_predictions = sgd.predict(X_test)

In [39]:
print(f"Model predicts {sum(train_predictions>9)} passes on train data.")
print(f"Model predicts {sum(test_predictions>9)} passes on test data.")

Model predicts 405 passes on train data.
Model predicts 179 passes on test data.


In [40]:
print(f"There were {sum(y_train>9)} passes on the train data.")
print(f"There were {sum(y_test>9)} passes on the test data.")

There were 384 passes on the train data.
There were 165 passes on the test data.


In [41]:
train_accuracy = accuracy_score((y_train>9)*1, (train_predictions>9)*1)
print(f"Accuracy on the training data was: {train_accuracy}")

Accuracy on the training data was: 0.8480176211453745


In [42]:
test_accuracy = accuracy_score((y_test>9)*1, (test_predictions>9)*1)
print(f"Accuracy on the testing data was: {test_accuracy}")

Accuracy on the testing data was: 0.8358974358974359


### Comment:

This is a very encouraging result. While the model isn't very good at predicting the student's G3 score, it can predict with 84% accuracy whether or not the student will pass/fail, based purely on the data given in school reports and questionnaries. This could make it very easy for teachers to identify 'at risk' students and provide extra academic help as required.