# Alcohol

## reference

In [71]:
# https://www.kaggle.com/datasets/whenamancodes/alcohol-effects-on-study

## Import Libraries

In [117]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_auc_score, RocCurveDisplay, classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import LabelEncoder

## Import Data

In [73]:
path1 = 'https://raw.githubusercontent.com/notfakearcher/julian/main/02_data/portuguese.csv'
path2 = 'https://raw.githubusercontent.com/notfakearcher/julian/main/02_data/maths.csv'
portuguese = pd.read_csv(path1)
math = pd.read_csv(path2)

In [74]:
portuguese.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13


In [75]:
math.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


## join the two data together

In [110]:
m,n = math.shape
m1,n1 = portuguese.shape

#  add each subject to each profile 
subject = ['math']*m
math['subject'] = subject

subject1 = ['portuguese']*m1
portuguese['subject'] = subject1

# join the two dataframe together
p_m = pd.concat([math, portuguese])
p_m

# shuffle all rows in the pM dataframe
p_m = p_m.sample(frac = 1).reindex()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3,subject
489,MS,M,18,R,GT3,T,1,1,other,other,...,4,3,3,4,4,0,8,9,10,portuguese
191,GP,F,17,U,GT3,T,1,1,at_home,services,...,3,3,1,1,3,0,8,8,9,math
188,GP,F,16,U,LE3,T,3,3,other,other,...,4,5,1,1,4,0,14,14,15,portuguese
301,GP,M,17,U,LE3,T,4,4,other,teacher,...,1,1,2,2,5,0,11,11,10,math
359,MS,F,18,U,LE3,T,1,1,at_home,services,...,3,2,1,1,4,0,18,16,16,math
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
451,MS,M,16,R,GT3,T,1,2,other,other,...,3,3,1,1,5,0,10,11,11,portuguese
85,GP,F,15,U,GT3,T,4,4,services,services,...,4,4,2,3,5,6,7,9,8,math
321,GP,F,17,U,GT3,T,2,2,other,other,...,2,2,1,1,3,12,11,9,9,math
53,GP,F,15,U,GT3,T,4,4,services,services,...,3,4,2,3,5,0,8,10,11,math


## Split variables and targets

In [106]:
cond1 = p_m.columns == 'G1'
cond2 = p_m.columns == 'G2'
cond3 = p_m.columns == 'G3'
v_n = p_m.columns[~cond1 & ~cond2 & ~cond3] 
X = p_m[v_n]


In [128]:
y_name = ['G1','G2','G3']
y = p_m[y_name]


## change type in X

In [133]:
# random forest can't cope with string, convert string to int category
object_name = X.select_dtypes(object).columns
X[object_name] = X.select_dtypes(object).apply(LabelEncoder().fit_transform)

## Split the data into training and test

In [134]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8)

## Feature selection

In [147]:
# random forest to select features
rf = RandomForestClassifier(n_estimators = 500, criterion = 'gini')
model = SelectFromModel(rf.fit(X_train, y_train))

In [151]:
model.get_support()

array([False, False,  True, False, False, False,  True,  True,  True,
        True,  True, False, False,  True, False, False, False, False,
       False, False, False, False, False,  True,  True,  True, False,
        True,  True,  True,  True])

In [164]:
# important features
important_f = X_train.columns[model.get_support()]

## Remove unimportant features

In [165]:
X1_train = X_train[important_f]
y1_train = y_train
X1_test = X_test[important_f]
y1_test = y_test

## Bagging 