#### Notes

* Both schools are located in Portugal.
* The age of majority in Portugal is 18 (as in the US).
* A score of 10 is the lowest possible passing grade.

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support

In [None]:
# Load the math-course records and add a binary `pass` column:
# 1 when the G3 score is at least 10 (the lowest passing grade), otherwise 0.
df_math = pd.read_csv("student-mat.csv")
df_math['pass'] = (df_math['G3'] >= 10).astype('int64')
df_math.columns

In [None]:
# Math course: counts per weekend-alcohol level (Walc), with the pass/fail
# groups drawn as side-by-side ("dodge") bars on one discrete bin per level.
plt.figure(figsize=(10, 5))
sns.histplot(df_math, x='Walc', hue='pass', multiple='dodge', discrete=True, shrink=0.8)
plt.xlabel('Weekend Alcohol Consumption')

In [None]:
# Math course: counts per workday-alcohol level (Dalc), with the pass/fail
# groups drawn as side-by-side ("dodge") bars on one discrete bin per level.
plt.figure(figsize=(10, 5))
sns.histplot(df_math, x='Dalc', hue='pass', multiple='dodge', discrete=True, shrink=0.8)
plt.xlabel('Workday Alcohol Consumption')

In [None]:
# Work on a copy so the raw math dataframe is left untouched.
df = df_math.copy()

# Standardize the numeric predictors to z-scores (subtract the column mean,
# divide by the column standard deviation):
#   failures - number of past class failures
#   absences - number of school absences
#   goout    - going out with friends
for column in ('failures', 'absences', 'goout'):
    values = df[column]
    df[column] = (values - values.mean()) / values.std()

# higher - wants to take higher education: 'yes' -> 1, 'no' -> 0,
# any other value -> None (missing).
df['higher'] = [{'yes': 1, 'no': 0}.get(answer) for answer in df['higher']]

In [None]:
# Predictors prepared in the preprocessing cell; target is the binary `pass` flag.
X = df[['failures', 'higher', 'absences', 'goout']]
y = df['pass']

# `pass` only takes the values 0 and 1, so only those two labels can be weighted.
# The former entries for labels 2 and 3 referred to classes that do not exist in
# `y`: they had no effect on recent scikit-learn releases and raise
# "Class label 2 not present." on older ones. Failing students (0) are weighted
# more heavily than passing students (1).
lm = LogisticRegression(class_weight={0: 0.65, 1: 0.25})

lm.fit(X, y)
# NOTE: predictions are made on the same rows the model was fitted on, so the
# metrics below are in-sample and not an estimate of out-of-sample performance.
y_pred = lm.predict(X)

display('predicted range: [{0:.2f},{1:.2f}]'.format(min(y_pred), max(y_pred)))

# labels=[1, 0] puts the "pass" class first, so index 0 of each returned array
# is the metric for class 1 (pass).
p,r,f,s = precision_recall_fscore_support(y, y_pred, labels=[1,0])
display('precision = {}'.format(p[0]))
display('recall = {}'.format(r[0]))
display('f-score = {}'.format(f[0]))

In [None]:
# Load the Portuguese-course records and add a binary `pass` column:
# 1 when the G3 score is at least 10 (the lowest passing grade), otherwise 0.
df_por = pd.read_csv("student-por.csv")
df_por['pass'] = (df_por['G3'] >= 10).astype('int64')
df_por.columns

In [None]:
# Portuguese course: counts per weekend-alcohol level (Walc), with the pass/fail
# groups drawn as side-by-side ("dodge") bars on one discrete bin per level.
plt.figure(figsize=(10, 5))
sns.histplot(df_por, x='Walc', hue='pass', multiple='dodge', discrete=True, shrink=0.8)
plt.xlabel('Weekend Alcohol Consumption')

In [None]:
# Portuguese course: counts per workday-alcohol level (Dalc), with the pass/fail
# groups drawn as side-by-side ("dodge") bars on one discrete bin per level.
plt.figure(figsize=(10, 5))
sns.histplot(df_por, x='Dalc', hue='pass', multiple='dodge', discrete=True, shrink=0.8)
# The plotted column is Dalc (workday consumption); the label previously said
# "Weekend", copied from the Walc cell.
plt.xlabel('Workday Alcohol Consumption')

In [None]:
# Work on a copy so the raw Portuguese dataframe is left untouched.
df = df_por.copy()

# Standardize the numeric predictors to z-scores (subtract the column mean,
# divide by the column standard deviation):
#   failures - number of past class failures
#   absences - number of school absences
#   goout    - going out with friends
for column in ('failures', 'absences', 'goout'):
    values = df[column]
    df[column] = (values - values.mean()) / values.std()

# higher - wants to take higher education: 'yes' -> 1, 'no' -> 0,
# any other value -> None (missing).
df['higher'] = [{'yes': 1, 'no': 0}.get(answer) for answer in df['higher']]

In [None]:
# Predictors prepared in the preprocessing cell; target is the binary `pass` flag.
X = df[['failures', 'higher', 'absences', 'goout']]
y = df['pass']

# `pass` only takes the values 0 and 1, so only those two labels can be weighted.
# The former entries for labels 2 and 3 referred to classes that do not exist in
# `y`: they had no effect on recent scikit-learn releases and raise
# "Class label 2 not present." on older ones. Failing students (0) are weighted
# more heavily than passing students (1).
lm = LogisticRegression(class_weight={0: 0.65, 1: 0.25})

lm.fit(X, y)
# NOTE: predictions are made on the same rows the model was fitted on, so the
# metrics below are in-sample and not an estimate of out-of-sample performance.
y_pred = lm.predict(X)

display('predicted range: [{0:.2f},{1:.2f}]'.format(min(y_pred), max(y_pred)))

# labels=[1, 0] puts the "pass" class first, so index 0 of each returned array
# is the metric for class 1 (pass).
p,r,f,s = precision_recall_fscore_support(y, y_pred, labels=[1,0])
display('precision = {}'.format(p[0]))
display('recall = {}'.format(r[0]))
display('f-score = {}'.format(f[0]))