In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import category_encoders as ce
from sklearn import metrics
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
np.random.seed(123)

## Encoding i transformacje

In [None]:
# Load the grades table and drop every row whose final grade G3 equals 0.
grades_df = pd.read_csv('school_grades_dataset.csv')
grades_df = grades_df[grades_df['G3'] != 0]

# Nominal columns, each replaced below by the mean G3 of its category.
# 'sex' and 'school' used to be listed twice, which only repeated the same encoding.
cat_cols = ['school', 'sex', 'address', 'famsize', 'Mjob', 'Fjob', 'reason', 'guardian', 'Pstatus']
# yes/no columns, mapped to 1/0 below.
bin_cols = ['famsup', 'activities', 'nursery', 'internet', 'romantic', 'higher', 'paid', 'schoolsup']

# Start from the untouched columns; the encoded ones are appended in list order.
grades_df_new = grades_df.drop(columns=cat_cols + bin_cols)

# NOTE(review): the category means are computed on the full table, i.e. before any
# train/test split, so information about test-set G3 values leaks into these features.
for col in cat_cols:
    category_means = grades_df.groupby(col)['G3'].mean()
    grades_df_new[col] = grades_df[col].map(category_means)

# A plain dictionary lookup replaces building one OrdinalEncoder per column and
# re-transforming the whole frame each time.
# Assumes every value in these columns is 'yes' or 'no'; anything else becomes NaN - TODO confirm.
yes_no_codes = {'yes': 1, 'no': 0}
for col in bin_cols:
    grades_df_new[col] = grades_df[col].map(yes_no_codes)

# Bucket G3 into five ordered classes labelled '1'..'5' (upper bin edges 9, 11, 13, 15, 21).
grades_df_new['result'] = pd.cut(grades_df_new['G3'],
                                 bins=[-1, 9, 11, 13, 15, 21],
                                 labels=['1', '2', '3', '4', '5'])

In [None]:
from sklearn.ensemble import RandomForestClassifier
import xgboost
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFE

def simple_models(X_train, X_test, y_train, y_test, n_features=7):
    """Print the test accuracy of four baseline classifiers, with and without RFE.

    Logistic regression, a decision tree, a random forest and XGBoost are each
    fitted on the training data and scored on the test data. Recursive feature
    elimination (RFE) keeping ``n_features`` features is then fitted around the
    same model and scored the same way.

    Parameters
    ----------
    X_train, X_test : feature tables used for fitting and for evaluation.
    y_train, y_test : class labels matching ``X_train`` and ``X_test``.
    n_features : int, default 7
        Number of features RFE keeps.

    Returns
    -------
    None. The accuracies are printed.
    """
    # Local imports keep this cell runnable without touching the import cell.
    from sklearn.metrics import accuracy_score
    from sklearn.preprocessing import LabelEncoder

    # XGBClassifier requires class labels 0..k-1, while the targets used in this
    # notebook are raw grades or string bucket labels. All models are therefore
    # trained on encoded labels and their predictions are decoded before scoring.
    label_encoder = LabelEncoder().fit(y_train)
    y_train_encoded = label_encoder.transform(y_train)

    def test_accuracy(fitted):
        """Accuracy of ``fitted`` on the test split, measured in the original labels."""
        predicted = label_encoder.inverse_transform(fitted.predict(X_test))
        return accuracy_score(y_test, predicted)

    # Same models, settings and order as before; only the repetition is gone.
    models = [
        ('Logistic regression', LogisticRegression(random_state=1, max_iter=100)),
        ('Decision Tree', DecisionTreeClassifier()),
        ('Random Forest', RandomForestClassifier()),
        ('XGBoost', xgboost.XGBClassifier(eval_metric='merror')),
    ]

    for name, model in models:
        model.fit(X_train, y_train_encoded)
        print(f'{name} accuracy: {test_accuracy(model)}')

        selector = RFE(model, n_features_to_select=n_features, step=1)
        selector = selector.fit(X_train, y_train_encoded)
        print(f'{name} accuracy: {test_accuracy(selector)}. (po zastosowaniu RFE)')
    

## Klasyfikacja konkretnego wyniku

Sprawdzimy możliwość przewidywania oceny końcowej na dwa sposoby: przewidywanie dokładnej oceny oraz przewidywanie jej przedziału (kubełki 0-9, 10-11, 12-13, 14-15, 16-21).

Użyjemy też różnych sposobów przewidywania, to znaczy z wykorzystaniem zmiennych G1 i G2, które są mocno skorelowane z G3, oraz bez nich.

### Dane łącznie z G1 i G2

In [None]:
# Exact-grade task: the target is G3 itself and every other column
# (G1 and G2 included) is a feature; the bucketed 'result' is left out.
y = grades_df_new['G3']
X = grades_df_new.drop(columns=['G3', 'result'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Baseline accuracies for the exact-grade target, with G1 and G2 among the features.
simple_models(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

Jak widać, modele bardzo słabo radzą sobie z odgadnięciem konkretnej liczby punktów zdobytej przez ucznia.

### Dane bez G1 i G2

In [None]:
# Exact-grade task again, this time without the G1 and G2 columns.
excluded_columns = ['G1', 'G2', 'G3', 'result']
y = grades_df_new['G3']
X = grades_df_new.drop(columns=excluded_columns)

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Baseline accuracies for the exact-grade target, without G1 and G2.
simple_models(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

Bez tych danych jest w ogóle tragicznie.

### Regresja liniowa

Użyjmy regresji liniowej do przewidywania wyników na podstawie samych G1 i G2, które są mocno skorelowane z G3.

In [None]:
# Two side-by-side scatter plots: G1 against G3 and G2 against G3.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

for grade_column, axis in (('G1', ax1), ('G2', ax2)):
    sns.scatterplot(data=grades_df, x=grade_column, y='G3', ax=axis)

plt.show()

# Author's observation: G3 depends linearly on G1 and on G2.

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Regress G3 on the G1 and G2 columns only.
X = grades_df_new[['G1', 'G2']]
y = grades_df_new['G3']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)

linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)
y_test_predicted = linear_reg.predict(X_test)

# The `squared=False` switch of mean_squared_error is deprecated and removed in
# recent scikit-learn releases; taking the square root works on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_test_predicted))

print(f'RMSE: {rmse}')
print(f'R-squared: {linear_reg.score(X_test, y_test)}')

Trzeba pamiętać, że regresja liniowa przewiduje wartości ciągłe, spróbujmy zatem zaokrąglić wynik i sprawdźmy, ile odpowiedzi zostało odgadniętych:

In [None]:
# Share of test rows whose rounded regression output equals the true G3.
rounded_matches = linear_reg.predict(X_test).round() == y_test
hit_rate = rounded_matches.sum() / len(y_test)
print(f'Odesetek dobrze predykowanych zaokrąglonych wyników:      {hit_rate}')

Nie jest to zachwycająca odpowiedź, ale lepsza od modeli klasyfikujących.

## Klasyfikacja przedziału wyniku

### Dane bez G1 i G2

In [None]:
# Bucket task: the target is the binned 'result' column; G1, G2 and G3 are
# all removed from the features.
target = grades_df_new['result']
features = grades_df_new.drop(columns=['G1', 'G2', 'G3', 'result'])
X, y = features, target

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Baseline accuracies for the bucketed target, without G1 and G2.
simple_models(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

### Dane z G1 i G2

In [None]:
# Bucket task with G1 and G2 kept as features; the split is stratified on the
# target so both parts keep the same class proportions.
y = grades_df_new['result']
X = grades_df_new.drop(columns=['G3', 'result'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    stratify=y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Baseline accuracies for the bucketed target, with G1 and G2 among the features.
simple_models(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for logistic regression.
# C has to be strictly positive, so the leading 0 of the original range is dropped;
# every fit with C=0 failed and was scored as NaN.
C = np.arange(0, 2, 0.2)[1:]
class_weight = [None, 'balanced']
fit_intercept = [True, False]
solver = ["newton-cg", "sag", "saga", "lbfgs", "liblinear"]
# l1_ratio is no longer searched: scikit-learn documents it as used only with
# penalty='elasticnet', and this model keeps the default L2 penalty, so the ten
# values merely multiplied the number of fits by ten.

lr = LogisticRegression(random_state=1, max_iter=100)

param_grid = dict(C = C, class_weight = class_weight, fit_intercept = fit_intercept, solver = solver)
grid = GridSearchCV(estimator=lr, param_grid=param_grid, cv = 3, n_jobs=-1)
grid_result = grid.fit(X_train, y_train)  # tune on the training split only

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

In [None]:
# Fit RFE around the tuned model for every possible number of kept features and
# remember the selector with the highest accuracy.
# NOTE(review): the winner is picked by its accuracy on the test split, so the
# score reported for it is optimistically biased; choosing on a validation split
# or by cross-validation on the training data would avoid that.
best_model = grid_result.best_estimator_
score_max = float('-inf')  # guarantees a winner even if every accuracy is 0
feature_number = None
selector_best = None

# Sweep up to the real column count instead of a hard-coded 30.
for n_kept in range(1, X_train.shape[1] + 1):
    selector = RFE(best_model, n_features_to_select=n_kept, step=1)
    selector = selector.fit(X_train, y_train)
    score = selector.score(X_test, y_test)  # scored once per candidate
    if score > score_max:
        feature_number = n_kept
        selector_best = selector
        score_max = score

In [None]:
# Test accuracy of the selected RFE model and the number of features it keeps.
best_accuracy = selector_best.score(X_test, y_test)
print(f'Wynik dla regresji logistycznej: {best_accuracy}. (po zastosowaniu RFE dla {feature_number} zmiennych)')

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# Predictions of the selected RFE model on the test split.
y_pred = selector_best.predict(X_test)

# F1, precision and recall, each computed with average="weighted".
for metric_label, score_fn in (
    ('F1-score', f1_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
):
    print(f'{metric_label}: {score_fn(y_test, y_pred, average = "weighted")}')

In [None]:
rf = RandomForestClassifier()

# 'balanced' is a class_weight option, not a split criterion, so half of the
# original grid could never be fitted; 'entropy' is the valid alternative to 'gini'.
criterion = ['gini', 'entropy']
class_weight = ['balanced', 'balanced_subsample']
max_depth = [3, 4, 5]


param_grid = dict(criterion=criterion, class_weight=class_weight, max_depth=max_depth)
grid = GridSearchCV(estimator=rf, param_grid=param_grid, cv = 3, n_jobs=-1)
grid_result = grid.fit(X_train, y_train)  # tune on the training split only

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# Fit RFE around the tuned forest for every possible number of kept features and
# remember the selector with the highest accuracy.
# NOTE(review): the winner is picked by its accuracy on the test split, so the
# metrics printed below are optimistically biased; choosing on a validation split
# or by cross-validation on the training data would avoid that.
best_model = grid_result.best_estimator_
score_max = float('-inf')  # guarantees a winner even if every accuracy is 0
feature_number = None
selector_best = None

# Sweep up to the real column count instead of a hard-coded 30.
for n_kept in range(1, X_train.shape[1] + 1):
    selector = RFE(best_model, n_features_to_select=n_kept, step=1)
    selector = selector.fit(X_train, y_train)
    score = selector.score(X_test, y_test)  # scored once per candidate
    if score > score_max:
        feature_number = n_kept
        selector_best = selector
        score_max = score

y_pred = selector_best.predict(X_test)

from sklearn.metrics import f1_score
print(f'F1-score: {f1_score(y_test, y_pred, average = "weighted")}')

from sklearn.metrics import precision_score
print(f'Precision: {precision_score(y_test, y_pred, average = "weighted")}')

from sklearn.metrics import recall_score
print(f'Recall: {recall_score(y_test, y_pred, average = "weighted")}')

In [None]:
# Test accuracy of the selected RFE model and the number of features it keeps.
best_accuracy = selector_best.score(X_test, y_test)
print(f'Wynik dla lasu losowego: {best_accuracy}. (po zastosowaniu RFE dla {feature_number} zmiennych)')