In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import math
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')
from category_encoders import OneHotEncoder

## Przygotowanie danych 

In [None]:
# Download the school-grades dataset (JSON in pandas' "split" orientation).
# pd.read_json already returns a DataFrame, so the extra pd.DataFrame(...)
# wrapper around it was redundant and has been dropped.
data = pd.read_json('https://api.apispreadsheets.com/api/dataset/school-grades/', orient='split')
data.info()

## Inżynieria cech 
### Kodowanie kategoryczne i dodanie kolumny "Czy dostał 0 pkt"

Chcąc lepiej przewidywać wyniki, postanowiliśmy inaczej potraktować osoby z zerowym wynikiem. Nasz plan jest taki, aby najpierw przewidywać, czy osoba dostanie 0 pkt, a następnie, jeśli z przewidywania wyjdzie, że nie powinna dostać 0 pkt, przewidywać jej wynik.

In [None]:
# Summary statistics of the raw 'schoolsup' column.
data.loc[:, 'schoolsup'].describe()

Wyrzucamy również kolumnę dotyczącą edukacji ojca (`Fedu`), gdyż z EDA wyszło nam, że jest ona mocno skorelowana z edukacją matki. Postanowiliśmy wyrzucić jedną z tych cech. W kodzie poniżej usuwamy także kolumnę `Fjob`.

In [None]:
# Remove both father-related columns in a single call.
data = data.drop(columns=["Fedu", "Fjob"])

# Columns that are expanded into one-hot indicator columns.
categorical_cols = (
    "Mjob", "sex", "school", 'famsize',
    "address", "Pstatus", "reason",
    "guardian", "schoolsup", 'famsup', 'paid',
    'activities', 'nursery', 'higher', 'internet', 'romantic',
)
ce_one_hot = OneHotEncoder(cols=categorical_cols)
df = ce_one_hot.fit_transform(data)

In [None]:
# Build the feature matrices by column NAME instead of by position.
# The original positional slicing (iloc[:, :n-1]) reassigned `df`, so every
# re-run of this cell silently removed one more feature column. Dropping by
# name with errors="ignore" gives the same result on the first run and makes
# the cell safe to re-run.
# NOTE(review): assumes the three trailing columns removed by the original
# slicing are named G1, G2 and G3 -- confirm against data.info().

# Features without any grade column (G1, G2 and the target G3 removed).
df_G = df.drop(columns=["G1", "G2", "G3"], errors="ignore")

# Features that keep G1 and G2; only the target G3 is removed.
df = df.drop(columns=["G3"], errors="ignore")

# NOTE: despite its name, `is_zero` is 0 when G3 == 0 and 1 otherwise.
# Later cells rely on this encoding (is_zero == 1 selects the non-zero grades).
data['is_zero'] = np.where(data['G3'] == 0, 0, 1)

df.head()

## Modele

In [None]:
## Funkcja pomocnicza do testowania modeli 

def models_score(model_params, x_train, y_train, x_test, y_test):
    """Grid-search every configured model and report its hold-out RMSE.

    Parameters
    ----------
    model_params : dict
        Maps a display name to a dict with the keys ``'model'`` (the
        estimator handed to GridSearchCV) and ``'params'`` (its parameter grid).
    x_train, y_train
        Data the 6-fold grid search is fitted on.
    x_test, y_test
        Hold-out data used only for the RMSE of the refitted best model.

    Returns
    -------
    list of dict
        One dict per entry of ``model_params`` with the keys ``'model'``,
        ``'best_score'``, ``'best_params'`` and ``'RMSE'``.
    """
    results = []
    for name, param in model_params.items():
        classifier = GridSearchCV(param['model'], param['params'], cv=6, return_train_score=False)
        classifier.fit(x_train, y_train)
        # scikit-learn's signature is (y_true, y_pred). The redundant
        # `squared=True` keyword is omitted on purpose: it is the default and
        # the keyword itself is deprecated in recent scikit-learn releases.
        mse = mean_squared_error(y_test, classifier.predict(x_test))
        results.append({
            'model': name,
            'best_score': classifier.best_score_,
            'best_params': classifier.best_params_,
            'RMSE': np.sqrt(mse)
        })
    return results

def draw_hists(model, x_test, y_test):
    """Plot the predicted and the actual target distributions side by side.

    Parameters
    ----------
    model
        Fitted estimator; only its ``predict`` method is used.
    x_test
        Features passed to ``model.predict``.
    y_test
        True target values, drawn in the right-hand panel.
    """
    fig, ax = plt.subplots(1, 2, sharey=True, figsize=(12, 6))
    # Use the estimator that was passed in. The previous version ignored
    # `model` and read the notebook-global `clf` instead.
    sns.histplot(model.predict(x_test), ax=ax[0])
    sns.histplot(y_test, ax=ax[1])
    ax[0].set_title('Rozkład przewidywany')
    ax[1].set_title('Rozkład rzeczywisty')
    # plt.show() also works on non-GUI (inline) backends, unlike fig.show().
    plt.show()

### Regresja bez podziału na uczniów, którzy dostali 0 pkt

In [None]:
# Two 70/30 hold-out splits of the same target (G3) with the same seed:
# one on the wider feature set `df`, one on the reduced feature set `df_G`.
split_options = {"test_size": 0.3, "random_state": 0}
x_train, x_test, y_train, y_test = train_test_split(df, data["G3"], **split_options)
x_train_G, x_test_G, y_train_G, y_test_G = train_test_split(df_G, data["G3"], **split_options)

# Candidate regressors for the G3 grade, each with the hyper-parameter grid
# that models_score() passes to GridSearchCV.
model_params1={
    'linear_regression':{
        'model': linear_model.LinearRegression(),
        'params': {}
    },
    'lasso':{
        'model': linear_model.Lasso(random_state=15),
        'params':{
            'alpha':[0.1,1,0],
            # Real booleans: the strings 'True' and 'False' are both truthy,
            # so both grid entries behaved like normalize=True.
            # NOTE(review): `normalize` only exists in scikit-learn < 1.2,
            # which is the API the rest of this notebook is written against.
            'normalize':[True,False]
        }
    },
    'svm':{
            # SVR rather than SVC: this grid is compared against regressors on
            # a numeric target, and SVC would treat every grade as a separate
            # class and report accuracy as its best_score.
            'model': svm.SVR(gamma='auto', tol=1e-1, cache_size=2000, max_iter=1500),
            'params': {
                'C': [0.001, 0.01, 0.1, 1, 3, 10, 30],
                'kernel': ['linear', 'rbf']
            }
    }
}

#### Wyniki z G1 i G2

In [None]:
# Grid-search all regression candidates on the split built from `df`.
results = models_score(
    model_params=model_params1,
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
)
results

In [None]:
# Fit a normalised Lasso with alpha=0 on the `df` split and compare the
# predicted and actual grade distributions.
clf = linear_model.Lasso(alpha=0, normalize=True)
clf.fit(x_train, y_train)
draw_hists(clf, x_test, y_test)

#### Wyniki bez G1 i G2

In [None]:
# Same model comparison, this time on the split built from `df_G`.
results = models_score(
    model_params=model_params1,
    x_train=x_train_G,
    y_train=y_train_G,
    x_test=x_test_G,
    y_test=y_test_G,
)
results

In [None]:
# Fit a normalised Lasso with alpha=0 on the `df_G` split and compare the
# predicted and actual grade distributions.
clf = linear_model.Lasso(alpha=0, normalize=True)
clf.fit(x_train_G, y_train_G)
draw_hists(clf, x_test_G, y_test_G)

### Klasyfikacja uczniów z zerowym wynikiem 

In [None]:
 # 70/30 hold-out split for the binary `is_zero` target (seed 13).
 x_train2, x_test2, y_train2, y_test2 = train_test_split(
     df,
     data["is_zero"],
     test_size=0.3,
     random_state=13,
 )

In [None]:

# Candidate classifiers for the binary `is_zero` target, each with the
# hyper-parameter grid that models_score() passes to GridSearchCV.
# (LogisticRegression is already imported in the first cell, so the duplicate
# import that used to sit here was removed.)
model_params2={
    'random_forest':{
        'model': RandomForestClassifier(random_state=15),
        'params': {
            'n_estimators':[1,3,8,13],
            'min_samples_split': [3,5,10,20]
        }
    },
    'logistic_regession':{
        'model': LogisticRegression(random_state=15, solver='liblinear'),
        'params':{
            'C':[1,5,10],
            # The liblinear solver supports only the 'l1' and 'l2' penalties;
            # 'elasticnet' made every one of its fits fail, so it is dropped.
            'penalty':['l1','l2']
        }
    },
    'svm':{
            'model': svm.SVC(gamma='auto', tol=1e-1, cache_size=2000, max_iter=1500),
            'params': {
                'C': [0.001, 0.01, 0.1, 1, 3, 10, 30],
                'kernel': ['linear', 'rbf']
            }
    }
}

In [None]:
# Grid-search the classifiers that predict the `is_zero` flag.
results_isZero = models_score(
    model_params=model_params2,
    x_train=x_train2,
    y_train=y_train2,
    x_test=x_test2,
    y_test=y_test2,
)
results_isZero

### Regresja tylko dla uczniów, którzy nie dostali zero punktów

In [None]:
# Keep only the rows flagged is_zero == 1, in both the encoded features and
# the raw frame, then split them 70/30 with G3 as the target.
nonzero_mask = data.is_zero == 1
only_good_data = df.loc[nonzero_mask]
data2 = data.loc[nonzero_mask]
x_train3, x_test3, y_train3, y_test3 = train_test_split(
    only_good_data,
    data2["G3"],
    test_size=0.3,
    random_state=0,
)

In [None]:
# Regression candidates for the subset with is_zero == 1, each with the
# hyper-parameter grid that models_score() passes to GridSearchCV.
model_params3={
    'linear_regression':{
        'model': linear_model.LinearRegression(),
        'params': {}
    },
    'lasso':{
        'model': linear_model.Lasso(random_state=15),
        'params':{
            'alpha':[0.1,1,0],
            # Real booleans: the strings 'True' and 'False' are both truthy,
            # so both grid entries behaved like normalize=True.
            # NOTE(review): `normalize` only exists in scikit-learn < 1.2,
            # which is the API the rest of this notebook is written against.
            'normalize':[True,False]
        }
    }
}

In [None]:
# Grid-search the regressors on the is_zero == 1 subset.
results3 = models_score(
    model_params=model_params3,
    x_train=x_train3,
    y_train=y_train3,
    x_test=x_test3,
    y_test=y_test3,
)
results3

In [None]:
# Plain linear regression on the is_zero == 1 subset, followed by the
# predicted-vs-actual distribution plot.
clf = linear_model.LinearRegression()
clf.fit(x_train3, y_train3)
draw_hists(clf, x_test3, y_test3)

# Model z kubełkami 


In [None]:
# Copy of the raw frame without the helper `is_zero` column.
data_bins = data.drop(columns="is_zero")

In [None]:
## Podział na kubełki 
def divade(x, n):
    """Floor ``x`` to a multiple of ``n``, i.e. the lower edge of its bucket of width ``n``."""
    whole_buckets = x // n
    return whole_buckets * n

# Bucketed target: every G3 grade is replaced by the lower edge of its
# 4-point bucket.
df_bins = data_bins["G3"].apply(divade, args=(4,))

# 70/30 hold-out split of the `df` features against the bucketed target.
x_train4, x_test4, y_train4, y_test4 = train_test_split(
    df,
    df_bins,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Candidate classifiers for the bucketed G3 target, each with the
# hyper-parameter grid that models_score() passes to GridSearchCV.
# (LogisticRegression and MLPClassifier are already imported in the first
# cell, so the duplicate imports that used to sit here were removed.)
model_params4={
    'random_forest':{
        'model': RandomForestClassifier(random_state=15),
        'params': {
            'n_estimators':[1,3,8,13],
            'min_samples_split': [3,5,10,20]
        }
    },
    'logistic_regession':{
        'model': LogisticRegression(random_state=15, solver='liblinear'),
        'params':{
            'C':[1,5,10],
            # The liblinear solver supports only the 'l1' and 'l2' penalties;
            # 'elasticnet' made every one of its fits fail, so it is dropped.
            'penalty':['l1','l2']
        }
    },
    'svm':{
            'model': svm.SVC(gamma='auto', tol=1e-1, cache_size=2000, max_iter=1500),
            'params': {
                'C': [0.001, 0.01, 0.1, 1, 3, 10, 30],
                'kernel': ['linear', 'rbf']
            }
    }
}

In [None]:
# Grid-search the classifiers on the bucketed target.
results4 = models_score(
    model_params=model_params4,
    x_train=x_train4,
    y_train=y_train4,
    x_test=x_test4,
    y_test=y_test4,
)
results4

In [None]:
# Linear-kernel SVC (C=5) on the bucketed target, followed by the
# predicted-vs-actual distribution plot.
clf = svm.SVC(
    gamma='auto',
    tol=1e-1,
    cache_size=2000,
    max_iter=1500,
    C=5,
    kernel='linear',
)
clf.fit(x_train4, y_train4)
draw_hists(clf, x_test4, y_test4)