In [None]:

import math
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from sklearn import linear_model
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')
from category_encoders import OneHotEncoder

In [None]:
# Download the school-grades dataset; the JSON payload is parsed with orient='split'.
GRADES_URL = 'https://api.apispreadsheets.com/api/dataset/school-grades/'
data = pd.read_json(GRADES_URL, orient='split')

In [None]:
# Quick structural overview of the loaded frame (columns, dtypes, non-null counts).
data.info()

## Inżynieria cech 
### Kodowanie kategoryczne i dodanie kolumny "Czy dostał 0 pkt"

Chcąc lepiej przewidywać wyniki, postanowiliśmy inaczej potraktować osoby z zerowym wynikiem. Nasz plan jest taki, aby najpierw przewidywać, czy osoba dostanie 0 pkt, a następnie, jeśli z przewidywania wyjdzie, że nie powinna dostać 0 pkt, przewidywać jej wynik.

In [None]:
# Summary statistics for the `schoolsup` column.
data['schoolsup'].describe()

Wyrzucamy również kolumnę dotyczącą edukacji ojca (`Fedu`), gdyż z EDA wyszło nam, że jest ona mocno skorelowana z edukacją matki. Postanowiliśmy wyrzucić jedną z tych cech. Poniższy kod usuwa także kolumnę `Fjob`.

In [None]:
# Drop the Fedu and Fjob columns before encoding.
data = data.drop(columns=["Fedu", "Fjob"])

# One-hot encode the categorical columns listed below.
categorical_columns = [
    "Mjob", "sex", "school", 'famsize',
    "address", "Pstatus", "reason",
    "guardian", "schoolsup", 'famsup', 'paid',
    'activities', 'nursery', 'higher', 'internet', 'romantic',
]
ce_one_hot = OneHotEncoder(cols=categorical_columns)
df = ce_one_hot.fit_transform(data)

In [None]:
# Keep every encoded column except the last three.
# NOTE(review): presumably these are the grade columns (G1, G2, G3) - confirm against
# data.info(). This cell rebinds `df` from itself, so running it twice removes three
# more columns.
total_columns = df.shape[1]
a = total_columns - 3
df = df.iloc[:, :a]

In [None]:
# Binary target for the first modelling stage.
# Despite its name, `is_zero` is 0 when the final grade G3 equals 0 and 1 otherwise.
zero_grade_mask = data['G3'] == 0
data['is_zero'] = np.where(zero_grade_mask, 0, 1)

### Main grid search

Na początku, aby przetestować, czy osiągniemy lepsze efekty, dzieląc modelowanie na dwa etapy, sprawdzamy jakość wybranych modeli bez podziału.

In [None]:
 # 70/30 train/test split of the encoded features against the final grade G3 (all students).
 x_train2, x_test2, y_train2, y_test2 = train_test_split(
     df,
     data["G3"],
     test_size=0.3,
     random_state=0,
 )

In [None]:
# Candidate regressors for predicting the final grade G3, each paired with the
# hyperparameter grid that GridSearchCV explores for it.
model_params2 = {
    'linear_regression': {
        'model': linear_model.LinearRegression(),
        'params': {}
    },
    'lasso': {
        'model': linear_model.Lasso(random_state=15),
        'params': {
            # NOTE(review): scikit-learn documents alpha=0 as equivalent to ordinary least
            # squares and advises against it for Lasso - confirm it is intended in the grid.
            'alpha': [0.1, 1, 0],
            # Real booleans: the strings 'True' / 'False' are both truthy, so the grid used to
            # evaluate the same setting twice. (`normalize` only exists in scikit-learn < 1.2.)
            'normalize': [True, False]
        }
    },
    'svm': {
        # SVR rather than SVC: G3 is modelled as a regression target here, and a classifier's
        # default score (accuracy) is not comparable with the R^2 reported for the regressors.
        'model': svm.SVR(gamma='auto', tol=1e-1, cache_size=2000, max_iter=1500),
        'params': {
            'C': [0.001, 0.01, 0.1, 1, 3, 10, 30],
            'kernel': ['linear', 'rbf']
        }
    }
}

In [None]:
# Tune every candidate in model_params2 with 6-fold cross-validation on the training split,
# then record its best CV score, best hyperparameters and RMSE on the held-out test split.
results2 = []

from sklearn.metrics import mean_squared_error

for name, param in model_params2.items():
    # `classifier` keeps its name because the plotting cell below reads it after the loop.
    classifier = GridSearchCV(param['model'], param['params'], cv=6, return_train_score=False)
    classifier.fit(x_train2, y_train2)
    # Arguments are passed as (y_true, y_pred); the redundant `squared=True` is dropped because
    # that keyword has been removed from newer scikit-learn releases.
    test_predictions = classifier.predict(x_test2)
    model_performance = np.sqrt(mean_squared_error(y_test2, test_predictions))
    results2.append({
        'model': name,
        'best_score': classifier.best_score_,
        'best_params': classifier.best_params_,
        'RMSE': model_performance
    })
results2

In [None]:
# Distribution of test-set predictions from `classifier`, i.e. the grid search fitted in the
# last iteration of the loop above (the final entry of model_params2).
sns.displot(classifier.predict(x_test2))

In [None]:
# Distribution of the true test-set targets, for comparison with the predicted distribution.
sns.displot(y_test2)

### Grid search: "Czy studenci dostali 0?"

In [None]:
 # 70/30 train/test split for the first stage: encoded features against the `is_zero` flag.
 x_train, x_test, y_train, y_test = train_test_split(
     df,
     data["is_zero"],
     test_size=0.3,
     random_state=0,
 )

In [None]:
from sklearn import svm

In [None]:
from sklearn.linear_model import LogisticRegression

# Candidate classifiers for the `is_zero` target, each paired with the hyperparameter grid
# that GridSearchCV explores for it.
model_params = {
    'random_forest': {
        'model': RandomForestClassifier(random_state=15),
        'params': {
            'n_estimators': [1, 3, 8, 13],
            'min_samples_split': [3, 5, 10, 20]
        }
    },
    'logistic_regession': {
        'model': LogisticRegression(random_state=15, solver='liblinear'),
        'params': {
            'C': [1, 5, 10],
            # 'elasticnet' was removed from this grid: the liblinear solver only supports
            # 'l1' and 'l2', so those fits always failed and were silently scored as NaN.
            'penalty': ['l1', 'l2']
        }
    },
    'svm': {
        'model': svm.SVC(gamma='auto', tol=1e-1, cache_size=2000, max_iter=1500),
        'params': {
            'C': [0.001, 0.01, 0.1, 1, 3, 10, 30],
            'kernel': ['linear', 'rbf']
        }
    }
}

In [None]:
# Tune every candidate classifier with 6-fold cross-validation and collect, per model,
# the best cross-validated score together with the hyperparameters that achieved it.
results = []
for name, param in model_params.items():
    classifier = GridSearchCV(
        estimator=param['model'],
        param_grid=param['params'],
        cv=6,
        return_train_score=False,
    )
    classifier.fit(x_train, y_train)
    summary = dict(
        model=name,
        best_score=classifier.best_score_,
        best_params=classifier.best_params_,
    )
    results.append(summary)
results

In [None]:
# Encoded features of the rows flagged is_zero == 1, i.e. students whose G3 is not 0.
nonzero_grade_mask = data["is_zero"] == 1
only_good_data = df.loc[nonzero_grade_mask]

In [None]:
# Same row filter applied to the raw frame, so its G3 column lines up with only_good_data.
data2 = data.loc[data["is_zero"] == 1]

In [None]:
 # 70/30 train/test split for the second stage: only students with a non-zero final grade.
 x_train3, x_test3, y_train3, y_test3 = train_test_split(
     only_good_data,
     data2["G3"],
     test_size=0.3,
     random_state=0,
 )

In [None]:
# Repeat the model_params2 grid search on the non-zero-grade subset: 6-fold cross-validation
# on the training split, then best CV score, best hyperparameters and test-set RMSE per model.
results3 = []

from sklearn.metrics import mean_squared_error

for name, param in model_params2.items():
    # `classifier` keeps its name because the plotting cell below reads it after the loop.
    classifier = GridSearchCV(param['model'], param['params'], cv=6, return_train_score=False)
    classifier.fit(x_train3, y_train3)
    # Arguments are passed as (y_true, y_pred); the redundant `squared=True` is dropped because
    # that keyword has been removed from newer scikit-learn releases.
    test_predictions = classifier.predict(x_test3)
    model_performance = np.sqrt(mean_squared_error(y_test3, test_predictions))
    results3.append({
        'model': name,
        'best_score': classifier.best_score_,
        'best_params': classifier.best_params_,
        'RMSE': model_performance
    })
results3

In [None]:
# Distribution of predictions on the non-zero-grade test split. `classifier` was last fitted
# on x_train3, and the next cell plots y_test3, so the matching features are x_test3
# (the original predicted on x_test2, the split that still contains zero grades).
sns.displot(classifier.predict(x_test3))

In [None]:
# Distribution of the true targets of the non-zero-grade test split.
sns.displot(y_test3)