In [None]:
import pandas as pd
import numpy as np
import requests

from sklearn.model_selection import train_test_split, KFold, cross_validate, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif, RFE, SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor


import dalex as dx

import warnings
warnings.filterwarnings('ignore')

pd.set_option("display.max_columns", None, "display.width", 1000)

### Wczytanie danych

In [None]:
# Download the school-grades dataset; the records live under the "data" key of
# the JSON payload. The timeout keeps the cell from hanging on a stalled
# connection, and raise_for_status() surfaces HTTP errors here rather than as
# a confusing JSON/KeyError further down.
r = requests.get('https://api.apispreadsheets.com/api/dataset/school-grades/', timeout=30)
r.raise_for_status()
data = r.json()
df = pd.DataFrame(data['data'])

df.head()

## Podejście 1. - klasyfikacja

#### Dobór zmiennych i ich przetworzenie

In [None]:
# Rebuild the frame from the downloaded payload so the cell starts from raw values.
df = pd.DataFrame(data['data'])

# Binary target: 0 when the final grade G3 is below 10, otherwise 1.
df['pass'] = np.where(df['G3'] < 10, 0, 1)

# --- New columns derived from the existing ones ---
df['Pedu'] = df['Fedu'] + df['Medu']                       # Fedu and Medu added together
df['genrel'] = df['sex'] + df['romantic']                  # sex and romantic combined with `+`
df['Alc'] = (df['Dalc'] + df['Walc']) / 10                 # Dalc plus Walc, divided by 10
df['absenc'] = np.where(df['absences'] < 8, 0, 1)          # 1 when absences >= 8
df['fail'] = [1 if a > 0 else 0 for a in df['failures']]   # 1 when there was at least one failure

# --- Rescale the selected columns by their own maximum ---
for column in ['Pedu', 'studytime', 'age', 'health', 'goout', 'freetime', 'Dalc', 'absences']:
    df[column] = df[column] / df[column].max()

In [None]:
# Columns used by the model, grouped by how the preprocessor handles them.
cat_features = [
    "Mjob", "higher", "genrel", "address", "reason", "school", "internet",
]
num_features = [
    "Pedu", "studytime", "goout", "age", "fail",
]

In [None]:
# Feature matrix (numeric columns first, then categorical) and the binary target.
features = [*num_features, *cat_features]
X = df.drop(columns=["pass"])[features]
y = df["pass"]

In [None]:
# Numeric columns: constant-strategy imputation with the imputer's default fill value.
num_transformer = SimpleImputer(strategy="constant")

# Categorical columns: fill gaps with the "Unknown" placeholder, then one-hot
# encode (handle_unknown='ignore' for categories not seen during fitting).
cat_steps = [
    ("imputer", SimpleImputer(strategy="constant", fill_value="Unknown")),
    ("onehot", OneHotEncoder(handle_unknown='ignore')),
]
cat_transformer = Pipeline(steps=cat_steps)

# Route each column group to its transformer; any other column passes through untouched.
column_steps = [
    ("num", num_transformer, num_features),
    ("cat", cat_transformer, cat_features),
]
preprocessor = ColumnTransformer(transformers=column_steps, remainder='passthrough')

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Small seeded random forest.
rf_settings = dict(
    n_estimators=10,
    max_features=0.4,
    min_samples_split=2,
    n_jobs=-1,
    random_state=33,
)
rf_model_enh = RandomForestClassifier(**rf_settings)

# Preprocessing and model are chained so both are fitted on the training rows only.
model_pipe = Pipeline(steps=[('preprocessor', preprocessor), ('model', rf_model_enh)])
model_pipe.fit(X_train, y_train)

In [None]:
# Accuracy of the fitted pipeline on the held-out rows.
y_predict = model_pipe.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_predict)

## Podejście 2. - regresja

### Feature engineering 

In [None]:
# Work on a fresh frame built from the raw payload. `df` was rescaled in place
# by the classification section (studytime, age, health, goout, freetime, Dalc,
# absences divided by their maximum), so reading from it here would rescale a
# second time whichever of those columns sit at the positions used below.
reg_df = pd.DataFrame(data['data'])

# Positional column groups (positions refer to the raw dataset layout).
cat_positions = [0,1,3,4,5,8,9,10,11,15,16,17,18,19,20,21,22]   # one-hot encoded
non_encoded_positions = [2,14,29]                                # rescaled one by one below
scaled_positions = [6,7,12,13,23,24,25,26,27,28]                 # divided by 5

enc = OneHotEncoder(drop="if_binary", sparse=False)
alldf = enc.fit_transform(reg_df.iloc[:, cat_positions])

# astype(float) yields an independent float copy, so the in-place rescaling
# below neither writes into a slice of the source frame nor forces float
# values into integer columns.
non_encoded = reg_df.iloc[:, non_encoded_positions].astype(float)
scaled = reg_df.iloc[:, scaled_positions]/5

non_encoded.iloc[:,0] = non_encoded.iloc[:,0]/22
non_encoded.iloc[:,1] = non_encoded.iloc[:,1]/non_encoded.iloc[:,1].max()
non_encoded.iloc[:,2] = non_encoded.iloc[:,2]/non_encoded.iloc[:,2].max()

# Column order of the final matrix: encoded, then non-encoded, then scaled.
X_all = np.hstack([alldf, non_encoded.to_numpy(), scaled.to_numpy()])

# Newer scikit-learn releases replaced get_feature_names with
# get_feature_names_out; use whichever the installed version provides.
encoder_names = getattr(enc, "get_feature_names_out", None) or enc.get_feature_names
encoded_names = encoder_names(input_features=reg_df.columns[cat_positions])
enc_names_list = (
    np.asarray(encoded_names).tolist()
    + non_encoded.columns.tolist()
    + scaled.columns.tolist()
)

X_all = pd.DataFrame(X_all, columns=enc_names_list)
y = reg_df[["G3"]]

X_all.head()

#### Dobór zmiennych korzystając z gotowych funkcji

In [None]:
def feature_names(selector, poly=None, input_columns=None):
    """Return the names of the expanded features kept by a fitted selector.

    Parameters
    ----------
    selector : object
        Fitted selector exposing ``get_support()``, which returns a boolean
        mask over the expanded feature columns.
    poly : object, optional
        Fitted transformer that produced those columns. Defaults to the
        notebook-level ``pf``.
    input_columns : sequence of str, optional
        Names of the columns that were fed to ``poly``. Defaults to
        ``X_train.columns``. ``X_train`` is rebound later in the notebook, so
        pass this explicitly when calling outside the feature-selection section.

    Returns
    -------
    numpy.ndarray
        Names of the expanded features for which the support mask is True.
    """
    if poly is None:
        poly = pf
    if input_columns is None:
        input_columns = X_train.columns
    # Newer scikit-learn releases replaced get_feature_names with
    # get_feature_names_out; use whichever the transformer provides.
    names_of = getattr(poly, "get_feature_names_out", None) or poly.get_feature_names
    return np.array(names_of(input_columns))[selector.get_support()]

#### Podział na zbiór testowy i treningowy

In [None]:
# Keep 10% of the rows aside as the test set; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_all,
    y,
    test_size=0.1,
    random_state=42,
)

#### Wyznaczenie baseline'u

In [None]:
# Baseline: predict the training-set mean for every test row and report its RMSE.
train_mean = np.mean(y_train)
baseline = [train_mean] * len(y_test)

np.sqrt(mean_squared_error(y_test, baseline))

##### Polynomial features

In [None]:
# Expand the inputs with degree-2 polynomial terms. The expander is fitted on
# the training rows only; the test rows are pushed through the already fitted
# expander with transform() instead of being used to refit it.
# NOTE: X_test is overwritten here, so run this cell once per split. Re-running
# it should now fail on the column-count mismatch rather than silently expanding
# the already expanded matrix — TODO confirm on the installed scikit-learn.
pf = PolynomialFeatures(degree=2)
X_features = pf.fit_transform(X_train)
X_test = pf.transform(X_test)

###### chi selector

In [None]:
# Keep the 12 expanded features with the highest chi-squared score.
# NOTE(review): chi2 is documented as a classification score, so the G3 grades
# are presumably treated as class labels here — confirm this is intended.
chi2_selector = SelectKBest(score_func=chi2, k=12)
chi2_selector.fit(X_features, y_train)
feature_names(chi2_selector)

###### mi selector

In [None]:
def _seeded_mutual_info(X, y):
    """Mutual-information score with a fixed seed so repeated runs pick the same features."""
    return mutual_info_classif(X, y, random_state=42)


# Keep the 12 expanded features with the highest mutual information.
# mutual_info_classif is randomised; without a seed the selected set can change
# from run to run while every other step in the notebook is seeded.
mi_selector = SelectKBest(_seeded_mutual_info, k=12)
mi_selector.fit(X_features, y_train)
feature_names(mi_selector)

###### rfe selector

In [None]:
# Recursive feature elimination, currently disabled. The model-training cell
# further down indexes X_features with a hard-coded boolean mask in place of
# rfe_selector.get_support().
# NOTE(review): presumably disabled because of its running time — confirm.
# estimator = LogisticRegression(max_iter=2000)
# rfe_selector = RFE(estimator, n_features_to_select=10, step=1)
# rfe_selector = rfe_selector.fit(X_features, y_train) 
# feature_names(rfe_selector)

#### L1-based feature selection

In [None]:
def selection(num_features, max_iter=60):
    """Bisect the L1 strength ``C`` until ``num_features`` features survive.

    Each step fits ``SelectFromModel`` around an L1-penalised
    ``LogisticRegression`` on the notebook-level ``X_features`` / ``y_train``
    and counts the features the selector keeps. The search assumes that count
    grows with ``C``: too many features lowers the upper bound, too few raises
    the lower bound.

    Parameters
    ----------
    num_features : int
        Number of features the returned selector should keep.
    max_iter : int, default 60
        Maximum number of bisection steps. Without a bound the search would
        never end when no ``C`` in the interval yields exactly
        ``num_features`` features.

    Returns
    -------
    SelectFromModel
        Fitted selector keeping exactly ``num_features`` features, or the
        closest count seen when no exact match was found within ``max_iter``
        steps.

    Raises
    ------
    ValueError
        If ``max_iter`` is smaller than 1.
    """
    if max_iter < 1:
        raise ValueError("max_iter must be at least 1")

    lower, upper = 0.000000000001, 0.1
    best_selector, best_gap = None, None

    for _ in range(max_iter):
        c = (lower + upper) / 2
        model_selector = SelectFromModel(
            LogisticRegression(penalty="l1", C=c, solver="liblinear", random_state=42)
        )
        model_selector.fit(X_features, y_train)
        # Number of kept features, read straight from the support mask.
        feat = int(model_selector.get_support().sum())

        gap = abs(feat - num_features)
        if best_gap is None or gap < best_gap:
            best_selector, best_gap = model_selector, gap

        if feat == num_features:
            break
        if feat > num_features:
            upper = c
        else:
            lower = c
        print("Currently on ", feat, " features.")
    else:
        # Loop ran out of steps without an exact match: fall back to the closest one.
        print("No exact match for ", num_features, " features; returning the closest selector.")

    print("Selected ", int(best_selector.get_support().sum()), " features.")
    return best_selector
            
                
# Ask for 12 features, the same count the SelectKBest selectors keep, and list them.
mod = selection(12)
feature_names(mod)

#### Wytrenowanie 4 wybranych modeli na automatycznie przygotowanych danych

In [None]:
# Training matrices restricted to the columns each selector kept.
chi2_mask = chi2_selector.get_support()
mi_mask = mi_selector.get_support()
X_chi2 = X_features[:, chi2_mask]
X_mi = X_features[:, mi_mask]
# X_rfe = X_features[:, rfe_selector.get_support()]
X_rfe = X_features[:, [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False,  True,  True, False, False,False, False, False, False, False,  True, False, False, False,False, False, False,  True, False, False, False, False, False,False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, 
False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, 
False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False,  True, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False,  True, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False,  True, False, False, True, 
False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False,  True, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False]]
# Training matrix for the L1-based selector.
msel_mask = mod.get_support()
X_msel = X_features[:, msel_mask]

# Matching test matrices for the two SelectKBest selectors.
X_chi2_t = X_test[:, chi2_selector.get_support()]
X_mi_t = X_test[:, mi_selector.get_support()]
X_rfe_t = X_test[:, [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False,  True,  True, False, False,False, False, False, False, False,  True, False, False, False,False, False, False,  True, False, False, False, False, False,False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, 
False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, 
False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False,  True, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False,  True, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False,  True, False, False, True, False, False, 
False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False,  True, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False,False, False, False, False, False, False, False, False, False]]
# X_rfe_t = X_test[:, rfe_selector.get_support()]
X_msel_t = X_test[:, mod.get_support()]

# Train/test matrices in the same order: chi2, mutual information, RFE, L1-based.
X_list = [X_chi2, X_mi, X_rfe, X_msel]
X_list_t = [X_chi2_t, X_mi_t, X_rfe_t, X_msel_t]

# One row of RMSE values per feature set; columns follow the estimator order below.
res = []

for train_matrix, test_matrix in zip(X_list, X_list_t):
    # Fresh estimators for every feature set.
    # NOTE(review): LogisticRegression is a classifier, so each G3 value is
    # presumably treated as a class label — confirm this is intended.
    estimators = [
        LogisticRegression(max_iter=1000),
        SVR(C=1.5),
        RandomForestRegressor(n_estimators=20, max_features=0.5, min_samples_split=3, n_jobs=-1, random_state=0),
        GradientBoostingRegressor(learning_rate=0.045, n_estimators=100, criterion='mse', random_state=0),
    ]

    row_errors = []
    for estimator in estimators:
        estimator.fit(train_matrix, y_train)
        predictions = estimator.predict(test_matrix)
        row_errors.append(np.sqrt(mean_squared_error(y_test, predictions)))

    res.append(row_errors)



### Trenowanie modeli na wybranych i przetworzonych przez nas zmiennych

In [None]:
# Start again from the raw payload so nothing scaled earlier carries over.
df = pd.DataFrame(data['data'])

# New columns derived from the existing ones (added in this order).
df = df.assign(
    Pedu=df['Fedu'] + df['Medu'],
    genrel=df["sex"] + df["romantic"],
    Alc=(df["Dalc"] + df["Walc"]) / 10,
    absenc=np.where(df['absences'] < 8, 0, 1),
    fail=[(1 if a > 0 else 0) for a in df['failures']],
)

# Rescale the selected columns by their own maximum.
columns_to_scale = ["Pedu", "studytime", "age", "health", "goout", "freetime", "Dalc", "absences"]
df = df.assign(**{name: df[name] / df[name].max() for name in columns_to_scale})

##### Dobór kategorycznych i numerycznych zmiennych

In [None]:
# Hand-picked columns, split by the preprocessing branch they go through.
num_features = ["Pedu", "studytime", "goout", "age", "fail"]
cat_features = [
    "Mjob",
    "higher",
    "genrel",
    "address",
    "reason",
    "school",
    "internet",
]

In [None]:
# Feature matrix (numeric columns first, then categorical) and the G3 target.
features = [*num_features, *cat_features]
X = df.drop(columns=["G3"])[features]
y = df["G3"]

In [None]:
# Preprocessing of numeric features: constant-strategy imputation with the default fill value.
num_transformer = SimpleImputer(strategy="constant")

# Preprocessing of categorical features: "Unknown" placeholder, then one-hot encoding.
categorical_imputer = SimpleImputer(strategy="constant", fill_value="Unknown")
categorical_encoder = OneHotEncoder(handle_unknown='ignore')
cat_transformer = Pipeline(steps=[("imputer", categorical_imputer), ("onehot", categorical_encoder)])

# Bundle both branches; columns outside the two lists pass through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, num_features),
        ("cat", cat_transformer, cat_features),
    ],
    remainder='passthrough',
)

#### Wytrenowanie wybranych modeli z dobranymi ręcznie hiperparametrami

In [None]:
# Hold out 10% of the rows; same seed as the automatic feature-selection experiment.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Estimators with hand-tuned hyperparameters.
# NOTE(review): LogisticRegression is a classifier, so each G3 value is
# presumably treated as a class label; LinearRegression is imported at the top
# of the notebook but never used — confirm this is intended.
gb = GradientBoostingRegressor(learning_rate=0.045, n_estimators=100, criterion='mse', random_state=0)
rf = RandomForestRegressor(n_estimators=20, max_features=0.5, min_samples_split=3, n_jobs=-1, random_state=0)
lr = LogisticRegression(max_iter=1000)
svr = SVR(C=1.5)


def _with_preprocessing(model):
    """Chain the shared preprocessor with the given estimator."""
    return Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])


model_pipe_gb = _with_preprocessing(gb)
model_pipe_rf = _with_preprocessing(rf)
model_pipe_lr = _with_preprocessing(lr)
model_pipe_svr = _with_preprocessing(svr)

# Fit in the original order: gradient boosting, random forest, logistic regression, SVR.
for pipe in (model_pipe_gb, model_pipe_rf, model_pipe_lr, model_pipe_svr):
    pipe.fit(X_train, y_train)

model_pipe_svr

#### Sprawdzenie wyników i porównanie modeli

In [None]:
# Predictions of the four hand-tuned pipelines on the held-out rows.
y_predict_gb = model_pipe_gb.predict(X_test)
y_predict_rf = model_pipe_rf.predict(X_test)
y_predict_lr = model_pipe_lr.predict(X_test)
y_predict_svr = model_pipe_svr.predict(X_test)

selector_labels = ["SelectKBest (chi2)", "SelectKBest (mutual information)", "RFE", "L1 Based Model Selection"]
model_labels = ["Logistic Regression", "SVR", "Random Forest", "Gradient Boosting"]

# RMSE per model, in the same column order as the rows already stored in `res`.
hand_prepared_row = [
    np.sqrt(mean_squared_error(y_test, predictions))
    for predictions in (y_predict_lr, y_predict_svr, y_predict_rf, y_predict_gb)
]

# Build the table without appending to `res`, so re-running this cell does not
# add a further row and break the five-label index.
results = pd.DataFrame(
    res[:len(selector_labels)] + [hand_prepared_row],
    columns=model_labels,
    index=selector_labels + ["Hand-prepared features"],
)


In [None]:
# RMSE table: one row per feature-preparation approach, one column per model.
results

#### Analiza najlepszego modelu - Gradient Boosting

In [None]:
# Wrap the fitted gradient-boosting pipeline for dalex, using the hand-prepared X and the G3 target.
explainer = dx.Explainer(model_pipe_gb, data=X, y=y)

###### Zbadanie wpływu zmiennych na predykcję dla dwóch losowych rekordów - przeciętnego i słabego wyniku

In [None]:
# G3 value of the record labelled 10, for comparison with the explanation in the next cell.
y[10]

In [None]:
# Explain the prediction for record 10 and plot the result.
record_10 = X.loc[[10], :]
parts_10 = explainer.predict_parts(record_10)
parts_10.plot()

In [None]:
# G3 value of the record labelled 432, for comparison with the explanation in the next cell.
y[432]

In [None]:
# Explain the prediction for record 432 and plot the result.
record_432 = X.loc[[432], :]
parts_432 = explainer.predict_parts(record_432)
parts_432.plot()