# Kamień milowy 3 (Finalne modele)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [2]:
# Load the raw congressional voting records; the path is relative, so the CSV is
# expected in the directory the notebook is run from.
congressional_voting_df = pd.read_csv("congressional_voting_dataset.csv")

In [3]:
# Re-encode the raw string values as numbers.
# "?" is now treated as an abstention from voting (its own value, 0) rather than as missing data.
# The lookup is called `vote_encoding` rather than `map` so the builtin `map` is not
# shadowed for the rest of the notebook.
vote_encoding = {"y": 1, "n": -1, "?": 0, "democrat": 1, "republican": 0}
# Alternative that was tried: {"y": 2, "n": 1, "?": 0, "democrat": 1, "republican": 0}
# -- the results were worse with that encoding.

columns = congressional_voting_df.columns.to_list()
for column in columns:
    # Skip columns that are already numeric: re-running this cell would otherwise push
    # the encoded integers through the string lookup and turn every value into NaN.
    if pd.api.types.is_numeric_dtype(congressional_voting_df[column]):
        continue
    congressional_voting_df[column] = congressional_voting_df[column].map(vote_encoding)

congressional_voting_df.head(5)

Unnamed: 0,handicapped_infants,water_project_cost_sharing,adoption_of_the_budget_resolution,physician_fee_freeze,el_salvador_aid,religious_groups_in_schools,anti_satellite_test_ban,aid_to_nicaraguan_contras,mx_missile,immigration,synfuels_corporation_cutback,education_spending,superfund_right_to_sue,crime,duty_free_exports,export_administration_act_south_africa,political_party
0,-1,1,-1,1,1,1,-1,-1,-1,1,0,1,1,1,-1,1,0
1,-1,1,-1,1,1,1,-1,-1,-1,-1,-1,1,1,1,-1,0,0
2,0,1,1,0,1,1,-1,-1,-1,-1,1,-1,1,1,-1,-1,1
3,-1,1,1,-1,0,1,-1,-1,-1,-1,1,-1,1,-1,-1,1,1
4,1,1,1,-1,1,1,-1,-1,-1,-1,1,0,1,1,1,1,1


## Train-test split

In [4]:
# z kamienia 2 pamiętamy, że target jest zrównoważony
# From milestone 2 we recall that the target is balanced (no `stratify` argument is passed).
# 70/30 split with a fixed seed; "political_party" is the target, every other column is a feature.
X_train, X_test, y_train, y_test = train_test_split(
    congressional_voting_df.drop(columns="political_party"),
    congressional_voting_df["political_party"],
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

## Modeling (all variables)

In [5]:
# Maps a short model key to its position in the parallel `params` and `models` lists below.
# The three structures must stay in the same order.
models_dict = {
    "rfc": 0,
    "xgb": 1, # eXtreme gradient boosting
    "gbc": 2, # gradient boosting
    "lr": 3
}

# Hyper-parameter search spaces, one dict per model, indexed via `models_dict`.
params = [
    # Random forest
    {
        'bootstrap': [True, False],
        'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
        # 'auto' was an alias of 'sqrt' for classifiers and is rejected by current
        # scikit-learn releases, which made every candidate that sampled it fail to fit.
        # 'log2' takes its slot so the grid keeps the same number of options.
        'max_features': ['sqrt', 'log2'],
        'min_samples_leaf': [1, 2, 4],
        'min_samples_split': [2, 5, 10],
        'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
    },

    # XGBoost
    {
        'max_depth': [6, 10, 15, 20],
        'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
        'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'colsample_bytree': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'colsample_bylevel': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'min_child_weight': [0.5, 1.0, 3.0, 5.0, 7.0, 10.0],
        'gamma': [0, 0.25, 0.5, 1.0],
        'reg_lambda': [0.1, 1.0, 5.0, 10.0, 50.0, 100.0],
        'n_estimators': [100]},

    # Gradient boosting (scikit-learn)
    {
        'n_estimators': [100, 300, 500, 700, 900, 1200, 1500, 1700, 2000],
        'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
        'max_depth': [4, 6, 8, 10]

    },

    # Logistic regression
    {
        'penalty' : ['l1', 'l2'],
        'C' : np.logspace(-4, 4, 20),
        'solver' : ['liblinear', 'saga']}
]

# Unfitted estimators, in the same order as `params`.
models = [RandomForestClassifier(random_state=42),
          XGBClassifier(use_label_encoder=False),
          GradientBoostingClassifier(random_state=0),
          LogisticRegression(random_state=0)
         ]

# Collects the one-row score frames of the all-variables experiments.
res = []

In [6]:
def get_scores(clf, X_test, y_test, model_type):
    """Score a fitted classifier on the given evaluation data.

    Returns a one-row DataFrame with the columns ``clf`` (the ``model_type``
    label), ``accuracy`` (the value of ``clf.score``) and ``auc`` (ROC AUC
    computed from the second column of ``clf.predict_proba``).
    """
    accuracy = clf.score(X_test, y_test)
    positive_proba = clf.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, positive_proba)

    scores = {"clf": [model_type], "accuracy": [accuracy], "auc": [auc]}
    return pd.DataFrame(scores)

In [7]:
def modeling(X_train, y_train, X_test, y_test, model_type):
    """Fit one model family on the training split and score it on the test split.

    Parameters
    ----------
    X_train, y_train : training features and target.
    X_test, y_test : evaluation features and target, passed on to ``get_scores``.
    model_type : ``"baseline"`` for an untuned ``LogisticRegression``, or one of
        the keys of ``models_dict`` for a model tuned with ``RandomizedSearchCV``
        over the matching entry of ``params``.

    Returns
    -------
    tuple
        ``(fitted_estimator, scores)`` where ``scores`` is the one-row
        DataFrame produced by ``get_scores``.

    Raises
    ------
    KeyError
        If ``model_type`` is neither ``"baseline"`` nor a key of ``models_dict``.
    """
    if model_type == "baseline":
        lr = LogisticRegression(random_state=0)
        lr.fit(X_train, y_train)
        return (lr, get_scores(lr, X_test, y_test, model_type))

    model_index = models_dict[model_type]
    clf = models[model_index]
    parameter_set = params[model_index]

    random_search = RandomizedSearchCV(clf, parameter_set, random_state=0, n_jobs=-1)
    random_search.fit(X_train, y_train)

    # With the default refit=True the search has already refitted the best
    # candidate on the whole training set, so fitting it again would only
    # repeat the training work.
    clf = random_search.best_estimator_

    return (clf, get_scores(clf, X_test, y_test, model_type))

In [8]:
def merge_res_to_df(res):
    """Stack a list of score DataFrames into one frame with a fresh 0..n-1 index.

    Parameters
    ----------
    res : list of DataFrames, e.g. the one-row frames returned by ``get_scores``.

    Returns
    -------
    pandas.DataFrame or None
        The frames concatenated row-wise in list order, or ``None`` when
        ``res`` is empty.
    """
    if len(res) == 0:
        # pd.concat raises on an empty list; keep returning None in that case.
        return None

    # A single concat avoids re-copying the accumulated frame on every iteration.
    return pd.concat(res, axis=0, ignore_index=True)

### Baseline

In [9]:
# Fit the untuned logistic-regression baseline on all variables and record its scores.
# Note: re-running this cell appends a second baseline row to `res`.
baseline, baseline_scores = modeling(X_train, y_train, X_test, y_test, "baseline")
res.append(baseline_scores)
baseline_scores

Unnamed: 0,clf,accuracy,auc
0,baseline,0.984733,0.996164


### Random forest

In [10]:
# Tune a random forest with the randomized search and record its test scores in `res`.
rfc, rfc_scores = modeling(X_train, y_train, X_test, y_test, "rfc")
res.append(rfc_scores)
rfc_scores # worse than the baseline...

Unnamed: 0,clf,accuracy,auc
0,rfc,0.977099,0.995908


### XGBoost

In [11]:
# Tune an XGBoost classifier with the randomized search and record its test scores in `res`.
xgb, xgb_scores = modeling(X_train, y_train, X_test, y_test, "xgb")
res.append(xgb_scores)
xgb_scores



Unnamed: 0,clf,accuracy,auc
0,xgb,0.984733,0.99821


### Gradient boosting

In [12]:
# Tune scikit-learn's gradient boosting with the randomized search and record its test scores in `res`.
gbc, gbc_scores = modeling(X_train, y_train, X_test, y_test, "gbc")
res.append(gbc_scores)
gbc_scores

Unnamed: 0,clf,accuracy,auc
0,gbc,0.977099,0.997442


### Logistic Regression

In [13]:
# Tune logistic regression (penalty, C, solver) with the randomized search and record its test scores in `res`.
lr, lr_scores = modeling(X_train, y_train, X_test, y_test, "lr")
res.append(lr_scores)
lr_scores

Unnamed: 0,clf,accuracy,auc
0,lr,0.984733,0.99335


## Conclusion

### All variables
Ogólnie lepiej niż ok. 0.98 accuracy nie udaje się uzyskać. W porównaniu do kamienia 2 wyniki nieco się polepszyły, prawdopodobnie z powodu zmiany kodowania (i rezygnacji z imputacji `?`).

In [14]:
# Show the scores of all models trained on the full feature set side by side.
merge_res_to_df(res)

Unnamed: 0,clf,accuracy,auc
0,baseline,0.984733,0.996164
1,rfc,0.977099,0.995908
2,xgb,0.984733,0.99821
3,gbc,0.977099,0.997442
4,lr,0.984733,0.99335


# Wyniki bez najważniejszej zmiennej

Spróbujmy zobaczyć, jakie wyniki będą dawać modele po usunięciu najbardziej skorelowanej zmiennej, czyli 'physician_fee_freeze'.

In [15]:
# Remove 'physician_fee_freeze' from both splits so train and test keep the same feature columns.
X_train_droped = X_train.drop(columns='physician_fee_freeze')
X_test_droped = X_test.drop(columns='physician_fee_freeze')

In [16]:
# Repeat every experiment on the feature set without 'physician_fee_freeze'.
# Scores are collected in a separate list so the all-variables results in `res` stay untouched.
# NOTE(review): the assignments below rebind `baseline`, `rfc`, `xgb`, `gbc` and `lr`
# (and their `*_scores`), so the models fitted on all variables are no longer
# reachable under those names after this cell runs.
res_droped = []
# Baseline (untuned logistic regression)
baseline, baseline_scores = modeling(X_train_droped, y_train, X_test_droped, y_test, "baseline")
res_droped.append(baseline_scores)
# Random forest
rfc, rfc_scores = modeling(X_train_droped, y_train, X_test_droped, y_test, "rfc")
res_droped.append(rfc_scores)
# XGBoost
xgb, xgb_scores = modeling(X_train_droped, y_train, X_test_droped, y_test, "xgb")
res_droped.append(xgb_scores)
# Gradient boosting
gbc, gbc_scores = modeling(X_train_droped, y_train, X_test_droped, y_test, "gbc")
res_droped.append(gbc_scores)
# Logistic regression (tuned)
lr, lr_scores = modeling(X_train_droped, y_train, X_test_droped, y_test, "lr")
res_droped.append(lr_scores)
# Display the merged results
merge_res_to_df(res_droped)



Unnamed: 0,clf,accuracy,auc
0,baseline,0.923664,0.971355
1,rfc,0.923664,0.964706
2,xgb,0.923664,0.966496
3,gbc,0.89313,0.95422
4,lr,0.923664,0.970588


Jak widać, bez najważniejszej zmiennej wyniki wyraźnie się pogorszyły: najlepsze accuracy spadło o ok. 0.06 (z 0.98 do 0.92), a najlepsze AUC o ok. 0.03 (z 0.998 do 0.971). Najlepsze accuracy wynosi w tym momencie 0.92.