# Ostateczne modele
## Przewidywanie przynależności partyjnej na podstawie oddanych głosów

### Wczytanie i przetworzenie danych

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv('congressional_voting_dataset.csv')

d = {'y':1, 'n':-1, '?':0, 'democrat': 0, 'republican':1}
for i in df.columns:
    df[i] = df[i].map(d)
df.rename(columns={'political_party': 'is_republican'}, inplace=True)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df[df.columns[:16]], df['is_republican'], test_size=0.3, random_state=42)
df

Unnamed: 0,handicapped_infants,water_project_cost_sharing,adoption_of_the_budget_resolution,physician_fee_freeze,el_salvador_aid,religious_groups_in_schools,anti_satellite_test_ban,aid_to_nicaraguan_contras,mx_missile,immigration,synfuels_corporation_cutback,education_spending,superfund_right_to_sue,crime,duty_free_exports,export_administration_act_south_africa,is_republican
0,-1,1,-1,1,1,1,-1,-1,-1,1,0,1,1,1,-1,1,1
1,-1,1,-1,1,1,1,-1,-1,-1,-1,-1,1,1,1,-1,0,1
2,0,1,1,0,1,1,-1,-1,-1,-1,1,-1,1,1,-1,-1,0
3,-1,1,1,-1,0,1,-1,-1,-1,-1,1,-1,1,-1,-1,1,0
4,1,1,1,-1,1,1,-1,-1,-1,-1,1,0,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
430,-1,-1,1,1,1,1,-1,-1,1,1,-1,1,1,1,-1,1,1
431,-1,-1,1,-1,-1,-1,1,1,1,1,-1,-1,-1,-1,-1,1,0
432,-1,0,-1,1,1,1,-1,-1,-1,-1,1,1,1,1,-1,1,1
433,-1,-1,-1,1,1,1,0,0,0,0,-1,1,1,1,-1,1,1


### Wybór modeli

In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import VotingClassifier

#### Przygotowanie tablicy do zapisywania wyników.

In [3]:
indexes = ["roc_auc", "f1", "accuracy", "precision", "recall"]
scores = pd.DataFrame(0, index=indexes, columns=[])

#### Podstawowe modele

In [4]:
models = [
    DecisionTreeClassifier(random_state=1, max_depth=3),
    KNeighborsClassifier(),
    LogisticRegression(random_state=1, max_iter=1000),
    RandomForestClassifier(max_depth=6, min_samples_split = 2, max_features = 3, random_state=1, n_jobs = -1),
    AdaBoostClassifier(random_state=1),
    GradientBoostingClassifier(random_state=1, learning_rate=0.01),
    XGBClassifier(random_state=1, learning_rate=0.01, booster='gbtree', max_depth=4, eval_metric="logloss", use_label_encoder=False)
]

In [5]:
from sklearn.model_selection import cross_val_score
m = ["roc_auc", "f1", "accuracy", "precision", "recall"]
for i in m:
    for model in models:
        scr = cross_val_score(model, X_train, y_train, scoring=i, cv=10).mean()
        scores.at[i,type(model).__name__] = scr

#### Modele wykorzystujące BaggingClassifier

In [6]:
baggingmodels = [ 
    BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=3), random_state=1, n_jobs = -1),
    BaggingClassifier(base_estimator=KNeighborsClassifier(), random_state=1, n_jobs = -1),
    BaggingClassifier(base_estimator=LogisticRegression(max_iter=1000), random_state=1, n_jobs = -1),
    BaggingClassifier(base_estimator=AdaBoostClassifier(), random_state=1, n_jobs = -1)
]

In [7]:
base_models = ['(DecisionTree)', '(KNeighbors)', '(LogisticRegression)', '(AdaBoost)']
for i in m:
    j = 0
    for model in baggingmodels:
        scr = cross_val_score(model, X_train, y_train, scoring=i, cv=10).mean()
        scores.at[i,type(model).__name__ + base_models[j]] = scr
        j += 1

#### Porównanie wyników pierwszych modeli

In [8]:
scores_table = scores.transpose()
scores_table['mean'] = scores_table.mean(axis=1)
table1 = scores_table.sort_values("mean", ascending=False)
table1

Unnamed: 0,roc_auc,f1,accuracy,precision,recall,mean
RandomForestClassifier,0.985877,0.951561,0.96043,0.947802,0.958333,0.960801
BaggingClassifier(AdaBoost),0.993807,0.946481,0.956989,0.954945,0.942308,0.958906
LogisticRegression,0.990936,0.946845,0.956989,0.947308,0.95,0.958416
AdaBoostClassifier,0.988347,0.946842,0.957097,0.94707,0.95,0.957871
BaggingClassifier(LogisticRegression),0.992763,0.943823,0.953763,0.933114,0.958333,0.956359
BaggingClassifier(KNeighbors),0.97723,0.930241,0.940538,0.89304,0.975,0.94321
BaggingClassifier(DecisionTree),0.979602,0.924783,0.937312,0.911777,0.942308,0.939156
KNeighborsClassifier,0.967836,0.926342,0.937204,0.891832,0.966667,0.937976
XGBClassifier,0.982807,0.920062,0.934194,0.919927,0.926282,0.936654
GradientBoostingClassifier,0.977924,0.918805,0.934194,0.924231,0.917308,0.934492


Pięć najlepszych modeli zostało wybranych do dalszej pracy.

#### Modele VotingClassifier oraz StackingClassifier

In [9]:
votingmodels = [
    ('RandomForestClassifier', RandomForestClassifier(max_depth=6, min_samples_split = 2, max_features = 3, random_state=1, n_jobs = -1)),
    ('BaggingClassifier(AdaBoost)', BaggingClassifier(base_estimator=AdaBoostClassifier(), random_state=1, n_jobs = -1)),
    ('LogisticRegression', LogisticRegression(random_state=1, max_iter=1000)),
    ('AdaBoostClassifier', AdaBoostClassifier(random_state=1)),
    ('BaggingClassifier(LogisticRegression)', BaggingClassifier(base_estimator=LogisticRegression(max_iter=1000), random_state=1, n_jobs = -1))
]

In [10]:
model_soft = VotingClassifier(estimators=votingmodels, voting='soft', n_jobs=-1)
for i in m:
    scr = cross_val_score(model_soft, X_train, y_train, scoring=i, cv=10).mean()
    scores.at[i,type(model_soft).__name__] = scr

In [11]:
stacking = StackingClassifier(estimators=votingmodels, final_estimator=LogisticRegression(), n_jobs=-1)
for i in m:
    scr = cross_val_score(stacking, X_train, y_train, scoring=i, cv=10).mean()
    scores.at[i,type(stacking).__name__] = scr

#### Kolejne porównanie wyników

In [12]:
scores_table = scores.transpose()
scores_table['mean'] = scores_table.mean(axis=1)
table2 = scores_table.sort_values("mean", ascending=False)
table2

Unnamed: 0,roc_auc,f1,accuracy,precision,recall,mean
VotingClassifier,0.992714,0.951193,0.960215,0.947308,0.958333,0.961953
RandomForestClassifier,0.985877,0.951561,0.96043,0.947802,0.958333,0.960801
BaggingClassifier(AdaBoost),0.993807,0.946481,0.956989,0.954945,0.942308,0.958906
StackingClassifier,0.992714,0.94686,0.956882,0.946667,0.95,0.958625
LogisticRegression,0.990936,0.946845,0.956989,0.947308,0.95,0.958416
AdaBoostClassifier,0.988347,0.946842,0.957097,0.94707,0.95,0.957871
BaggingClassifier(LogisticRegression),0.992763,0.943823,0.953763,0.933114,0.958333,0.956359
BaggingClassifier(KNeighbors),0.97723,0.930241,0.940538,0.89304,0.975,0.94321
BaggingClassifier(DecisionTree),0.979602,0.924783,0.937312,0.911777,0.942308,0.939156
KNeighborsClassifier,0.967836,0.926342,0.937204,0.891832,0.966667,0.937976


### Strojenie hiperparametrów

In [13]:
from sklearn.model_selection import GridSearchCV

#### Znalezienie hiperparametrów z użyciem GridSearch

In [14]:
tuned_model = RandomForestClassifier(max_depth=6, min_samples_split = 2, max_features = 3, random_state=1, n_jobs = -1)
parameters = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 4, 6, 8],
    'min_samples_split': [2, 4, 6, 8],
    'max_features': ['auto', 'sqrt', 3, 6, 9]
}
clf_grid = GridSearchCV(tuned_model, parameters, cv=5, n_jobs=-1)
clf_grid.fit(X_train, y_train)
results = pd.DataFrame(clf_grid.cv_results_)[['params', 'mean_test_score']].sort_values('mean_test_score', ascending=False)

print(results.head(1))
print()
print(results.head(1).params.item())

                                                params  mean_test_score
120  {'criterion': 'entropy', 'max_depth': 6, 'max_...         0.960601

{'criterion': 'entropy', 'max_depth': 6, 'max_features': 'auto', 'min_samples_split': 2}


In [15]:
tuned_model = BaggingClassifier(base_estimator=AdaBoostClassifier(), random_state=1, n_jobs = -1)
parameters = {
    'n_estimators': [10, 50, 100],
    'max_samples': [1.0, 0.5, 0.1],
    'max_features': [1.0, 0.5, 0.1]
}
clf_grid = GridSearchCV(tuned_model, parameters, cv=5, n_jobs=-1)
clf_grid.fit(X_train, y_train)
results = pd.DataFrame(clf_grid.cv_results_)[['params', 'mean_test_score']].sort_values('mean_test_score', ascending=False)

print(results.head(1))
print()
print(results.head(1).params.item())

                                              params  mean_test_score
4  {'max_features': 1.0, 'max_samples': 0.5, 'n_e...         0.957322

{'max_features': 1.0, 'max_samples': 0.5, 'n_estimators': 50}


In [16]:
tuned_model = LogisticRegression(random_state=1, max_iter=1000)
parameters = {
    'tol': [1e-3, 1e-4, 1e-6],
    'C': [0.01, 0.1, 1.0, 10.0, 100.0]
}
clf_grid = GridSearchCV(tuned_model, parameters, cv=5, n_jobs=-1)
clf_grid.fit(X_train, y_train)
results = pd.DataFrame(clf_grid.cv_results_)[['params', 'mean_test_score']].sort_values('mean_test_score', ascending=False)

print(results.head(1))
print()
print(results.head(1).params.item())

                     params  mean_test_score
6  {'C': 1.0, 'tol': 0.001}         0.960601

{'C': 1.0, 'tol': 0.001}


In [17]:
tuned_model = AdaBoostClassifier(random_state=1)
parameters = {
    'n_estimators': [10, 50, 100, 500],
    'base_estimator': [DecisionTreeClassifier(max_depth=1), LogisticRegression()],
    'learning_rate': [0.001, 0.01, 0.1, 1.0]
}
clf_grid = GridSearchCV(tuned_model, parameters, cv=5, n_jobs=-1)
clf_grid.fit(X_train, y_train)
results = pd.DataFrame(clf_grid.cv_results_)[['params', 'mean_test_score']].sort_values('mean_test_score', ascending=False)

print(results.head(1))
print()
print(results.head(1).params.item())

                                               params  mean_test_score
11  {'base_estimator': DecisionTreeClassifier(max_...         0.960601

{'base_estimator': DecisionTreeClassifier(max_depth=1), 'learning_rate': 0.1, 'n_estimators': 500}


In [18]:
tuned_model = BaggingClassifier(base_estimator=LogisticRegression(max_iter=1000), random_state=1, n_jobs = -1)
parameters = {
    'n_estimators': [10, 50, 100],
    'max_samples': [1.0, 0.5, 0.1],
    'max_features': [1.0, 0.5, 0.1]
}
clf_grid = GridSearchCV(tuned_model, parameters, cv=5, n_jobs=-1)
clf_grid.fit(X_train, y_train)
results = pd.DataFrame(clf_grid.cv_results_)[['params', 'mean_test_score']].sort_values('mean_test_score', ascending=False)

print(results.head(1))
print()
print(results.head(1).params.item())

                                              params  mean_test_score
0  {'max_features': 1.0, 'max_samples': 1.0, 'n_e...         0.960601

{'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 10}


#### Utworzenie i przetestowanie modeli z nowymi hiperparametrami

In [19]:
tunedmodels = [
    ('tunedRandomForestClassifier', RandomForestClassifier(criterion='entropy', max_depth=6, max_features = 'auto', min_samples_split = 2, random_state=1, n_jobs = -1)),
    ('tunedBaggingClassifier(AdaBoost)', BaggingClassifier(base_estimator=AdaBoostClassifier(), max_features=1.0, max_samples=0.5, n_estimators=50, random_state=1, n_jobs = -1)),
    ('tunedLogisticRegression', LogisticRegression(C=1.0, tol=0.001, random_state=1, max_iter=1000)),
    ('tunedAdaBoostClassifier', AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1), learning_rate=0.1, n_estimators=500, random_state=1)),
    ('tunedBaggingClassifier(LogisticRegression)', BaggingClassifier(base_estimator=LogisticRegression(max_iter=1000), max_features=1.0, max_samples=1.0, n_estimators=10, random_state=1, n_jobs = -1))
]

In [21]:
for i in m:
    for name,model in tunedmodels:
        scr = cross_val_score(model, X_train, y_train, scoring=i, cv=10).mean()
        scores.at[i,name] = scr

#### Ponowne sprawdzenie modeli VotingClassifier i StackingClassifier

In [22]:
final_model_soft = VotingClassifier(estimators=tunedmodels, voting='soft', n_jobs=-1)
for i in m:
    scr = cross_val_score(final_model_soft, X_train, y_train, scoring=i, cv=10).mean()
    scores.at[i,'tuned' + type(final_model_soft).__name__] = scr

In [23]:
final_stacking = StackingClassifier(estimators=tunedmodels, final_estimator=LogisticRegression(), n_jobs=-1)
for i in m:
    scr = cross_val_score(final_stacking, X_train, y_train, scoring=i, cv=10).mean()
    scores.at[i,'tuned' + type(final_stacking).__name__] = scr

#### Porównanie modeli przed i po strojeniu hiperparametrów

In [24]:
scores_table = scores.transpose()
scores_table['mean'] = scores_table.mean(axis=1)
table3 = scores_table.sort_values("mean", ascending=False)
table3

Unnamed: 0,roc_auc,f1,accuracy,precision,recall,mean
VotingClassifier,0.992714,0.951193,0.960215,0.947308,0.958333,0.961953
RandomForestClassifier,0.985877,0.951561,0.96043,0.947802,0.958333,0.960801
BaggingClassifier(AdaBoost),0.993807,0.946481,0.956989,0.954945,0.942308,0.958906
StackingClassifier,0.992714,0.94686,0.956882,0.946667,0.95,0.958625
LogisticRegression,0.990936,0.946845,0.956989,0.947308,0.95,0.958416
tunedLogisticRegression,0.990936,0.946845,0.956989,0.947308,0.95,0.958416
AdaBoostClassifier,0.988347,0.946842,0.957097,0.94707,0.95,0.957871
tunedAdaBoostClassifier,0.987777,0.946198,0.956989,0.954212,0.941667,0.957369
tunedBaggingClassifier(LogisticRegression),0.992763,0.943823,0.953763,0.933114,0.958333,0.956359
BaggingClassifier(LogisticRegression),0.992763,0.943823,0.953763,0.933114,0.958333,0.956359


Strojenie hiperparametrów nie wpłynęło pozytywnie na żaden z modeli.

### Wybór pięciu ostatecznych modeli

In [25]:
finalmodels = [
    ('VotingClassifier', VotingClassifier(estimators=votingmodels, voting='soft', n_jobs=-1)),
    ('RandomForestClassifier', RandomForestClassifier(max_depth=6, min_samples_split = 2, max_features = 3, random_state=1, n_jobs = -1)),
    ('BaggingClassifier(AdaBoost)', BaggingClassifier(base_estimator=AdaBoostClassifier(), random_state=1, n_jobs = -1)),
    ('StackingClassifier', StackingClassifier(estimators=votingmodels, final_estimator=LogisticRegression(), n_jobs=-1)),
    ('LogisticRegression', LogisticRegression(random_state=1, max_iter=1000))
]

### Sprawdzenie wyników na zbiorze testowym

In [26]:
finalscores = pd.DataFrame(0, index=['score'], columns=[])

for name,model in finalmodels:
    model.fit(X_train, y_train)
    finalscores.at['score', name] = model.score(X_test, y_test)

In [27]:
scores_table = finalscores.transpose()
table4 = scores_table.sort_values("score", ascending=False)
table4

Unnamed: 0,score
VotingClassifier,0.984733
StackingClassifier,0.984733
LogisticRegression,0.984733
RandomForestClassifier,0.961832
BaggingClassifier(AdaBoost),0.961832
