In [56]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
import datetime

from sklearn.ensemble import StackingClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB


In [57]:
def get_dataset():
    X_train = pd.read_csv('clean_data/X_train.csv')
    y_train = pd.read_csv('clean_data/y_train.csv')
    X_test = pd.read_csv('clean_data/X_test.csv')
    return (X_train, y_train, X_test)

In [58]:
def get_stacking():
    level0 = []
    level0.append(('lr', LogisticRegression()))
    level0.append(('knn', KNeighborsClassifier()))
    level0.append(('cart', DecisionTreeClassifier()))
    level0.append(('svm', SVC()))
    level0.append(('bayes', GaussianNB()))
    
    level1 = LogisticRegression()
    
    model = StackingClassifier(estimators=level0, final_estimator=level1, cv = 5)
    return model

In [59]:
def get_models():
    models = {
#         'lr' : LogisticRegression(),
#         'knn' : KNeighborsClassifier(),
#         'cart' : DecisionTreeClassifier(),
#         'svm' : SVC(),
#         'bayes' : GaussianNB(),
#         'stacking' : get_stacking(),
        'rfc' : RandomForestClassifier()
        
    }
    return models

In [60]:
def get_scores(model):
    cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=1)
    data = get_dataset()
    X, y = data[0], data[1]
    
    scores = cross_val_score(model, X, y, scoring='accuracy', cv = cv, n_jobs=-1, error_score='raise')
    model.fit(X, y)
    preds = model.predict(data[2])
    return (np.mean(scores), preds)
    

In [61]:
def make_sub(name, score, answers):
    now = datetime.datetime.now()
    file_name = name + '_' + str(score) + '_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
    test = get_dataset()
    test = test[2]
    out = pd.DataFrame(
        {
            'Id' : test['Id'].values,
            'Answer' : answers
        }
    )
    out.to_csv(file_name, index = False)
    print('File ', file_name, ' created in output/')

In [62]:
scores = {}

models = get_models()
for name, model in models.items():
    print('Working on {}'.format(name))
    scores[name] = get_scores(model)
    make_sub(name, scores[name][0], scores[name][1])
print(scores)

Working on rfc


  model.fit(X, y)


File  rfc_0.8596148614511679_2021-02-15-20-34.csv  created in output/
{'rfc': (0.8596148614511679, array([0, 0, 0, ..., 0, 1, 0]))}
