In [1]:
from sklearn.pipeline import Pipeline
from datautils import dataloader
import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import RidgeClassifier, LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import os
from stacking import StackModels

In [2]:
x, y = dataloader(size=10000)

In [3]:
np.mean(y)

0.5

In [4]:
x[0]

array([ 0.3984375 ,  0.35791016,  0.24438477, ..., -0.7792969 ,
       -0.82470703, -0.82177734], dtype=float32)

In [5]:
clf = Pipeline([('pca', PCA(n_components=60)), ('clf', SVC())])

In [6]:
clf.fit(x, y)



Pipeline(memory=None,
         steps=[('pca',
                 PCA(copy=True, iterated_power='auto', n_components=60,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('clf',
                 SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                     decision_function_shape='ovr', degree=3,
                     gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                     probability=False, random_state=None, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)

In [7]:
x_val, y_true = dataloader(data_path=os.path.join('data', 'val'), size=20000)
y_predicted = clf.predict(x_val)
print(accuracy_score(y_true, y_predicted))

0.79475


In [8]:
models = [RandomForestClassifier(n_estimators=300, n_jobs=-1, max_depth=3), LinearRegression(), RidgeClassifier()]
stacked_clf = Pipeline([('stacking', StackModels(models, x, y)), ('clf', SVC())])

In [9]:
stacked_clf.fit(x,y)



Pipeline(memory=None,
         steps=[('stacking',
                 StackModels(models=[RandomForestClassifier(bootstrap=True,
                                                            class_weight=None,
                                                            criterion='gini',
                                                            max_depth=3,
                                                            max_features='auto',
                                                            max_leaf_nodes=None,
                                                            min_impurity_decrease=0.0,
                                                            min_impurity_split=None,
                                                            min_samples_leaf=1,
                                                            min_samples_split=2,
                                                            min_weight_fraction_leaf=0.0,
                                                          

In [6]:
x_val, y_true = dataloader(data_path=os.path.join('data', 'val'), size=20000)
y_spredicted = stacked_clf.predict(x_val)
print(accuracy_score(y_true, y_spredicted))

0.82425


In [6]:
x_val.shape

(4000, 8000)