In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import data

from sklearn.decomposition import PCA,TruncatedSVD,NMF
from sklearn.feature_selection import (chi2, f_classif,mutual_info_classif,
                                        SelectKBest,SelectFromModel,VarianceThreshold,RFECV)
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,StratifiedKFold
from sklearn.pipeline import Pipeline, FeatureUnion,TransformerMixin
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.ensemble import (AdaBoostClassifier,BaggingClassifier,ExtraTreesClassifier,
                              GradientBoostingClassifier,RandomForestClassifier,VotingClassifier)
from sklearn.tree import DecisionTreeClassifier,ExtraTreeClassifier
from sklearn.svm import LinearSVC,SVC,NuSVC
from sklearn.naive_bayes import BernoulliNB,GaussianNB,MultinomialNB
from sklearn.linear_model import LogisticRegression,RidgeClassifier,SGDClassifier
from sklearn.neighbors import KNeighborsClassifier,RadiusNeighborsClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import warnings
warnings.filterwarnings("ignore")

Hocanın özellikle koyduğu tekrarlayan satırları düşürmeye yarıyor burası, her birinden o tekrarlayan featureları atıyor.

In [34]:
# Drop same or zero columns
X_train,y_train,X_test = data.load_data()
same_columns = []
for i in range(0,595):
    if i not in same_columns:
        counter = i + 1
        while counter < 595:
            if X_train.iloc[:,i].equals(X_train.iloc[:,counter]):
                same_columns.append(counter)
            counter += 1
same_columns.append(2)
same_columns = list(set(same_columns))
for i in same_columns:
    X_train.drop(columns='X'+str(i+1), axis=1, inplace=True)
    X_test.drop(columns='X'+str(i+1), axis=1, inplace=True)

In [42]:
class ModelTransformer(TransformerMixin):

    def __init__(self, model):
        self.model = model

    def fit(self, *args, **kwargs):
        self.model.fit(*args, **kwargs)
        return self

    def transform(self, X, **transform_params):
        return pd.DataFrame(self.model.predict(X))

classify = FeatureUnion([
                ('gbr', ModelTransformer(GradientBoostingClassifier())),
                ('dtr', ModelTransformer(DecisionTreeClassifier())),
                ('etr', ModelTransformer(ExtraTreesClassifier())),
                ('rfr', ModelTransformer(RandomForestClassifier())),
                ('par', ModelTransformer(LGBMClassifier())),
                ('ada', ModelTransformer(AdaBoostClassifier())),
    ])
model = VotingClassifier()

(120, 496)

Buradan da gridsearch yapıyorsun en temel haliyle, bir classfier'ın birden çok parametresi var. Feature selectorlar da birden fazla, burada mesela hepsini veriyorsun  

In [45]:
pipe = Pipeline([
    ('reduce_dim',SelectKBest(chi2)),
    ('classify', SVC())
])

N_FEATURES_OPTIONS = list(range(10,100,10))#Kaç tane feature seçileceği
C_OPTIONS = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]#Classifier opsiyonları, classifier değişirse onun parametrelerine
#sklearn'den bak, her birinin farklı parametreleri var
gamma = [1,0.1,0.001,0.0001]#Classifier
param_grid = [
    {
        'reduce_dim': [PCA(iterated_power=7), NMF()],
        'reduce_dim__n_components': N_FEATURES_OPTIONS,
        'classify__C': C_OPTIONS,
        'classify__gamma':gamma,
        'classify__kernel':['linear','rbf']
    },
    {
        'reduce_dim': [SelectKBest(chi2)],
        'reduce_dim__k': N_FEATURES_OPTIONS,
        'classify__C': C_OPTIONS,
        'classify__gamma':gamma,
        'classify__kernel':['linear','rbf']
    },
    {
        'reduce_dim': [SelectKBest(f_classif)],
        'reduce_dim__k': N_FEATURES_OPTIONS,
        'classify__C': C_OPTIONS,
        'classify__gamma':gamma,
        'classify__kernel':['linear','rbf']
    },
    {
        'reduce_dim': [SelectKBest(mutual_info_classif)],
        'reduce_dim__k': N_FEATURES_OPTIONS,
        'classify__C': C_OPTIONS,
        'classify__gamma':gamma,
        'classify__kernel':['linear','rbf']
    },
]
k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
grid = GridSearchCV(pipe, n_jobs=1, param_grid=param_grid, scoring="accuracy",cv=k_fold)
grid.fit(X_train,y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=0, shuffle=True),
       error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('reduce_dim', SelectKBest(k=10, score_func=<function chi2 at 0x7f93ed925048>)), ('classify', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))]),
       fit_params=None, iid='warn', n_jobs=1,
       param_grid=[{'reduce_dim': [PCA(copy=True, iterated_power=7, n_components=90, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=10, random_state=None, shuffle=False, solver='cd',
  tol=0.0001, v...0, 10.0, 100.0], 'classify__gamma': [1, 0.1, 0.001, 0.0001], 'classify__kernel': ['linear', 'rbf']}],
       pre_dispatch='2*n_jobs', refit=True

In [53]:
grid.best_params_

{'classify__C': 10.0,
 'classify__gamma': 1,
 'classify__kernel': 'linear',
 'reduce_dim': NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
   n_components=10, random_state=None, shuffle=False, solver='cd',
   tol=0.0001, verbose=0),
 'reduce_dim__n_components': 10}

In [None]:
y_pred = grid.best_estimator_.predict(X_test.loc[:,a])
data.write_output(y_pred.reshape(-1,1))

In [None]:
model = SVC(C=10, kernel='linear',gamma=1)