In [3]:
import pandas as pd
import numpy as np
from glob import glob
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

### Regressores

In [4]:
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

### Métricas

In [5]:
from sklearn.metrics import mean_squared_error

## Importando Dataset

In [6]:
# Load every file under BlogFeedback/ as a header-less CSV and stack them into
# a single frame. The paths are sorted because glob() returns them in
# arbitrary order, which would otherwise make the row order differ between runs
# or machines.
csv_paths = sorted(glob('BlogFeedback/*'))
frames = [pd.read_csv(path, header=None) for path in csv_paths]

# Concatenate once (instead of growing the frame inside a loop) and renumber
# the rows 0..N-1 so that row labels stay unique across the input files.
# When no file matches, fall back to an empty frame.
blog_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [27]:
# Display the combined dataset for a quick visual check.
# NOTE(review): the output saved with this cell does not look like a DataFrame
# repr — re-run the cell to refresh it.
blog_df

0    281
dtype: int64

## Separando em dados de Treino e Teste

In [8]:
# Column 280 is the value to predict; every other column is kept as a feature.
y = blog_df.loc[:, 280]
X = blog_df.drop(280, axis=1)

In [9]:
# Hold out 20% of the rows for testing. A fixed random_state makes the shuffle
# reproducible, so re-running the notebook trains and evaluates on the same rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Criando Pipelines

In [10]:
# Template PCA step; its n_components is overridden by the 'pca__n_components'
# entries of the parameter grids.
pca = PCA(n_components=2)


def cls_pipeline(cls):
    """Build a scale -> PCA -> estimator pipeline around ``cls``.

    Parameters
    ----------
    cls : estimator
        The final estimator, stored under the step name ``'cls'``.

    Returns
    -------
    Pipeline
        Pipeline with the steps ``'scale'`` (StandardScaler), ``'pca'`` and
        ``'cls'``, so grid keys take the form ``'pca__...'`` / ``'cls__...'``.
    """
    # Local import keeps this cell self-contained.
    from sklearn.base import clone

    # Pipeline does not copy its steps, so reusing the module-level `pca`
    # object would make every pipeline share (and overwrite) one fitted PCA.
    # Give each pipeline its own unfitted copy of the template instead.
    return Pipeline(steps=[('scale', StandardScaler()),
                           ('pca', clone(pca)),
                           ('cls', cls)])


In [11]:
# comparar Regressão Linear, RandomForest, SVM e Gradient Boosting

In [37]:
# Even component counts 2, 4, 6, ... strictly below the column count of blog_df.
pca_components = list(range(2, len(blog_df.columns), 2))

# Grid with n_jobs pinned to -1; only the number of PCA components varies.
# Presumably intended for the LinearRegression pipeline — confirm at the call site.
rl_params = dict(cls__n_jobs=[-1],
                 pca__n_components=pca_components)

# Search space for a random-forest pipeline (keys use the `<step>__<param>` form).
# NOTE(review): 'mae'/'mse' and max_features='auto' are spellings from older
# scikit-learn releases; newer ones use 'absolute_error'/'squared_error' and
# no longer accept 'auto' — confirm against the installed version.
rf_params = {'cls__n_estimators' : list(range(10, 110, 10)),
            'cls__criterion' : ['mae', 'mse'],
            # max_depth must be an integer or None. np.linspace produced floats
            # (10.0, 120.0, ...), so the same ten values are generated as ints.
            'cls__max_depth' : list(range(10, 1001, 110)) + [None],
            'cls__max_features' : ['auto', 'sqrt','log2'],
            'cls__min_samples_leaf' : list(range(1, 11)),
            'cls__min_samples_split' : list(range(2, 22, 2)),
            'pca__n_components' : pca_components}

# Powers of ten from 1e-4 to 1e3, shared by the C and gamma candidates below.
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]

# Two sub-grids, one per kernel; gamma is searched only in the 'rbf' grid.
svr_params = [
    dict(cls__C=param_range,
         cls__kernel=['linear'],
         pca__n_components=pca_components),
    dict(cls__C=param_range,
         cls__gamma=param_range,
         cls__kernel=['rbf'],
         pca__n_components=pca_components),
]

# Search space for the gradient-boosting pipeline (keys use `<step>__<param>`).
# NOTE(review): the Cartesian product of these lists is enormous (on the order
# of 10^9 candidates when pca_components has ~140 entries); consider trimming
# the lists or a randomized search before running an exhaustive grid search.
# NOTE(review): 'ls'/'lad', 'mse'/'mae' and max_features='auto' are spellings
# from older scikit-learn releases — confirm against the installed version.
gbr_params = {'cls__n_estimators' : list(range(100, 1100, 100)),
            'cls__loss' : ['ls', 'lad', 'huber', 'quantile'],
            # Skip the first linspace value (0.0): older releases reject a zero
            # learning rate, and with it no boosting stage contributes anything.
            'cls__learning_rate' : list(np.linspace(0, 0.3, 13)[1:]),
            'cls__criterion' : ['friedman_mse', 'mse', 'mae'],
            # subsample is a fraction of the training rows and must lie in
            # (0, 1]; the previous upper bound of 2 produced invalid values.
            'cls__subsample' : list(np.linspace(0.25, 1.0, 4)),
            'cls__max_depth' : list(range(3, 11)),
            'cls__max_features' : ['auto', 'sqrt','log2'],
            'cls__min_samples_leaf' : list(range(1, 11)),
            'cls__min_samples_split' : list(range(2, 22, 2)),
            'pca__n_components' : pca_components}

In [39]:
# NOTE(review): despite its name, `pipe_svr` wraps a GradientBoostingRegressor;
# the name is kept so any later cell that refers to it keeps working.
pipe_svr = cls_pipeline(GradientBoostingRegressor())

# Exhaustive 10-fold cross-validated search over gbr_params.
# 'accuracy' is a classification metric and cannot score a regressor with a
# continuous target, so the search is scored with negated mean squared error
# (scikit-learn maximises scores, hence values closer to 0 are better).
gs = GridSearchCV(estimator=pipe_svr,
                  param_grid=gbr_params,
                  scoring='neg_mean_squared_error',
                  refit=True,  # refit the best candidate on the whole training set
                  cv=10,
                  n_jobs=-1)

In [None]:
# Run the grid search on the training split only; the test split stays unseen.
gs.fit(X=X_train, y=y_train)

In [None]:
# Report the best cross-validated score, then the parameters that achieved it.
print(gs.best_score_, gs.best_params_, sep='\n')