In [1]:
import pandas as pd
from preprocess import getPopularityIndex
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from preprocess import cleanData, getPopularityIndex, encodePandas
from sklearn.preprocessing import OneHotEncoder, StandardScaler


In [2]:
# Load the pre-built tweet feature table.
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
X = pd.read_pickle('TwitterData_1e5_90rm_MFW.pkl')
# The target is derived from the raw frame, before cleaning.
# NOTE(review): assumes cleanData keeps the rows aligned with y, and that `4` is the
# number of popularity classes — confirm both against preprocess.py.
y = getPopularityIndex(X, 4)
X = cleanData(X)
print('X dataframe has {} rows...'.format(len(X)))
numerical_columns = X._get_numeric_data().columns
# Keep the frame's own column order instead of going through a set difference:
# set iteration order depends on string hashing, so the categorical list (and the
# layout of the encoded features built from it) could change between kernel sessions.
categorical_columns = [column for column in X.columns if column not in numerical_columns]
print('...with numerical columns:')
print(numerical_columns)
print('and categorical columns:')
print(categorical_columns)

print('\n')

# Hold-out split, currently disabled (the cells below evaluate with cross-validation on X, y):
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# print('Training set contains {} samples'.format(len(X_train)))
# print('Test set contains {} samples'.format(len(X_test)))

X dataframe has 20467 rows...
...with numerical columns:
Index(['followers_count', 'friends_count', 'listed_count', 'statuses_count',
       'mentions_tot_followers', 'mentions_tot_friends',
       'mentions_tot_statuses', 'n_sentences', 'n_words', 'n_adjectives',
       'n_adverbs', 'n_nouns', 'n_pronouns', 'n_verbs', 'n_long_words',
       'n_hashtags', 'n_user_mentions'],
      dtype='object')
and categorical columns:
['day', 'has_urls', 'has_questions', 'has_know', 'has_amp', 'has_sorry', 'has_please', 'has_thanks', 'has_photo', 'has_hi', 'time', 'has_see', 'has_like', 'has_symbols', 'has_gif', 'is_reply', 'has_us', 'has_video', 'has_team']




In [3]:
# Per-column-type preprocessing: numeric columns are standardised, categorical
# columns are one-hot encoded (first level dropped, unseen levels ignored).
numerical_preprocessor = StandardScaler()
categorical_preprocessor = OneHotEncoder(drop='first', handle_unknown='ignore')

# Route each column group to its transformer; the encoder output comes first.
preprocessor = ColumnTransformer(
    transformers=[
        ('one-hot-encoder', categorical_preprocessor, categorical_columns),
        ('standard_scaler', numerical_preprocessor, numerical_columns),
    ]
)

In [5]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

# Keep only features whose random-forest importance is at least 0.01.
# The forest is seeded (same seed as the KFold splitters used below) so the
# selected feature set is reproducible across runs.
feature_selection = SelectFromModel(RandomForestClassifier(random_state=1), threshold=0.01)

In [74]:
# Grid searched for the SVC step. Keys follow the `<step>__<parameter>` convention,
# where `svc` is the step name make_pipeline gives to an SVC instance.
hyppar_space = {
    # previously tried: [1, 5, 10, 50, 100, 500, 1000]
    'svc__C': [1, 50, 100, 1000, 2000],
    # previously tried: ['auto', 1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001]
    'svc__gamma': [0.1, 0.01, 0.001],
    # disabled: 'svc__degree': [2, 3, 4]  (previously tried: [1, 2, 3, 4])
    'svc__kernel': ['rbf', 'poly', 'linear', 'sigmoid'],
}

In [12]:
# Nested cross-validation, written inline:
#   - outer 5-fold loop estimates generalisation accuracy,
#   - inner 2-fold grid search picks the SVC hyper-parameters on each outer training split.
cv_outer = KFold(n_splits=5, shuffle=True, random_state=1)
cv_inner = KFold(n_splits=2, shuffle=True, random_state=1)

# Preprocessing and feature selection live inside the pipeline, so they are
# re-fitted on every training split together with the classifier.
model = make_pipeline(preprocessor, feature_selection, SVC())

search = GridSearchCV(
    estimator=model,
    param_grid=hyppar_space,
    scoring='accuracy',
    n_jobs=1,
    cv=cv_inner,
    refit=True,
)

# The outer folds run in parallel (n_jobs=-1); any fit error is raised rather than scored.
scores = cross_validate(
    estimator=search,
    X=X,
    y=y,
    scoring='accuracy',
    cv=cv_outer,
    n_jobs=-1,
    error_score='raise',
    return_estimator=True,
)


KeyboardInterrupt: 

In [8]:
# Display the cross-validation results held in `scores`.
# NOTE(review): the saved output below is a bare array and this cell's execution
# count (8) is lower than that of the cell above (12), so the output appears to
# come from an earlier run — re-run to refresh.
scores

array([0.59819248, 0.57938447, 0.58392377, 0.59223064, 0.59833863])

In [71]:
def nestedCV(features, dependent_y, classifier, space, preprocessor, k_inner=3, k_outer=10, n_jobs_gridsearch=1, scoring='accuracy', random_state=1):
    """Run nested cross-validation for `classifier` on (`features`, `dependent_y`).

    For each outer fold, `preprocessor` is fitted on the outer training split and
    its output is passed to a grid search (inner folds) over a pipeline made of
    random-forest-based feature selection followed by the classifier.

    Parameters
    ----------
    features : feature table handed to `cross_validate` as X.
    dependent_y : target values handed to `cross_validate` as y.
    classifier : zero-argument callable (e.g. the `SVC` class) that returns an
        unfitted estimator.
    space : parameter grid for `GridSearchCV`; keys are relative to the inner
        pipeline (feature selection + classifier), e.g. `svc__C`.
    preprocessor : transformer applied before the grid search.
    k_inner, k_outer : number of folds of the inner / outer KFold.
    n_jobs_gridsearch : `n_jobs` of the inner grid search (the outer loop always
        uses all cores).
    scoring : metric used for both the grid search and the outer evaluation.
    random_state : seed for both KFold splitters and for the feature-selection forest.

    Returns
    -------
    The dict produced by `cross_validate`. Because `return_estimator=True`, its
    'estimator' entry holds one fitted pipeline per outer fold; the fitted grid
    search is that pipeline's last step.
    """
    cv_inner = KFold(n_splits=k_inner, shuffle=True, random_state=random_state)
    cv_outer = KFold(n_splits=k_outer, shuffle=True, random_state=random_state)

    # Seed the forest as well, otherwise the selected features change from run to run.
    feature_selection = SelectFromModel(RandomForestClassifier(random_state=random_state), threshold=0.01)
    model_inner = make_pipeline(feature_selection, classifier())

    search = GridSearchCV(model_inner, space, scoring=scoring, n_jobs=n_jobs_gridsearch, cv=cv_inner, refit=True)
    # The preprocessor sits outside the search: it is fitted once per outer training
    # split, and the inner folds all see that same transformed data.
    model = make_pipeline(preprocessor, search)

    # Evaluate the data passed in by the caller, not the notebook-level X / y.
    scores = cross_validate(model, features, dependent_y, scoring=scoring, cv=cv_outer, n_jobs=-1, return_estimator=True, error_score='raise')

    return scores

In [75]:
# Quick nested-CV run of the SVC grid: 2 outer folds x 2 inner folds.
scores = nestedCV(
    features=X,
    dependent_y=y,
    classifier=SVC,
    space=hyppar_space,
    preprocessor=preprocessor,
    k_inner=2,
    k_outer=2,
)

In [None]:
# Report, for every outer fold, the test accuracy together with the
# hyper-parameters chosen by that fold's grid search.
# cross_validate stores the fitted models under 'estimator' (singular), and the
# scores and models are parallel sequences, so they are paired with zip().
metrics = list(zip(scores['test_score'], scores['estimator']))
for acc, estim in metrics:
    # The fitted object is either the grid search itself or a pipeline that
    # ends with it; best_params_ only exists on the search.
    fold_search = estim if hasattr(estim, 'best_params_') else estim[-1]
    print('accuracy %.3f' % (acc))
    print('estimator params ', fold_search.best_params_)

In [76]:
# Print the test score of each outer fold, one value per line.
accuracy = scores['test_score']
for score in accuracy:
    print(score)

0.5908735587258159
0.5886836704778657


In [52]:
# Fitted models stored in the cross-validation result under the 'estimator' key.
estimators = scores['estimator']

In [53]:
# Materialise the fitted estimators as a plain list so they can be indexed by fold.
e_list = list(estimators)

In [77]:
# Best hyper-parameters found on the first outer fold.
# The stored estimator is either the fitted grid search itself or (as returned by
# nestedCV) a pipeline whose last step is the grid search; only the search object
# carries best_params_, so unwrap the pipeline when needed.
first_estimator = e_list[0]
first_search = first_estimator if hasattr(first_estimator, 'best_params_') else first_estimator[-1]
e1 = first_search.best_params_

In [78]:
# Display the hyper-parameters stored in `e1` (taken from the first estimator in e_list).
e1

{'svc__C': 50, 'svc__gamma': 0.01, 'svc__kernel': 'rbf'}