## Modelling - Support Vector Classifier

In [7]:
import pandas as pd
import pleiades as ple
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [8]:
import_path = r'..\data\#michellewilliams_users_clean.csv'
df = pd.read_csv(import_path, index_col=0)

import_path = r'..\data\replies_to_jk_rowling_users_clean.csv'
df2 = pd.read_csv(import_path, index_col=0)

In [9]:
df.head()

Unnamed: 0,name_and_description,conservative
0,Boris Is My Prime Minister ✡ ️ 🇬🇧,1.0
1,🇺🇸 🇺🇸 Red Blooded All American 🌾 Heartland bo...,1.0
2,Indian singer Army brat Muay Thai panda Horse ...,0.0
3,Christian SC Gamecocks College Football Outdoo...,1.0
4,Mother of Amma to love my kids love my grandki...,1.0


In [10]:
df = df.fillna('')
df2 = df2.fillna('')

In [11]:
X = df['name_and_description']
X_val = df2['name_and_description']
y = df['conservative'].values
y_val = df2['conservative'].values

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=22)
sebas = ple.Sebastian()

In [14]:
pipe = Pipeline([
    ('tvec', TfidfVectorizer(stop_words='english', token_pattern=r'([^\s]+)')),
    ('svc', SVC(gamma='scale'))
])
params = {
    'tvec__stop_words': ['english'],
    'tvec__ngram_range': [(1, 1), (1, 2)],
    'tvec__max_df': [.3, .6, .9],
    'tvec__min_df': [1, 3, 7],
    'tvec__max_features': [2000, 5000, 10000],
    'svc__C': [0.5, 1, 2],
    'svc__kernel': ['linear'],
}
gs = GridSearchCV(pipe, param_grid=params, cv=5, n_jobs=-1)
gs.fit(X_train, y_train)
# best score: 0.8181384248210024
print('best score:', gs.best_score_)
# best params: svc: C=0.5, kernel='linear' tvec: max_df=0.3, max_features=2000, min_df=3, ngram_range=(1, 2), stop_words='english'
print('best params:', sebas.get_params(gs.best_params_))
print()

best score: 0.8181384248210024
best params: svc: C=0.5, kernel='linear' tvec: max_df=0.3, max_features=2000, min_df=3, ngram_range=(1, 2), stop_words='english'



In [19]:
tvec = TfidfVectorizer(max_df=0.3, max_features=2000, min_df=3, ngram_range=(1, 2), stop_words='english', token_pattern=r'([^\s]+)')
X_train = tvec.fit_transform(X_train)
X_train = pd.DataFrame(X_train.toarray(), columns=tvec.get_feature_names())
X_test = tvec.transform(X_test)
X_test = pd.DataFrame(X_test.toarray(), columns=tvec.get_feature_names())
X_val = tvec.transform(X_val)
X_val = pd.DataFrame(X_val.toarray(), columns=tvec.get_feature_names())

In [21]:
print('TfidfVectorizer:')
print(X_train.sum().sort_values(ascending=False)[:10])
print()

TfidfVectorizer:
️               165.660758
🇺🇸              162.587553
love            113.965834
maga            110.764999
trump           106.812452
life             80.994739
⭐                79.005003
❤                67.392424
god              64.871124
conservative     61.364692
dtype: float64



In [None]:
X_train = X_train[X_train.sum().sort_values(ascending=False).index[1:]]

In [None]:
X_cvec = X_cvec[X_cvec.sum().sort_values(ascending=False).index[1:]]
X_cvec = X_cvec[X_cvec.sum().sort_values(ascending=False).index[1:]]

In [11]:
y_pred = gs.predict(X_test)
y_prob = gs.predict_proba(X_test)

In [26]:
gs.cv_results_

{'mean_fit_time': array([ 8.25839753, 10.16214428, 11.67415142, 11.81114564,  9.64531331,
        10.66667433, 14.78119259, 15.48889356, 12.40952201, 14.51752081,
        10.02088342, 11.35948858, 17.76909628, 18.91327362, 12.18870158,
        13.83443274,  9.40475311, 10.16512318, 11.21743517, 11.81729627,
        11.02310524, 11.61254706,  9.5114202 , 10.3686799 , 14.3278492 ,
        15.16083117, 12.04476061, 13.85853372,  9.37955666, 10.31699576,
        17.51653328, 19.21722097, 13.04698019, 15.40992146, 10.64395947,
        11.79824958, 11.96877432, 12.13632622, 11.2719944 , 11.69647155,
         9.54680681, 10.42145391, 14.4275476 , 15.55455589, 12.3697372 ,
        14.32517138,  9.67368431, 10.70337925, 17.87712502, 18.78123398,
        12.14525084, 14.17706628,  9.70622087, 10.49317389, 10.91665936,
        11.28667512, 10.76865273, 11.46612797,  9.79907146, 10.81130953,
        14.07181072, 14.81214352, 11.81312995, 13.35608897,  9.30089264,
        10.16164808, 17.21784239, 

In [14]:
print('most_important_features:', sebas.get_features(X_train, gs.best_estimator_.coef_.ravel()))

AttributeError: 'Pipeline' object has no attribute 'coef_'