In [160]:
import pandas as pd
import numpy as np

import sklearn
from sklearn import svm
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

from hyperparameter_tuning import random_search

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from feature_builder import process_dataset, add_text_embeddings


In [162]:
# Load the labelled training set (path is relative to the notebook's working directory).
train_dataset = pd.read_csv('train.csv')

In [163]:
# Load the test set; the final predictions exported at the end are computed for these rows.
test_dataset = pd.read_csv('test.csv')

In [164]:
# Label vector shared by every experiment below: the 'target' column of the training frame.
y = train_dataset['target']

In [165]:
# One SVC built with scikit-learn's default hyperparameters.
# The same instance is refit by each experiment below, so only the most recent fit is retained.
clf = svm.SVC()

In [161]:
# Candidate hyperparameter values handed to random_search for the SVC.
# NOTE(review): 'C' and 'coef0' read like (start, stop, step) triples, but the search
# output reports values taken directly from these lists, so they act as plain candidate
# lists and 0.1 is listed twice for 'C' — confirm the intended search space.
param = dict(
    kernel=('linear', 'rbf', 'sigmoid', 'poly'),
    C=[0.1, 5, 0.1],
    degree=[1, 10],
    coef0=[0.0, 10, 0.1],
    gamma=('auto', 'scale'),
)

Pruebo con tf-idf

In [142]:
# Baseline features: fit a TF-IDF vectoriser on the raw 'text' column and transform it in one step.
# NOTE(review): the vectoriser is fitted on every training row before the train/test split
# in the next cell, so held-out rows also contribute to the fitted vocabulary/weights —
# confirm this is acceptable for the reported score.
v = TfidfVectorizer()
x_tfidf = v.fit_transform(train_dataset['text'])


In [143]:
# Hold out 33% of the TF-IDF rows for evaluation; the fixed seed keeps the split repeatable.
(x_train_tfidf, x_test_tfidf,
 y_train_tfidf, y_test_tfidf) = train_test_split(
    x_tfidf, y, test_size=0.33, random_state=17
)

In [158]:
# Train the shared SVC on the TF-IDF training split.
# NOTE(review): the recorded output below reports kernel='linear', while clf is created
# with svm.SVC() defaults in this notebook — the output looks like it came from an
# earlier kernel state; re-run from a fresh kernel to confirm the score.
clf.fit(x_train_tfidf, y_train_tfidf)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [159]:
# Predict on the held-out TF-IDF split; the cell's last expression displays the F1 score.
y_pred_tfidf = clf.predict(x_test_tfidf)
f1_score(y_test_tfidf, y_pred_tfidf)

0.7466802860061288

Ahora voy a probar con nuestro generador de features, que combina embeddings con features construidas a mano.

In [10]:
# Build the feature matrix with the project's process_dataset helper (imported from feature_builder).
x_processed = process_dataset(train_dataset)

Percentage of words covered in the embeddings = 0.6336399642263958


In [11]:
# Same 67/33 split and seed as the TF-IDF experiment, applied to the process_dataset features.
(x_train_processed, x_test_processed,
 y_train_processed, y_test_processed) = train_test_split(
    x_processed, y, test_size=0.33, random_state=17
)

In [12]:
# Refit the shared SVC on the process_dataset training split (this replaces the previous fit).
clf.fit(x_train_processed, y_train_processed)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [13]:
# Predict on the held-out process_dataset split; the cell's last expression displays the F1 score.
y_pred_processed = clf.predict(x_test_processed)
f1_score(y_test_processed, y_pred_processed)

0.5270758122743683

Como los resultados no son muy buenos, voy a probar solo con los embeddings.

In [115]:
# Embedding-only feature matrix: work on a copy of the training frame, call
# add_text_embeddings on that copy (its return value is not used), then strip the
# raw columns and the label so only the remaining columns are used as features.
x_embedd = train_dataset.copy()
add_text_embeddings(x_embedd)
x_embedd = x_embedd.drop(columns=['text', 'location', 'keyword', 'id', 'target'])

Percentage of words covered in the embeddings = 0.6336399642263958


Me quedé solo con los embeddings, ya que el resto de las features deteriora el rendimiento del algoritmo por alguna razón.

In [166]:
# Same 67/33 split and seed as the earlier experiments, applied to the embedding-only features.
(x_train_embedd, x_test_embedd,
 y_train_embedd, y_test_embedd) = train_test_split(
    x_embedd, y, test_size=0.33, random_state=17
)

In [167]:
# Run the project's random_search helper over `param` on the embedding training split.
# The recorded run took about 1.5 hours, so avoid re-running it casually.
# NOTE(review): the return value is not captured and the printed best hyperparameters
# are never applied to clf in this notebook — the later fits still report default
# settings (C=1.0, degree=3). Confirm whether the tuned values were meant to be used.
random_search(x_train_embedd,y_train_embedd,clf,param)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  1.1min
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:  5.4min
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed: 91.5min finished



 Time taken: 1 hours 31 minutes and 40.01 seconds.

 Best f1 score with 5-folds and 40 combinations of hyperparameters:
0.7548812992789902

 Best hyperparameters:
{'kernel': 'rbf', 'gamma': 'scale', 'degree': 10, 'coef0': 10, 'C': 5}


In [120]:
# Refit the shared SVC on the embedding-only training split.
# NOTE(review): per the recorded output this fit uses the default hyperparameters,
# not the best combination printed by the random search — confirm this is intended.
clf.fit(x_train_embedd, y_train_embedd)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [121]:
# Predict on the held-out embedding split; the cell's last expression displays the F1 score.
y_pred_embedd = clf.predict(x_test_embedd)
f1_score(y_test_embedd, y_pred_embedd)

0.7602397602397603

=0.76, 0.74, 0.7447535383113714

Acá obtuve un resultado mucho mejor; por ende, voy a entrenar el modelo de esta manera con todo el dataset y exportar las predicciones a CSV.

In [65]:
# Full-training feature matrix, built the same way as the embedding-only experiment:
# copy the frame, call add_text_embeddings on the copy, then drop the raw columns and the label.
# Uses the imported add_text_embeddings; the previous `_add_text_embeddings` name is not
# imported or defined anywhere in this notebook and raises NameError on a fresh kernel.
x_train = train_dataset.copy()
add_text_embeddings(x_train)
x_train = x_train.drop(columns=['text', 'location', 'keyword', 'id', 'target'])

In [66]:
# Test feature matrix, built like the training one. No 'target' is dropped here,
# matching the original cell, which only removed the four raw columns.
# Uses the imported add_text_embeddings; the previous `_add_text_embeddings` name is not
# imported or defined anywhere in this notebook and raises NameError on a fresh kernel.
x_test = test_dataset.copy()
add_text_embeddings(x_test)
x_test = x_test.drop(columns=['text', 'location', 'keyword', 'id'])

In [67]:
# Final model: refit the shared SVC on every training row (no hold-out) before predicting the test set.
clf.fit(x_train, y)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [68]:
# Predict labels for the test set.
# Note: this rebinds y_pred_embedd, which earlier held the hold-out predictions of the
# embedding experiment; after this cell it holds the test-set predictions instead.
y_pred_embedd = clf.predict(x_test)

In [69]:
# Submission frame: a single 'target' column of predictions, indexed by the test rows' 'id'.
ids = test_dataset['id']
final_df = pd.DataFrame({'target': list(y_pred_embedd)}, index=ids)

In [71]:
# Write the predictions to disk; the 'id' index is written as the first column.
final_df.to_csv('SVM-algo4.csv')