In [41]:
import pandas as pd
import numpy as np

import sklearn

from sklearn.naive_bayes  import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn import svm
from catboost import CatBoostClassifier

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import f1_score, accuracy_score


from feature_builder import process_dataset, add_text_embeddings, calculate_keyword_encoding
from hyperparameter_tuning import random_search



In [42]:
train_dataset = pd.read_csv('train.csv')

In [43]:
test_dataset = pd.read_csv('test.csv')

In [44]:
y = train_dataset.loc[:,'target']

In [45]:
logisticRegr = LogisticRegression(solver='liblinear', penalty='l1', multi_class='auto', max_iter=1000, C=1)

In [46]:
SVC = svm.SVC(degree=10,coef0=10,C=5)

In [47]:
catboost = CatBoostClassifier(verbose=False)

Variables procesadas

In [48]:
x_processed = process_dataset(train_dataset)

Percentage of words covered in the embeddings = 0.4937444933920705


In [49]:
x_train_processed, x_test_processed, y_train_processed, y_test_processed = train_test_split(x_processed, y, test_size = .30, random_state = 17)

Variables solo embeddings

In [50]:
x_embedd = train_dataset.copy()
add_text_embeddings(x_embedd, False, 'embeddings')
x_embedd.drop(['text', 'location', 'keyword', 'id', 'target'], axis=1, inplace=True)

Percentage of words covered in the embeddings = 0.4937444933920705


In [51]:
x_train_embedd, x_test_embedd, y_train_embedd, y_test_embedd = train_test_split(x_embedd, y, test_size = .30, random_state = 17)

Ensamble

In [52]:
VC = VotingClassifier(estimators=[('lr', logisticRegr), ('svc', SVC), ('catboost', catboost)], voting='hard')

Pruebo procesadas

In [53]:
VC.fit(x_train_processed, y_train_processed)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=1000,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l1',
                                                 random_state=None,
                                                 solver='liblinear', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('svc',
                              SVC(C=5, break_ties=False, cache_size=200,
                                  class_weight=None, coef0=10,
                                  decision_function_shape='ovr', degree=10,
                                  gamm

In [54]:
y_pred_processed = VC.predict(x_test_processed)

In [55]:
f1_score(y_test_processed, y_pred_processed)

0.7869198312236287

Pruebo solo con embedd

In [56]:
VC.fit(x_train_embedd, y_train_embedd)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=1000,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l1',
                                                 random_state=None,
                                                 solver='liblinear', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('svc',
                              SVC(C=5, break_ties=False, cache_size=200,
                                  class_weight=None, coef0=10,
                                  decision_function_shape='ovr', degree=10,
                                  gamm

In [57]:
y_pred_embedd = VC.predict(x_test_embedd)

In [58]:
f1_score(y_test_embedd, y_pred_embedd)

0.7819548872180452

Dado el resultado, paso a exportar el cvs con el VC entrenado con todo el train_set.

In [None]:
x_test_proccesed = process_dataset(test_dataset)

In [39]:
VC.fit(x_processed, y)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=1000,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l1',
                                                 random_state=None,
                                                 solver='liblinear', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('svc',
                              SVC(C=5, break_ties=False, cache_size=200,
                                  class_weight=None, coef0=10,
                                  decision_function_shape='ovr', degree=10,
                                  gamm

In [32]:
y_pred = VC.predict(x_test_proccesed)

In [33]:
ids = test_dataset['id']
final_df = pd.DataFrame({'target': [x for x in y_pred]}, index=ids)

In [34]:
final_df.to_csv('VC-SVM-CB-LR-processed.csv')