# Baselines

Características:
* Embeddings
* Conjunto de datos con preprocesamiento

## Imports

In [1]:
import sys

# Make the project's helper modules (imported in the next cell) importable.
# The membership check keeps this cell idempotent: re-running it does not
# append a duplicate entry to sys.path.
if '../../../Scripts/' not in sys.path:
    sys.path.append('../../../Scripts/')

In [2]:
import pandas as pd, numpy as np
import embeddigs as emb
import baseline_model as b
import plots as p
%matplotlib inline
%config InlineBackend.figure_format='retina'

In [3]:
from sklearn.base import clone
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC

## Paths

In [4]:
# Directory holding the train/test CSV splits. The trailing slash matters:
# file names are appended to this string directly.
main_path = "../Datasets/CSV/"

In [5]:
# CSV locations of the train/test splits, built from `main_path`.

# English
data_training_en_path = main_path + 'Train-EN.csv'
data_test_en_path = main_path + 'Test-EN.csv'

# Spanish
data_training_es_path = main_path + 'Train-ES.csv'
data_test_es_path = main_path + 'Test-ES.csv'

## Data

In [6]:
# Read every split and discard rows that contain any missing value.

# English
data_training_en, data_test_en = (
    pd.read_csv(csv_path).dropna()
    for csv_path in (data_training_en_path, data_test_en_path)
)

# Spanish
data_training_es, data_test_es = (
    pd.read_csv(csv_path).dropna()
    for csv_path in (data_training_es_path, data_test_es_path)
)

## General params

In [7]:
# Seed handed to every estimator's random_state below.
seed = 42

# Class names passed to Embeddings as `target_names`.
target_names = ['0', '1']

# Word-vector files, one per language.
en_wv_path, es_wv_path = (
    f'../../../Models/Word vectors/fasttext_{language}_twitter_100d.vec'
    for language in ('english', 'spanish')
)

## Logistic regression

In [8]:
# Logistic-regression baseline: L2 penalty, liblinear solver, seeded for
# reproducibility, with a generous iteration cap.
lr = LogisticRegression(
    penalty='l2',
    solver='liblinear',
    max_iter=10000,
    random_state=seed,
)

### Author

In [9]:
# English and Spanish
# Each wrapper receives its own unfitted copy of `lr`. Passing the very same
# estimator instance to both would let the Spanish fit overwrite the English
# one if Embeddings trains the object it is given in place.
# NOTE(review): confirm how Embeddings uses `ai_model`; cloning is harmless
# either way.
lr_en_fake = emb.Embeddings(
    embedding_path=en_wv_path,
    is_w2v_format=True,
    data_train=data_training_en,
    data_test=data_test_en,
    x_label_column='tweet',
    y_label_column='label',
    ai_model=clone(lr),
    target_names=target_names,
)

lr_es_fake = emb.Embeddings(
    embedding_path=es_wv_path,
    is_w2v_format=True,
    data_train=data_training_es,
    data_test=data_test_es,
    x_label_column='tweet',
    y_label_column='label',
    ai_model=clone(lr),
    target_names=target_names,
)

In [10]:
# Run the English logistic-regression pipeline (prints progress and a
# classification report) and keep both returned objects.
lr_model_fake_en, lr_metrics_fake_en = (
    lr_en_fake.word_embedding()
)

Loading embeddings
Sentence to tokens
Tokens to word vectors
Transforming train and test data
Training model
              precision    recall  f1-score   support

           0       0.59      0.48      0.53     10000
           1       0.56      0.66      0.61     10000

    accuracy                           0.57     20000
   macro avg       0.57      0.57      0.57     20000
weighted avg       0.57      0.57      0.57     20000



In [11]:
# Run the Spanish logistic-regression pipeline (prints progress and a
# classification report) and keep both returned objects.
lr_model_fake_es, lr_metrics_fake_es = (
    lr_es_fake.word_embedding()
)

Loading embeddings
Sentence to tokens
Tokens to word vectors
Transforming train and test data
Training model
              precision    recall  f1-score   support

           0       0.59      0.71      0.65     10000
           1       0.64      0.51      0.57     10000

    accuracy                           0.61     20000
   macro avg       0.62      0.61      0.61     20000
weighted avg       0.62      0.61      0.61     20000



## SVC

In [12]:
# Linear SVM baseline wrapped in a probability calibrator.
# NOTE(review): presumably wrapped so that predict_proba is available
# downstream (LinearSVC alone does not offer it) -- confirm.
svc = CalibratedClassifierCV(
    LinearSVC(
        penalty='l2',
        max_iter=10000,
        random_state=seed,
    )
)

### Author

In [13]:
# English and Spanish
# Each wrapper receives its own unfitted copy of `svc`. Passing the very same
# estimator instance to both would let the Spanish fit overwrite the English
# one if Embeddings trains the object it is given in place.
# NOTE(review): confirm how Embeddings uses `ai_model`; cloning is harmless
# either way.
svc_en_fake = emb.Embeddings(
    embedding_path=en_wv_path,
    is_w2v_format=True,
    data_train=data_training_en,
    data_test=data_test_en,
    x_label_column='tweet',
    y_label_column='label',
    ai_model=clone(svc),
    target_names=target_names,
)

svc_es_fake = emb.Embeddings(
    embedding_path=es_wv_path,
    is_w2v_format=True,
    data_train=data_training_es,
    data_test=data_test_es,
    x_label_column='tweet',
    y_label_column='label',
    ai_model=clone(svc),
    target_names=target_names,
)

In [14]:
# Run the English SVC pipeline (prints progress and a classification
# report) and keep both returned objects.
svc_model_fake_en, svc_metrics_fake_en = (
    svc_en_fake.word_embedding()
)

Loading embeddings
Sentence to tokens
Tokens to word vectors
Transforming train and test data
Training model
              precision    recall  f1-score   support

           0       0.59      0.50      0.54     10000
           1       0.56      0.65      0.60     10000

    accuracy                           0.57     20000
   macro avg       0.57      0.57      0.57     20000
weighted avg       0.57      0.57      0.57     20000



In [15]:
# Run the Spanish SVC pipeline (prints progress and a classification
# report) and keep both returned objects.
svc_model_fake_es, svc_metrics_fake_es = (
    svc_es_fake.word_embedding()
)

Loading embeddings
Sentence to tokens
Tokens to word vectors
Transforming train and test data
Training model
              precision    recall  f1-score   support

           0       0.59      0.71      0.64     10000
           1       0.64      0.52      0.57     10000

    accuracy                           0.61     20000
   macro avg       0.62      0.61      0.61     20000
weighted avg       0.62      0.61      0.61     20000



## Random forest

In [16]:
# Random-forest baseline: trees capped at depth 100, seeded for
# reproducibility; every other hyper-parameter stays at its default.
rf = RandomForestClassifier(
    random_state=seed,
    max_depth=100,
)

### Author

In [17]:
# English and Spanish
# Each wrapper receives its own unfitted copy of `rf`. Passing the very same
# estimator instance to both would let the Spanish fit overwrite the English
# one if Embeddings trains the object it is given in place.
# NOTE(review): confirm how Embeddings uses `ai_model`; cloning is harmless
# either way.
rf_en_fake = emb.Embeddings(
    embedding_path=en_wv_path,
    is_w2v_format=True,
    data_train=data_training_en,
    data_test=data_test_en,
    x_label_column='tweet',
    y_label_column='label',
    ai_model=clone(rf),
    target_names=target_names,
)

rf_es_fake = emb.Embeddings(
    embedding_path=es_wv_path,
    is_w2v_format=True,
    data_train=data_training_es,
    data_test=data_test_es,
    x_label_column='tweet',
    y_label_column='label',
    ai_model=clone(rf),
    target_names=target_names,
)

In [18]:
# Run the English random-forest pipeline (prints progress and a
# classification report) and keep both returned objects.
rf_model_fake_en, rf_metrics_fake_en = (
    rf_en_fake.word_embedding()
)

Loading embeddings
Sentence to tokens
Tokens to word vectors
Transforming train and test data
Training model
              precision    recall  f1-score   support

           0       0.57      0.60      0.58     10000
           1       0.57      0.54      0.56     10000

    accuracy                           0.57     20000
   macro avg       0.57      0.57      0.57     20000
weighted avg       0.57      0.57      0.57     20000



In [19]:
# Run the Spanish random-forest pipeline (prints progress and a
# classification report) and keep both returned objects.
rf_model_fake_es, rf_metrics_fake_es = (
    rf_es_fake.word_embedding()
)

Loading embeddings
Sentence to tokens
Tokens to word vectors
Transforming train and test data
Training model
              precision    recall  f1-score   support

           0       0.59      0.69      0.64     10000
           1       0.63      0.52      0.57     10000

    accuracy                           0.61     20000
   macro avg       0.61      0.61      0.60     20000
weighted avg       0.61      0.61      0.60     20000

