# Baselines

Características:
* Embeddings
* Conjunto de datos con preprocesamiento

## Imports

In [32]:
import sys
sys.path.append('../../../Scripts/')

In [33]:
import pandas as pd, numpy as np
import embeddigs as emb
import baseline_model as b
import plots as p
%matplotlib inline
%config InlineBackend.figure_format='retina'

In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV

## Paths

In [35]:
main_path = '../Datasets/CSV/Clean/'

In [36]:
# English
data_training_en_path = f'{main_path}data_training_en.csv'
data_test_en_path = f'{main_path}data_test_en.csv'

# Spanish
data_training_es_path = f'{main_path}data_training_es.csv'
data_test_es_path = f'{main_path}data_test_en.csv'

## Data

In [37]:
# English
data_training_en = pd.read_csv(data_training_en_path).dropna()
data_test_en = pd.read_csv(data_test_en_path).dropna()

# Spanish
data_training_es = pd.read_csv(data_training_es_path).dropna()
data_test_es = pd.read_csv(data_test_es_path).dropna()

In [38]:
data_training_gender_en = data_training_en[data_training_en.author != 'bot']
data_test_gender_en = data_test_en[data_test_en.author != 'bot']

data_training_gender_es = data_training_es[data_training_es.author != 'bot']
data_test_gender_es = data_test_es[data_test_es.author != 'bot']

## General params

In [39]:
seed = 42
target_names = ['human', 'bot']
gender_names = ['gender_1', 'gender_2']
en_wv_path = '../../../Models/Word vectors/fasttext_english_twitter_100d.vec'
es_wv_path = '../../../Models/Word vectors/fasttext_spanish_twitter_100d.vec'

## Logistic regression

In [40]:
# Defining model
lr = LogisticRegression(random_state = seed, penalty = 'l2', 
                        solver = 'liblinear', max_iter = 10000)

### Author

In [41]:
# English and Spanish
lr_en_author = emb.Embeddings(embedding_path=en_wv_path, 
                            is_w2v_format=True, 
                            data_train=data_training_en, 
                            data_test=data_test_en, 
                            x_label_column='tweet', 
                            y_label_column='author', 
                            ai_model=lr, 
                            target_names=target_names)

lr_es_author = emb.Embeddings(embedding_path=es_wv_path, 
                            is_w2v_format=True, 
                            data_train=data_training_es, 
                            data_test=data_test_es, 
                            x_label_column='tweet', 
                            y_label_column='author', 
                            ai_model=lr, 
                            target_names=target_names)

In [42]:
lr_model_author_en, lr_metrics_author_en = lr_en_author.word_embedding()

Loading embeddings
Sentence to tokens
Tokens to word vectors
Transforming train and test data
Training model
              precision    recall  f1-score   support

       human       0.72      0.63      0.67    127827
         bot       0.67      0.75      0.71    128150

    accuracy                           0.69    255977
   macro avg       0.69      0.69      0.69    255977
weighted avg       0.69      0.69      0.69    255977



In [43]:
lr_model_author_es, lr_metrics_author_es = lr_es_author.word_embedding()

Loading embeddings
Sentence to tokens
Tokens to word vectors
Transforming train and test data
Training model
              precision    recall  f1-score   support

       human       0.63      0.65      0.64    127827
         bot       0.64      0.62      0.63    128150

    accuracy                           0.63    255977
   macro avg       0.63      0.63      0.63    255977
weighted avg       0.63      0.63      0.63    255977



### Gender

In [13]:
# English and Spanish
lr_en_gender = emb.Embeddings(embedding_path=en_wv_path, 
                            is_w2v_format=True, 
                            data_train=data_training_gender_en, 
                            data_test=data_test_gender_en, 
                            x_label_column='tweet', 
                            y_label_column='gender', 
                            ai_model=lr, 
                            target_names=gender_names)

lr_es_gender = emb.Embeddings(embedding_path=es_wv_path, 
                            is_w2v_format=True, 
                            data_train=data_training_gender_es, 
                            data_test=data_test_gender_es, 
                            x_label_column='tweet', 
                            y_label_column='gender', 
                            ai_model=lr, 
                            target_names=gender_names)

In [14]:
lr_model_gender_en, lr_metrics_gender_en = lr_en_gender.word_embedding()

Loading embeddings
Sentence to tokens


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['tokens'] = train[self.x_label_column].apply(self.tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['tokens'] = test[self.x_label_column].apply(self.tokenize)


Tokens to word vectors


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['vector'] = train['tokens'].apply(self.sentence_vector, word_vector=word_vectors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['vector'] = test['tokens'].apply(self.sentence_vector, word_vector=word_vectors)


Transforming train and test data
Training model
              precision    recall  f1-score   support

    gender_1       0.58      0.60      0.59     63802
    gender_2       0.59      0.58      0.58     64348

    accuracy                           0.59    128150
   macro avg       0.59      0.59      0.59    128150
weighted avg       0.59      0.59      0.59    128150



In [15]:
lr_model_gender_es, lr_metrics_gender_es = lr_es_gender.word_embedding()


Loading embeddings
Sentence to tokens


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['tokens'] = train[self.x_label_column].apply(self.tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['tokens'] = test[self.x_label_column].apply(self.tokenize)


Tokens to word vectors


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['vector'] = train['tokens'].apply(self.sentence_vector, word_vector=word_vectors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['vector'] = test['tokens'].apply(self.sentence_vector, word_vector=word_vectors)


Transforming train and test data
Training model
              precision    recall  f1-score   support

    gender_1       0.58      0.55      0.56     63802
    gender_2       0.57      0.60      0.59     64348

    accuracy                           0.57    128150
   macro avg       0.57      0.57      0.57    128150
weighted avg       0.57      0.57      0.57    128150



## SVC

In [18]:
svc = CalibratedClassifierCV(LinearSVC(random_state = seed, penalty = 'l2', 
                                           max_iter = 10000))

### Author

In [19]:
# English and Spanish
svc_en_author = emb.Embeddings(embedding_path=en_wv_path, 
                            is_w2v_format=True, 
                            data_train=data_training_en, 
                            data_test=data_test_en, 
                            x_label_column='tweet', 
                            y_label_column='author', 
                            ai_model=svc, 
                            target_names=target_names)

svc_es_author = emb.Embeddings(embedding_path=es_wv_path, 
                            is_w2v_format=True, 
                            data_train=data_training_es, 
                            data_test=data_test_es, 
                            x_label_column='tweet', 
                            y_label_column='author', 
                            ai_model=svc, 
                            target_names=target_names)

In [20]:
svc_model_author_en, svc_metrics_author_en = svc_en_author.word_embedding()

Loading embeddings
Sentence to tokens
Tokens to word vectors
Transforming train and test data
Training model
              precision    recall  f1-score   support

       human       0.71      0.63      0.67    127827
         bot       0.67      0.75      0.71    128150

    accuracy                           0.69    255977
   macro avg       0.69      0.69      0.69    255977
weighted avg       0.69      0.69      0.69    255977



In [21]:
svc_model_author_es, svc_metrics_author_es = svc_es_author.word_embedding()

Loading embeddings
Sentence to tokens
Tokens to word vectors
Transforming train and test data
Training model
              precision    recall  f1-score   support

       human       0.63      0.65      0.64    127827
         bot       0.64      0.62      0.63    128150

    accuracy                           0.63    255977
   macro avg       0.63      0.63      0.63    255977
weighted avg       0.63      0.63      0.63    255977



### Gender

In [22]:
# English and Spanish
svc_en_gender = emb.Embeddings(embedding_path=en_wv_path, 
                            is_w2v_format=True, 
                            data_train=data_training_gender_en, 
                            data_test=data_test_gender_en, 
                            x_label_column='tweet', 
                            y_label_column='gender', 
                            ai_model=svc, 
                            target_names=gender_names)

svc_es_gender = emb.Embeddings(embedding_path=es_wv_path, 
                            is_w2v_format=True, 
                            data_train=data_training_gender_es, 
                            data_test=data_test_gender_es, 
                            x_label_column='tweet', 
                            y_label_column='gender', 
                            ai_model=svc, 
                            target_names=gender_names)

In [23]:
svc_model_gender_en, svc_metrics_gender_en = svc_en_gender.word_embedding()


Loading embeddings
Sentence to tokens


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['tokens'] = train[self.x_label_column].apply(self.tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['tokens'] = test[self.x_label_column].apply(self.tokenize)


Tokens to word vectors


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['vector'] = train['tokens'].apply(self.sentence_vector, word_vector=word_vectors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['vector'] = test['tokens'].apply(self.sentence_vector, word_vector=word_vectors)


Transforming train and test data
Training model
              precision    recall  f1-score   support

    gender_1       0.58      0.60      0.59     63802
    gender_2       0.59      0.58      0.58     64348

    accuracy                           0.59    128150
   macro avg       0.59      0.59      0.59    128150
weighted avg       0.59      0.59      0.59    128150



In [24]:
svc_model_gender_es, svc_metrics_gender_es = svc_es_gender.word_embedding()

Loading embeddings
Sentence to tokens


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['tokens'] = train[self.x_label_column].apply(self.tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['tokens'] = test[self.x_label_column].apply(self.tokenize)


Tokens to word vectors


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['vector'] = train['tokens'].apply(self.sentence_vector, word_vector=word_vectors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['vector'] = test['tokens'].apply(self.sentence_vector, word_vector=word_vectors)


Transforming train and test data
Training model
              precision    recall  f1-score   support

    gender_1       0.58      0.54      0.56     63802
    gender_2       0.57      0.60      0.59     64348

    accuracy                           0.57    128150
   macro avg       0.57      0.57      0.57    128150
weighted avg       0.57      0.57      0.57    128150



## Random forest

In [25]:
rf = RandomForestClassifier(max_depth = 100, random_state = seed)

### Author

In [26]:
# English and Spanish
rf_en_author = emb.Embeddings(embedding_path=en_wv_path, 
                            is_w2v_format=True, 
                            data_train=data_training_en, 
                            data_test=data_test_en, 
                            x_label_column='tweet', 
                            y_label_column='author', 
                            ai_model=rf, 
                            target_names=target_names)

rf_es_author = emb.Embeddings(embedding_path=es_wv_path, 
                            is_w2v_format=True, 
                            data_train=data_training_es, 
                            data_test=data_test_es, 
                            x_label_column='tweet', 
                            y_label_column='author', 
                            ai_model=rf, 
                            target_names=target_names)

In [27]:
rf_model_author_en, rf_metrics_author_en = rf_en_author.word_embedding()

Loading embeddings
Sentence to tokens
Tokens to word vectors
Transforming train and test data
Training model
              precision    recall  f1-score   support

       human       0.84      0.56      0.67    127827
         bot       0.67      0.89      0.77    128150

    accuracy                           0.73    255977
   macro avg       0.76      0.73      0.72    255977
weighted avg       0.76      0.73      0.72    255977



In [28]:
rf_model_author_es, rf_metrics_author_es = rf_es_author.word_embedding()

Loading embeddings
Sentence to tokens
Tokens to word vectors
Transforming train and test data
Training model
              precision    recall  f1-score   support

       human       0.72      0.37      0.49    127827
         bot       0.58      0.85      0.69    128150

    accuracy                           0.61    255977
   macro avg       0.65      0.61      0.59    255977
weighted avg       0.65      0.61      0.59    255977



### Gender

In [29]:
# English and Spanish
rf_en_gender = emb.Embeddings(embedding_path=en_wv_path, 
                            is_w2v_format=True, 
                            data_train=data_training_gender_en, 
                            data_test=data_test_gender_en, 
                            x_label_column='tweet', 
                            y_label_column='gender', 
                            ai_model=rf, 
                            target_names=gender_names)

rf_es_gender = emb.Embeddings(embedding_path=es_wv_path, 
                            is_w2v_format=True, 
                            data_train=data_training_gender_es, 
                            data_test=data_test_gender_es, 
                            x_label_column='tweet', 
                            y_label_column='gender', 
                            ai_model=rf, 
                            target_names=gender_names)

In [30]:
rf_model_gender_en, rf_metrics_gender_en = rf_en_gender.word_embedding()

Loading embeddings
Sentence to tokens


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['tokens'] = train[self.x_label_column].apply(self.tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['tokens'] = test[self.x_label_column].apply(self.tokenize)


Tokens to word vectors


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['vector'] = train['tokens'].apply(self.sentence_vector, word_vector=word_vectors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['vector'] = test['tokens'].apply(self.sentence_vector, word_vector=word_vectors)


Transforming train and test data
Training model
              precision    recall  f1-score   support

    gender_1       0.58      0.58      0.58     63802
    gender_2       0.58      0.58      0.58     64348

    accuracy                           0.58    128150
   macro avg       0.58      0.58      0.58    128150
weighted avg       0.58      0.58      0.58    128150



In [31]:
rf_model_gender_es, rf_metrics_gender_es = rf_es_gender.word_embedding()

Loading embeddings
Sentence to tokens


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['tokens'] = train[self.x_label_column].apply(self.tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['tokens'] = test[self.x_label_column].apply(self.tokenize)


Tokens to word vectors


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['vector'] = train['tokens'].apply(self.sentence_vector, word_vector=word_vectors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['vector'] = test['tokens'].apply(self.sentence_vector, word_vector=word_vectors)


Transforming train and test data
Training model
              precision    recall  f1-score   support

    gender_1       0.55      0.55      0.55     63802
    gender_2       0.55      0.55      0.55     64348

    accuracy                           0.55    128150
   macro avg       0.55      0.55      0.55    128150
weighted avg       0.55      0.55      0.55    128150

