# Embeddings

Características:
* Embeddings
* Conjunto de datos sin preprocesamiento

## Imports

In [1]:
import sys
sys.path.append('../../../Scripts/')

In [2]:
import pandas as pd, numpy as np
import embeddigs as emb
import baseline_model as b
import plots as p
%matplotlib inline
%config InlineBackend.figure_format='retina'

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV

## Paths

In [4]:
main_path = '../Datasets/CSV/'

In [5]:
# English
data_training_en_path = f'{main_path}pan19-author-profiling-training-2019-02-18-en.csv'
data_test_en_path = f'{main_path}pan19-author-profiling-test-2019-04-29-en.csv'

# Spanish
data_training_es_path = f'{main_path}pan19-author-profiling-training-2019-02-18-es.csv'
data_test_es_path = f'{main_path}pan19-author-profiling-test-2019-04-29-es.csv'

## Data

In [6]:
# English
data_training_en = pd.read_csv(data_training_en_path).dropna()
data_test_en = pd.read_csv(data_test_en_path).dropna()

# Spanish
data_training_es = pd.read_csv(data_training_es_path).dropna()
data_test_es = pd.read_csv(data_test_es_path).dropna()

In [7]:
data_training_gender_en = data_training_en[data_training_en.author != 'bot']
data_test_gender_en = data_test_en[data_test_en.author != 'bot']

data_training_gender_es = data_training_es[data_training_es.author != 'bot']
data_test_gender_es = data_test_es[data_test_es.author != 'bot']

## General params

In [8]:
seed = 42
target_names = ['human', 'bot']
gender_names = ['gender_1', 'gender_2']
en_wv_path = '../../../Models/Word vectors/fasttext_english_twitter_100d.vec'
es_wv_path = '../../../Models/Word vectors/fasttext_spanish_twitter_100d.vec'

## Logistic regression

In [9]:
# Defining model
lr = LogisticRegression(random_state = seed, penalty = 'l2', 
                        solver = 'liblinear', max_iter = 10000)

### Author

In [10]:
# English and Spanish
lr_en_author = emb.Embeddings(embedding_path=en_wv_path, 
                            is_w2v_format=True, 
                            data_train=data_training_en, 
                            data_test=data_test_en, 
                            x_label_column='tweet', 
                            y_label_column='author', 
                            ai_model=lr, 
                            target_names=target_names)

lr_es_author = emb.Embeddings(embedding_path=es_wv_path, 
                            is_w2v_format=True, 
                            data_train=data_training_es, 
                            data_test=data_test_es, 
                            x_label_column='tweet', 
                            y_label_column='author', 
                            ai_model=lr, 
                            target_names=target_names)

In [11]:
lr_model_author_en, lr_metrics_author_en = lr_en_author.word_embedding()

Loading embeddings
Sentence to tokens
Tokens to word vectors
Transforming train and test data
Training model
              precision    recall  f1-score   support

       human       0.71      0.47      0.57      2800
         bot       0.66      0.85      0.74      3452

    accuracy                           0.68      6252
   macro avg       0.69      0.66      0.66      6252
weighted avg       0.69      0.68      0.67      6252



In [12]:
lr_model_author_es, lr_metrics_author_es = lr_es_author.word_embedding()

Loading embeddings
Sentence to tokens
Tokens to word vectors
Transforming train and test data
Training model
              precision    recall  f1-score   support

       human       0.68      0.68      0.68     32437
         bot       0.65      0.65      0.65     29600

    accuracy                           0.67     62037
   macro avg       0.67      0.67      0.67     62037
weighted avg       0.67      0.67      0.67     62037



### Gender

In [13]:
# English and Spanish
lr_en_gender = emb.Embeddings(embedding_path=en_wv_path, 
                            is_w2v_format=True, 
                            data_train=data_training_gender_en, 
                            data_test=data_test_gender_en, 
                            x_label_column='tweet', 
                            y_label_column='gender', 
                            ai_model=lr, 
                            target_names=gender_names)

lr_es_gender = emb.Embeddings(embedding_path=es_wv_path, 
                            is_w2v_format=True, 
                            data_train=data_training_gender_es, 
                            data_test=data_test_gender_es, 
                            x_label_column='tweet', 
                            y_label_column='gender', 
                            ai_model=lr, 
                            target_names=gender_names)

In [14]:
lr_model_gender_en, lr_metrics_gender_en = lr_en_gender.word_embedding()

Loading embeddings
Sentence to tokens


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['tokens'] = train[self.x_label_column].apply(self.tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['tokens'] = test[self.x_label_column].apply(self.tokenize)


Tokens to word vectors


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['vector'] = train['tokens'].apply(self.sentence_vector, word_vector=word_vectors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['vector'] = test['tokens'].apply(self.sentence_vector, word_vector=word_vectors)


Transforming train and test data
Training model
              precision    recall  f1-score   support

    gender_1       0.75      0.50      0.60      2452
    gender_2       0.32      0.59      0.42      1000

    accuracy                           0.53      3452
   macro avg       0.54      0.54      0.51      3452
weighted avg       0.63      0.53      0.55      3452



In [15]:
lr_model_gender_es, lr_metrics_gender_es = lr_es_gender.word_embedding()


Loading embeddings
Sentence to tokens


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['tokens'] = train[self.x_label_column].apply(self.tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['tokens'] = test[self.x_label_column].apply(self.tokenize)


Tokens to word vectors


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['vector'] = train['tokens'].apply(self.sentence_vector, word_vector=word_vectors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['vector'] = test['tokens'].apply(self.sentence_vector, word_vector=word_vectors)


Transforming train and test data
Training model
              precision    recall  f1-score   support

    gender_1       0.57      0.42      0.48     14000
    gender_2       0.58      0.71      0.64     15600

    accuracy                           0.58     29600
   macro avg       0.57      0.57      0.56     29600
weighted avg       0.57      0.58      0.57     29600



## SVC

In [16]:
svc = SVC(kernel='linear', random_state = seed)

### Author

In [17]:
# English and Spanish
svc_en_author = emb.Embeddings(embedding_path=en_wv_path, 
                            is_w2v_format=True, 
                            data_train=data_training_en, 
                            data_test=data_test_en, 
                            x_label_column='tweet', 
                            y_label_column='author', 
                            ai_model=svc, 
                            target_names=target_names)

svc_es_author = emb.Embeddings(embedding_path=es_wv_path, 
                            is_w2v_format=True, 
                            data_train=data_training_es, 
                            data_test=data_test_es, 
                            x_label_column='tweet', 
                            y_label_column='author', 
                            ai_model=svc, 
                            target_names=target_names)

In [18]:
svc_model_author_en, svc_metrics_author_en = svc_en_author.word_embedding()

Loading embeddings
Sentence to tokens
Tokens to word vectors
Transforming train and test data
Training model
              precision    recall  f1-score   support

       human       0.70      0.43      0.54      2800
         bot       0.65      0.85      0.74      3452

    accuracy                           0.66      6252
   macro avg       0.68      0.64      0.64      6252
weighted avg       0.67      0.66      0.65      6252



In [19]:
svc_model_author_es, svc_metrics_author_es = svc_es_author.word_embedding()

Loading embeddings
Sentence to tokens
Tokens to word vectors
Transforming train and test data
Training model
              precision    recall  f1-score   support

       human       0.69      0.68      0.68     32437
         bot       0.65      0.67      0.66     29600

    accuracy                           0.67     62037
   macro avg       0.67      0.67      0.67     62037
weighted avg       0.67      0.67      0.67     62037



### Gender

In [20]:
# English and Spanish
svc_en_gender = emb.Embeddings(embedding_path=en_wv_path, 
                            is_w2v_format=True, 
                            data_train=data_training_gender_en, 
                            data_test=data_test_gender_en, 
                            x_label_column='tweet', 
                            y_label_column='gender', 
                            ai_model=svc, 
                            target_names=gender_names)

svc_es_gender = emb.Embeddings(embedding_path=es_wv_path, 
                            is_w2v_format=True, 
                            data_train=data_training_gender_es, 
                            data_test=data_test_gender_es, 
                            x_label_column='tweet', 
                            y_label_column='gender', 
                            ai_model=svc, 
                            target_names=gender_names)

In [21]:
svc_model_gender_en, svc_metrics_gender_en = svc_en_gender.word_embedding()


Loading embeddings
Sentence to tokens
Tokens to word vectors


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['tokens'] = train[self.x_label_column].apply(self.tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['tokens'] = test[self.x_label_column].apply(self.tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['vector'] = train['tokens'].apply(self.sentence_vector, word_vector=w

Transforming train and test data
Training model
              precision    recall  f1-score   support

    gender_1       0.75      0.49      0.59      2452
    gender_2       0.32      0.60      0.42      1000

    accuracy                           0.52      3452
   macro avg       0.54      0.54      0.50      3452
weighted avg       0.62      0.52      0.54      3452



In [30]:
svc_model_gender_es, svc_metrics_gender_es = svc_es_gender.word_embedding()

Loading embeddings
Sentence to tokens


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['tokens'] = train[self.x_label_column].apply(self.tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['tokens'] = test[self.x_label_column].apply(self.tokenize)


Tokens to word vectors


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['vector'] = train['tokens'].apply(self.sentence_vector, word_vector=word_vectors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['vector'] = test['tokens'].apply(self.sentence_vector, word_vector=word_vectors)


Transforming train and test data
Training model
              precision    recall  f1-score   support

    gender_1       0.58      0.38      0.46     14000
    gender_2       0.58      0.75      0.65     15600

    accuracy                           0.58     29600
   macro avg       0.58      0.57      0.56     29600
weighted avg       0.58      0.58      0.56     29600



## Random forest

In [23]:
rf = RandomForestClassifier(max_depth = 100, random_state = seed)

### Author

In [24]:
# English and Spanish
rf_en_author = emb.Embeddings(embedding_path=en_wv_path, 
                            is_w2v_format=True, 
                            data_train=data_training_en, 
                            data_test=data_test_en, 
                            x_label_column='tweet', 
                            y_label_column='author', 
                            ai_model=rf, 
                            target_names=target_names)

rf_es_author = emb.Embeddings(embedding_path=es_wv_path, 
                            is_w2v_format=True, 
                            data_train=data_training_es, 
                            data_test=data_test_es, 
                            x_label_column='tweet', 
                            y_label_column='author', 
                            ai_model=rf, 
                            target_names=target_names)

In [25]:
rf_model_author_en, rf_metrics_author_en = rf_en_author.word_embedding()

Loading embeddings
Sentence to tokens
Tokens to word vectors
Transforming train and test data
Training model
              precision    recall  f1-score   support

       human       0.84      0.41      0.55      2800
         bot       0.66      0.94      0.78      3452

    accuracy                           0.70      6252
   macro avg       0.75      0.67      0.66      6252
weighted avg       0.74      0.70      0.67      6252



In [26]:
rf_model_author_es, rf_metrics_author_es = rf_es_author.word_embedding()

Loading embeddings
Sentence to tokens
Tokens to word vectors
Transforming train and test data
Training model
              precision    recall  f1-score   support

       human       0.71      0.65      0.68     32437
         bot       0.65      0.70      0.67     29600

    accuracy                           0.68     62037
   macro avg       0.68      0.68      0.68     62037
weighted avg       0.68      0.68      0.68     62037



### Gender

In [27]:
# English and Spanish
rf_en_gender = emb.Embeddings(embedding_path=en_wv_path, 
                            is_w2v_format=True, 
                            data_train=data_training_gender_en, 
                            data_test=data_test_gender_en, 
                            x_label_column='tweet', 
                            y_label_column='gender', 
                            ai_model=rf, 
                            target_names=gender_names)

rf_es_gender = emb.Embeddings(embedding_path=es_wv_path, 
                            is_w2v_format=True, 
                            data_train=data_training_gender_es, 
                            data_test=data_test_gender_es, 
                            x_label_column='tweet', 
                            y_label_column='gender', 
                            ai_model=rf, 
                            target_names=gender_names)

In [28]:
rf_model_gender_en, rf_metrics_gender_en = rf_en_gender.word_embedding()

Loading embeddings
Sentence to tokens
Tokens to word vectors


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['tokens'] = train[self.x_label_column].apply(self.tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['tokens'] = test[self.x_label_column].apply(self.tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['vector'] = train['tokens'].apply(self.sentence_vector, word_vector=w

Transforming train and test data
Training model
              precision    recall  f1-score   support

    gender_1       0.73      0.48      0.58      2452
    gender_2       0.31      0.57      0.40      1000

    accuracy                           0.51      3452
   macro avg       0.52      0.53      0.49      3452
weighted avg       0.61      0.51      0.53      3452



In [29]:
rf_model_gender_es, rf_metrics_gender_es = rf_es_gender.word_embedding()

Loading embeddings
Sentence to tokens


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['tokens'] = train[self.x_label_column].apply(self.tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['tokens'] = test[self.x_label_column].apply(self.tokenize)


Tokens to word vectors


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['vector'] = train['tokens'].apply(self.sentence_vector, word_vector=word_vectors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['vector'] = test['tokens'].apply(self.sentence_vector, word_vector=word_vectors)


Transforming train and test data
Training model
              precision    recall  f1-score   support

    gender_1       0.55      0.45      0.49     14000
    gender_2       0.57      0.67      0.62     15600

    accuracy                           0.56     29600
   macro avg       0.56      0.56      0.55     29600
weighted avg       0.56      0.56      0.56     29600

