# Baselines

Características:
* Rangos de n-gramas evaluados: (1, 1), (1, 2) y (1, 3)
* Frecuencias mínimas de aparición evaluadas (`min_df`): 1 y 3
* Conjunto de datos con preprocesamiento

## Imports

In [4]:
import sys

# Extend the module search path with the project's script folder
# (presumably where `baseline_model` and `plots` live — confirm).
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
scripts_dir = '../../../Scripts/'
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

In [5]:
import pandas as pd, numpy as np
import baseline_model as b
import plots as p
%matplotlib inline
%config InlineBackend.figure_format='retina'

In [6]:
from sklearn.base import clone
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC

## Paths

In [7]:
# Base folder of the cleaned CSV datasets; a relative path, so it is resolved
# against the directory the notebook kernel runs in.
main_path = '../Datasets/CSV/Clean/'

In [8]:
# Locations of the `_lemma` CSV splits: one (training, test) pair per language,
# all of them inside `main_path`.

# English
data_training_en_path, data_test_en_path = (
    f'{main_path}data_training_en_lemma.csv',
    f'{main_path}data_test_en_lemma.csv',
)

# Spanish
data_training_es_path, data_test_es_path = (
    f'{main_path}data_training_es_lemma.csv',
    f'{main_path}data_test_es_lemma.csv',
)

## Data

In [9]:
# Read every split and discard the rows that contain a missing value in any
# column (the default behaviour of `dropna()`).

# English
data_training_en, data_test_en = (
    pd.read_csv(csv_path).dropna()
    for csv_path in (data_training_en_path, data_test_en_path)
)

# Spanish
data_training_es, data_test_es = (
    pd.read_csv(csv_path).dropna()
    for csv_path in (data_training_es_path, data_test_es_path)
)

In [10]:
# Subsets used by the gender task: only the rows whose `author` is not 'bot'.

# English
data_training_gender_en = data_training_en.loc[data_training_en['author'] != 'bot']
data_test_gender_en = data_test_en.loc[data_test_en['author'] != 'bot']

# Spanish
data_training_gender_es = data_training_es.loc[data_training_es['author'] != 'bot']
data_test_gender_es = data_test_es.loc[data_test_es['author'] != 'bot']

In [11]:
# Number of rows left in the English training and test sets after `dropna()`.
len(data_training_en), len(data_test_en)

(409106, 261551)

## General params

In [12]:
# Settings shared by every baseline loop in this notebook.

# Passed as `random_state` to each estimator below.
seed = 42

# Display names of the classes for the author and gender tasks.
# NOTE(review): confirm this order matches the label order used inside
# `Baseline`; if it relies on sklearn's sorted string labels, 'bot' would
# come before 'human'.
target_names = ['human', 'bot']
gender_names = ['gender_1', 'gender_2']

# Grid explored for the vectorizer: n-gram ranges and minimum document
# frequencies.
ngram_ranges = [(1, 1), (1, 2), (1, 3)]
min_dfs = [1, 3]

# Candidate estimators paired with the short name used when reporting them.
named_models = [
    ('LR', LogisticRegression(random_state=seed,
                              penalty='l2',
                              solver='liblinear',
                              max_iter=10000)),
    # LinearSVC is wrapped in CalibratedClassifierCV.
    ('LSVC', CalibratedClassifierCV(LinearSVC(random_state=seed,
                                              penalty='l2',
                                              max_iter=10000))),
    ('RF', RandomForestClassifier(max_depth=100, random_state=seed)),
]
models = [estimator for _, estimator in named_models]
model_names = [name for name, _ in named_models]

## Author

In [11]:
# Author task (human vs. bot) on the English data: every estimator in `models`
# is evaluated for each (min_df, ngram_range) combination, and the fitted
# model, vectorizer and metrics of every run are collected in parallel lists.
models_author_en, vectorizers_author_en, metrics_author_en = [], [], []

for model_name, model in zip(model_names, models):
    for min_df in min_dfs:
        for ngram_range in ngram_ranges:
            # Each configuration gets its own unfitted copy of the estimator.
            # Passing the shared instance would make every saved entry refer
            # to the same object (presumably refit in place by `baseline()` —
            # confirm in baseline_model), so earlier fits would be lost.
            author_en = b.Baseline(model=clone(model),
                                   train_data=data_training_en,
                                   test_data=data_test_en,
                                   x_label_column='tweet',
                                   y_label_column='author',
                                   ngram_range=ngram_range,
                                   min_df=min_df,
                                   model_name=model_name,
                                   target_names=target_names)

            # Running baselines. The result is bound to `fitted_model` so the
            # loop variable `model` is not overwritten mid-iteration.
            fitted_model, vectorizer, metrics = author_en.baseline()

            # Saving info
            models_author_en.append(fitted_model)
            vectorizers_author_en.append(vectorizer)
            metrics_author_en.append(metrics)

Model LR (author-(1, 1)-1) --> Accuracy: 0.7341627445507759
Model LR (author-(1, 2)-1) --> Accuracy: 0.7443825487189879
Model LR (author-(1, 3)-1) --> Accuracy: 0.743958157300106
Model LR (author-(1, 1)-3) --> Accuracy: 0.7341971546658205
Model LR (author-(1, 2)-3) --> Accuracy: 0.7474679890346433
Model LR (author-(1, 3)-3) --> Accuracy: 0.7459807073954984
Model LSVC (author-(1, 1)-1) --> Accuracy: 0.7313602318477085
Model LSVC (author-(1, 2)-1) --> Accuracy: 0.7494790690916877
Model LSVC (author-(1, 3)-1) --> Accuracy: 0.7516889631467668
Model LSVC (author-(1, 1)-3) --> Accuracy: 0.7310161306972637
Model LSVC (author-(1, 2)-3) --> Accuracy: 0.746309515161479
Model LSVC (author-(1, 3)-3) --> Accuracy: 0.748584406100531
Model RF (author-(1, 1)-1) --> Accuracy: 0.7056138191022019
Model RF (author-(1, 2)-1) --> Accuracy: 0.6930885372260095
Model RF (author-(1, 3)-1) --> Accuracy: 0.6753443879014035
Model RF (author-(1, 1)-3) --> Accuracy: 0.7025819056321712
Model RF (author-(1, 2)-3) --> 

In [12]:
# Author task (human vs. bot) on the Spanish data: every estimator in `models`
# is evaluated for each (min_df, ngram_range) combination, and the fitted
# model, vectorizer and metrics of every run are collected in parallel lists.
models_author_es, vectorizers_author_es, metrics_author_es = [], [], []

for model_name, model in zip(model_names, models):
    for min_df in min_dfs:
        for ngram_range in ngram_ranges:
            # Each configuration gets its own unfitted copy of the estimator.
            # Passing the shared instance would make every saved entry refer
            # to the same object (presumably refit in place by `baseline()` —
            # confirm in baseline_model), so earlier fits would be lost.
            author_es = b.Baseline(model=clone(model),
                                   train_data=data_training_es,
                                   test_data=data_test_es,
                                   x_label_column='tweet',
                                   y_label_column='author',
                                   ngram_range=ngram_range,
                                   min_df=min_df,
                                   model_name=model_name,
                                   target_names=target_names)

            # Running baselines. The result is bound to `fitted_model` so the
            # loop variable `model` is not overwritten mid-iteration.
            fitted_model, vectorizer, metrics = author_es.baseline()

            # Saving info
            models_author_es.append(fitted_model)
            vectorizers_author_es.append(vectorizer)
            metrics_author_es.append(metrics)

Model LR (author-(1, 1)-1) --> Accuracy: 0.6902558094122242
Model LR (author-(1, 2)-1) --> Accuracy: 0.6956063268892795
Model LR (author-(1, 3)-1) --> Accuracy: 0.6973972717382207
Model LR (author-(1, 1)-3) --> Accuracy: 0.6899824253075572
Model LR (author-(1, 2)-3) --> Accuracy: 0.6903729740285101
Model LR (author-(1, 3)-3) --> Accuracy: 0.6919798030518592
Model LSVC (author-(1, 1)-1) --> Accuracy: 0.675883616481156
Model LSVC (author-(1, 2)-1) --> Accuracy: 0.6895695595168354
Model LSVC (author-(1, 3)-1) --> Accuracy: 0.694897759924122
Model LSVC (author-(1, 1)-3) --> Accuracy: 0.6764805980974699
Model LSVC (author-(1, 2)-3) --> Accuracy: 0.6789856891790108
Model LSVC (author-(1, 3)-3) --> Accuracy: 0.6813568778419393
Model RF (author-(1, 1)-1) --> Accuracy: 0.6637766061316149
Model RF (author-(1, 2)-1) --> Accuracy: 0.644282645688621
Model RF (author-(1, 3)-1) --> Accuracy: 0.6269088068736575
Model RF (author-(1, 1)-3) --> Accuracy: 0.6659302033642983
Model RF (author-(1, 2)-3) --> 

## Gender

In [13]:
# Gender task on the English data (rows whose author is 'bot' were filtered
# out beforehand): every estimator in `models` is evaluated for each
# (min_df, ngram_range) combination, and the fitted model, vectorizer and
# metrics of every run are collected in parallel lists.
models_gender_en, vectorizers_gender_en, metrics_gender_en = [], [], []

for model_name, model in zip(model_names, models):
    for min_df in min_dfs:
        for ngram_range in ngram_ranges:
            # English. Each configuration gets its own unfitted copy of the
            # estimator. Passing the shared instance would make every saved
            # entry refer to the same object (presumably refit in place by
            # `baseline()` — confirm in baseline_model), so earlier fits
            # would be lost.
            gender_en = b.Baseline(model=clone(model),
                                   train_data=data_training_gender_en,
                                   test_data=data_test_gender_en,
                                   x_label_column='tweet',
                                   y_label_column='gender',
                                   ngram_range=ngram_range,
                                   min_df=min_df,
                                   model_name=model_name,
                                   target_names=gender_names)

            # Running baselines. The result is bound to `fitted_model` so the
            # loop variable `model` is not overwritten mid-iteration.
            fitted_model, vectorizer, metrics = gender_en.baseline()

            # Saving info
            models_gender_en.append(fitted_model)
            vectorizers_gender_en.append(vectorizer)
            metrics_gender_en.append(metrics)

Model LR (gender-(1, 1)-1) --> Accuracy: 0.587073704435075
Model LR (gender-(1, 2)-1) --> Accuracy: 0.5895973589307634
Model LR (gender-(1, 3)-1) --> Accuracy: 0.5891551475689811
Model LR (gender-(1, 1)-3) --> Accuracy: 0.5863341440541633
Model LR (gender-(1, 2)-3) --> Accuracy: 0.5884003385204217
Model LR (gender-(1, 3)-3) --> Accuracy: 0.5893762532498723
Model LSVC (gender-(1, 1)-1) --> Accuracy: 0.5805853963509938
Model LSVC (gender-(1, 2)-1) --> Accuracy: 0.58312429951433
Model LSVC (gender-(1, 3)-1) --> Accuracy: 0.5867077364115311
Model LSVC (gender-(1, 1)-3) --> Accuracy: 0.5807836290304135
Model LSVC (gender-(1, 2)-3) --> Accuracy: 0.5774212978141035
Model LSVC (gender-(1, 3)-3) --> Accuracy: 0.5787021858965072
Model RF (gender-(1, 1)-1) --> Accuracy: 0.5688667952637638
Model RF (gender-(1, 2)-1) --> Accuracy: 0.567677399187246
Model RF (gender-(1, 3)-1) --> Accuracy: 0.5619591488193719
Model RF (gender-(1, 1)-3) --> Accuracy: 0.5662821460974847
Model RF (gender-(1, 2)-3) --> A

In [14]:
# Gender task on the Spanish data (rows whose author is 'bot' were filtered
# out beforehand): every estimator in `models` is evaluated for each
# (min_df, ngram_range) combination, and the fitted model, vectorizer and
# metrics of every run are collected in parallel lists.
models_gender_es, vectorizers_gender_es, metrics_gender_es = [], [], []

for model_name, model in zip(model_names, models):
    for min_df in min_dfs:
        for ngram_range in ngram_ranges:
            # Spanish. Each configuration gets its own unfitted copy of the
            # estimator. Passing the shared instance would make every saved
            # entry refer to the same object (presumably refit in place by
            # `baseline()` — confirm in baseline_model), so earlier fits
            # would be lost.
            gender_es = b.Baseline(model=clone(model),
                                   train_data=data_training_gender_es,
                                   test_data=data_test_gender_es,
                                   x_label_column='tweet',
                                   y_label_column='gender',
                                   ngram_range=ngram_range,
                                   min_df=min_df,
                                   model_name=model_name,
                                   target_names=gender_names)

            # Running baselines. The result is bound to `fitted_model` so the
            # loop variable `model` is not overwritten mid-iteration.
            fitted_model, vectorizer, metrics = gender_es.baseline()

            # Saving info
            models_gender_es.append(fitted_model)
            vectorizers_gender_es.append(vectorizer)
            metrics_gender_es.append(metrics)

Model LR (gender-(1, 1)-1) --> Accuracy: 0.5675267421563801
Model LR (gender-(1, 2)-1) --> Accuracy: 0.5705366333974846
Model LR (gender-(1, 3)-1) --> Accuracy: 0.5720583627981918
Model LR (gender-(1, 1)-3) --> Accuracy: 0.5685337689656715
Model LR (gender-(1, 2)-3) --> Accuracy: 0.5686680392069104
Model LR (gender-(1, 3)-3) --> Accuracy: 0.5697981470706709
Model LSVC (gender-(1, 1)-1) --> Accuracy: 0.5563151770129346
Model LSVC (gender-(1, 2)-1) --> Accuracy: 0.5628160945262498
Model LSVC (gender-(1, 3)-1) --> Accuracy: 0.5664525802264692
Model LSVC (gender-(1, 1)-3) --> Accuracy: 0.5586537170478449
Model LSVC (gender-(1, 2)-3) --> Accuracy: 0.557814528040102
Model LSVC (gender-(1, 3)-3) --> Accuracy: 0.5592355547598801
Model RF (gender-(1, 1)-1) --> Accuracy: 0.5712527413507587
Model RF (gender-(1, 2)-1) --> Accuracy: 0.566586850467708
Model RF (gender-(1, 3)-1) --> Accuracy: 0.5655350669113369
Model RF (gender-(1, 1)-3) --> Accuracy: 0.5679966880007161
Model RF (gender-(1, 2)-3) -->