# Baselines

## Imports

In [1]:
import sys
sys.path.append('../../../Scripts/')

In [2]:
import pandas as pd, numpy as np
import baseline_model as b

In [36]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV

## Paths

In [4]:
main_path = '../Datasets/CSV/Clean/'

In [5]:
# English
data_training_en_path = f'{main_path}data_training_en.csv'
data_test_en_path = f'{main_path}data_test_en.csv'

# Spanish
data_training_es_path = f'{main_path}data_training_es.csv'
data_test_es_path = f'{main_path}data_test_en.csv'

## Data

In [13]:
# English
data_training_en = pd.read_csv(data_training_en_path).dropna()
data_test_en = pd.read_csv(data_test_en_path).dropna()

# Spanish
data_training_es = pd.read_csv(data_training_es_path).dropna()
data_test_es = pd.read_csv(data_test_es_path).dropna()

## General params

In [42]:
seed = 42
target_names = ['author', 'gender']
gender_names = ['gender 1', 'gender 2', 'bot']

## Logistic regression

In [22]:
# Defining model
lr = LogisticRegression(random_state = seed, penalty = 'l2', 
                            solver = 'liblinear', max_iter = 1000)

### Author

In [46]:
# English
lr_en_author = b.Baseline(model=lr, 
                train_data=data_training_en, 
                test_data=data_test_en, 
                x_label_column='tweet',
                y_label_column='author',
                target_names=target_names)

# Spanish
lr_es_author = b.Baseline(model=lr, 
                train_data=data_training_es, 
                test_data=data_test_es, 
                x_label_column='tweet',
                y_label_column='author',
                target_names=target_names)

In [47]:
lr_model_author_en, lr_vectorizer_author_en = lr_en_author.baseline()

Fitting author model
              precision    recall  f1-score   support

      author       0.68      0.93      0.79      3338
      gender       0.85      0.47      0.61      2798

    accuracy                           0.72      6136
   macro avg       0.77      0.70      0.70      6136
weighted avg       0.76      0.72      0.71      6136



In [48]:
lr_model_author_es, lr_vectorizer_author_es = lr_es_author.baseline()

Fitting author model
              precision    recall  f1-score   support

      author       0.60      0.85      0.70      3338
      gender       0.64      0.31      0.42      2798

    accuracy                           0.60      6136
   macro avg       0.62      0.58      0.56      6136
weighted avg       0.61      0.60      0.57      6136



### Gender

In [49]:
# English
lr_en_gender = b.Baseline(model=lr, 
                train_data=data_training_en, 
                test_data=data_test_en, 
                x_label_column='tweet',
                y_label_column='gender',
                target_names=gender_names)

# Spanish
lr_es_gender = b.Baseline(model=lr, 
                train_data=data_training_es, 
                test_data=data_test_es, 
                x_label_column='tweet',
                y_label_column='gender',
                target_names=gender_names)

In [50]:
lr_model_gender_en, lr_vectorizer_gender_en = lr_en_gender.baseline()

Fitting gender model
              precision    recall  f1-score   support

    gender 1       0.20      0.36      0.26       960
    gender 2       0.58      0.43      0.49      2378
         bot       0.70      0.68      0.69      2798

    accuracy                           0.53      6136
   macro avg       0.50      0.49      0.48      6136
weighted avg       0.58      0.53      0.55      6136



In [51]:
lr_model_gender_es, lr_vectorizer_gender_es = lr_es_gender.baseline()


Fitting gender model
              precision    recall  f1-score   support

    gender 1       0.18      0.39      0.25       960
    gender 2       0.54      0.30      0.38      2378
         bot       0.56      0.55      0.56      2798

    accuracy                           0.43      6136
   macro avg       0.43      0.41      0.40      6136
weighted avg       0.49      0.43      0.44      6136



## Linear SVC

In [26]:
lsvc = CalibratedClassifierCV(LinearSVC(random_state = seed, penalty = 'l2'))

### Author

In [52]:
# English
lsvc_en_author = b.Baseline(model=lsvc, 
                train_data=data_training_en, 
                test_data=data_test_en, 
                x_label_column='tweet',
                y_label_column='author',
                target_names=target_names)

# Spanish
lsvc_es_author = b.Baseline(model=lsvc, 
                train_data=data_training_es, 
                test_data=data_test_es, 
                x_label_column='tweet',
                y_label_column='author',
                target_names=target_names)

In [53]:
lsvc_model_author_en, lsvc_vectorizer_author_en = lsvc_en_author.baseline()

Fitting author model
              precision    recall  f1-score   support

      author       0.71      0.86      0.78      3338
      gender       0.78      0.59      0.67      2798

    accuracy                           0.74      6136
   macro avg       0.75      0.72      0.72      6136
weighted avg       0.74      0.74      0.73      6136



In [54]:
lsvc_model_author_es, lsvc_vectorizer_author_es = lsvc_es_author.baseline()

Fitting author model
              precision    recall  f1-score   support

      author       0.61      0.82      0.70      3338
      gender       0.64      0.37      0.47      2798

    accuracy                           0.62      6136
   macro avg       0.62      0.60      0.58      6136
weighted avg       0.62      0.62      0.59      6136



### Gender

In [58]:
# English
lsvc_en_gender = b.Baseline(model=lsvc, 
                train_data=data_training_en, 
                test_data=data_test_en, 
                x_label_column='tweet',
                y_label_column='gender',
                target_names=gender_names)

# Spanish
lsvc_es_gender = b.Baseline(model=lsvc, 
                train_data=data_training_es, 
                test_data=data_test_es, 
                x_label_column='tweet',
                y_label_column='gender',
                target_names=gender_names)

In [59]:
lsvc_model_gender_en, lsvc_vectorizer_gender_en = lsvc_en_gender.baseline()


Fitting gender model
              precision    recall  f1-score   support

    gender 1       0.21      0.32      0.25       960
    gender 2       0.59      0.42      0.49      2378
         bot       0.67      0.71      0.69      2798

    accuracy                           0.54      6136
   macro avg       0.49      0.49      0.48      6136
weighted avg       0.57      0.54      0.55      6136



In [60]:
lsvc_model_gender_es, lsvc_vectorizer_gender_es = lsvc_es_gender.baseline()

Fitting gender model
              precision    recall  f1-score   support

    gender 1       0.18      0.40      0.24       960
    gender 2       0.50      0.32      0.39      2378
         bot       0.57      0.49      0.53      2798

    accuracy                           0.41      6136
   macro avg       0.42      0.41      0.39      6136
weighted avg       0.48      0.41      0.43      6136



## Random forest

In [67]:
rf = RandomForestClassifier(max_depth = 100, random_state = seed)

### Author

In [68]:
# English
rf_en_author = b.Baseline(model=rf, 
                train_data=data_training_en, 
                test_data=data_test_en, 
                x_label_column='tweet',
                y_label_column='author',
                target_names=target_names)

# Spanish
rf_es_author = b.Baseline(model=rf, 
                train_data=data_training_es, 
                test_data=data_test_es, 
                x_label_column='tweet',
                y_label_column='author',
                target_names=target_names)

In [69]:
rf_model_author_en, rf_vectorizer_author_en = rf_en_author.baseline()

Fitting author model
              precision    recall  f1-score   support

      author       0.63      0.99      0.77      3338
      gender       0.98      0.32      0.48      2798

    accuracy                           0.69      6136
   macro avg       0.81      0.66      0.63      6136
weighted avg       0.79      0.69      0.64      6136



In [70]:
rf_model_author_es, rf_vectorizer_author_es = rf_es_author.baseline()

Fitting author model
              precision    recall  f1-score   support

      author       0.56      0.94      0.70      3338
      gender       0.61      0.11      0.19      2798

    accuracy                           0.56      6136
   macro avg       0.58      0.53      0.44      6136
weighted avg       0.58      0.56      0.47      6136



### Gender

In [71]:
# English
rf_en_gender = b.Baseline(model=rf, 
                train_data=data_training_en, 
                test_data=data_test_en, 
                x_label_column='tweet',
                y_label_column='gender',
                target_names=gender_names)

# Spanish
rf_es_gender = b.Baseline(model=rf, 
                train_data=data_training_es, 
                test_data=data_test_es, 
                x_label_column='tweet',
                y_label_column='gender',
                target_names=gender_names)

In [72]:
rf_model_gender_en, rf_vectorizer_gender_en = rf_en_gender.baseline()

Fitting gender model
              precision    recall  f1-score   support

    gender 1       0.20      0.36      0.26       960
    gender 2       0.62      0.36      0.45      2378
         bot       0.64      0.70      0.67      2798

    accuracy                           0.52      6136
   macro avg       0.49      0.47      0.46      6136
weighted avg       0.57      0.52      0.52      6136



In [73]:
rf_model_gender_es, rf_vectorizer_gender_es = rf_es_gender.baseline()

Fitting gender model
              precision    recall  f1-score   support

    gender 1       0.00      0.00      0.00       960
    gender 2       0.20      0.00      0.00      2378
         bot       0.46      1.00      0.63      2798

    accuracy                           0.46      6136
   macro avg       0.22      0.33      0.21      6136
weighted avg       0.29      0.46      0.29      6136

