In [1]:
# Extend the module search path before the project-local imports in the next
# cell (presumably text_preprocessing_v2, cleaning_twitter_data and
# baseline_model live in this Scripts folder — confirm).
# NOTE(review): the path is relative, so it resolves against the kernel's
# working directory rather than the notebook file's location.
import sys
sys.path.append('../../../../Scripts/')

In [2]:
import text_preprocessing_v2 as tp2
import cleaning_twitter_data as ctd
import baseline_model as base
import pandas as pd, numpy as np

[nltk_data] Downloading package stopwords to /Users/mash/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Data

In [3]:
# CSV holding the training tweets; it is read with pandas and also handed to
# the cleaning helper in the Preprocessing section. The path is relative, so it
# depends on the kernel's working directory.
data_training_path = '../Datasets/homomex_training.csv'

## Preprocessing

In [4]:
# Load the training CSV into a DataFrame using pandas' default parsing options.
data = pd.read_csv(filepath_or_buffer=data_training_path)

In [5]:
# Inputs for the cleaning helper: the frame(s) to process, the text column of
# each frame, and one label -> integer mapping per frame.
dataframes = [data]
columns = ['tweets']
dictionary_list = [dict(P=0, NP=1, NR=2)]

In [6]:
# Configure the project's Twitter cleaning helper for the Spanish training
# tweets. Stopwords are kept and no emoji resource is supplied.
clean = ctd.CleaningTwitterData(
    csv_path=data_training_path,
    text_column='tweets',
    language='spanish',
    remove_stopwords=False,
    is_dataframe=True,
    emoji_path=None,
    dataframes=dataframes,
    columns=columns,
    dictionary_list=dictionary_list,
)

In [7]:
# Run the cleaning step; from here on `data` refers to the helper's output
# rather than the raw frame loaded above.
data = clean.clean_twitter_data(
    csv_path=data_training_path, text_column='tweets', language='spanish'
)

In [8]:
# Rows without a label are assigned the 'NR' class.
data['label'] = data['label'].fillna('NR')

In [9]:
# Encode the string labels as integers using the mapping stored in
# dictionary_list[0] ({'P': 0, 'NP': 1, 'NR': 2}). Values that are not keys of
# the mapping are left untouched.
#
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `data.label`: that form mutates a temporary
# Series obtained through chained access, which pandas warns about and which
# leaves `data` unchanged once Copy-on-Write is active.
data['label'] = data['label'].replace(dictionary_list[0])

In [10]:
# Persist the cleaned, label-encoded frame.
# NOTE(review): `index` is left at its default, so the row index is written as
# an unnamed leading column — confirm downstream readers expect that.
data.to_csv('../Datasets/Clean datasets/homomex_training.csv')

## Baseline

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [12]:
# One seed shared by the classifier and the train/validation split.
seed = 42

# L2-penalised logistic regression fitted with the liblinear solver.
model = LogisticRegression(
    penalty='l2',
    solver='liblinear',
    max_iter=1000,
    random_state=seed,
)

# Hold out 5% of the cleaned tweets for validation.
X_train, X_val, y_train, y_val = train_test_split(
    data['tweets'],
    data['label'],
    test_size=0.05,
    random_state=seed,
)

# Display names in the order of the integer label encoding (P=0, NP=1, NR=2).
target_names = [
    'LGBT+phobic (P)',
    'Not LGBT+phobic (NP)',
    'Not LGBT+related (NR)',
]

In [13]:
# Collect each split as plain Python lists keyed by column name; converting to
# lists drops the shuffled index carried over from the split.
training_data = dict(tweets=X_train.tolist(), label=y_train.tolist())
val_data = dict(tweets=X_val.tolist(), label=y_val.tolist())

In [14]:
# Turn both splits into DataFrames with a fixed column order.
training_data, val_data = (
    pd.DataFrame(split, columns=['tweets', 'label'])
    for split in (training_data, val_data)
)

In [15]:
# Class distribution of the training and validation splits, shown as a pair.
tuple(frame['label'].value_counts() for frame in (training_data, val_data))

(1    4145
 2    1689
 0     816
 Name: label, dtype: int64,
 1    215
 2     89
 0     46
 Name: label, dtype: int64)

In [16]:
# Set up the project's baseline helper with both splits, the text and label
# column names, and the class display names.
b = base.Baseline(
    train_data=training_data,
    test_data=val_data,
    x_label_column='tweets',
    y_label_column='label',
    target_names=target_names,
)

In [17]:
# Run the baseline with the logistic-regression model; the same frames, column
# names and class names given to the constructor are passed again here.
# `model` is rebound to the first returned object, `vectorizer` to the second.
model, vectorizer = b.baseline(
    model=model,
    train_data=training_data,
    test_data=val_data,
    x_label_column='tweets',
    y_label_column='label',
    target_names=target_names,
)

Fitting label model
                       precision    recall  f1-score   support

      LGBT+phobic (P)       0.70      0.30      0.42        46
 Not LGBT+phobic (NP)       0.80      0.92      0.86       215
Not LGBT+related (NR)       0.78      0.73      0.76        89

             accuracy                           0.79       350
            macro avg       0.76      0.65      0.68       350
         weighted avg       0.78      0.79      0.77       350

