## Imports

In [1]:
!git clone https://github.com/CCogS-Mx/text-preprocessing.git

fatal: destination path 'text-preprocessing' already exists and is not an empty directory.


In [2]:
import sys 
sys.path.append('Scripts/')
sys.path.append('text-preprocessing/script/')

In [16]:
import os
import pandas as pd
import numpy as np

# Import classes
import text_preprocessing as tp
import feature_extraction as fe
import robust_classic_model as rcm
import cleaning_twitter_data as ctd

# Algoritmos de ML
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split

# Vectorizadores
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# PCA
from sklearn.decomposition import PCA

## Data

In [4]:
training = pd.read_csv('Data/homomex_training.csv').drop('index', axis=1)
test = pd.read_csv('Data/track1_test_no_labels.csv').drop('Unnamed: 0', axis=1)

In [5]:
prep = tp.Preprocessing('spanish')

In [6]:
training_p = prep.main_preprocess(data=training, 
                                  column='tweets', 
                                  tweet=True, 
                                  tweet_tags=True, 
                                  remove_stop_words=False, 
                                  lemmatize=False, 
                                  translate_emojis=True)

In [7]:
training_p.to_csv('Data/task1_preprocessed.csv', index=False)

In [8]:
training_p.label = training_p.label.fillna('NR')
dictionary_list = [{'P': 0, 'NP': 1, 'NR': 2}]
training_p.label.replace(list(dictionary_list[0].keys()), 
            list(dictionary_list[0].values()), 
            inplace=True)


In [9]:
test_p = prep.main_preprocess(data=test, 
                                  column='content', 
                                  tweet=True, 
                                  tweet_tags=True, 
                                  remove_stop_words=False, 
                                  lemmatize=False, 
                                  translate_emojis=True)

## Improving best classifiers with training data only

In [10]:
from sklearn.metrics import f1_score
from sklearn.calibration import CalibratedClassifierCV

In [11]:
vectorizer = CountVectorizer(ngram_range = (1,1), lowercase = False)
oth_feats_vectorizer = CountVectorizer(ngram_range = (1,1), lowercase = False)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(training_p, 
                                                    training_p['label'], 
                                                    test_size=0.05, 
                                                    random_state=42)

In [13]:
# Train
f_train = fe.FeatureExtraction(data = X_train, 
                                text_column = 'tweets', 
                                lemma = True, 
                                pos = True, 
                                tag = True,                                
                                other_features = True,
                                vectorizer = vectorizer,
                                oth_feats_vectorizer = oth_feats_vectorizer,
                                language = 'es')

X = f_train.add_features_to_test()

In [14]:
# Test
f_train = fe.FeatureExtraction(data = X_test, 
                                text_column = 'tweets', 
                                lemma = True, 
                                pos = True, 
                                tag = True,                                
                                other_features = True,
                                vectorizer = vectorizer,
                                oth_feats_vectorizer = oth_feats_vectorizer,
                                language = 'es')

x = f_train.add_features_to_test()

In [15]:
y_train = y_train.tolist()
y_test = y_test.tolist()

### Unigramas

In [25]:
models_count = [LogisticRegression(C=1.0, max_iter=1000, penalty='l2', random_state=42), 
          RandomForestClassifier(max_depth=50, n_estimators=10,random_state=42), 
          CalibratedClassifierCV(LinearSVC(C=0.1, max_iter=100000, penalty='l2', random_state=42)), 
          KNeighborsClassifier(n_neighbors=3, weights='distance')]
models_tfidf = [LogisticRegression(C=10.0, max_iter=1000, penalty='l2', random_state=42), 
          RandomForestClassifier(max_depth=100, n_estimators=10,random_state=42), 
          CalibratedClassifierCV(LinearSVC(C=1.0, max_iter=1000, penalty='l2', random_state=42)), 
          KNeighborsClassifier(n_neighbors=3, weights='distance')]
model_names = ['Logistic Regression', 'Random Forest', 'Linear SVC', 'KNN']


In [26]:
pca = PCA(n_components=1000, random_state=42)
vectorizers = [CountVectorizer(ngram_range = (1,1), lowercase = False), 
              TfidfVectorizer(ngram_range = (1,1), lowercase = False)]

In [27]:
for idx in range(2):
    if idx == 0:
        print('Count Vectorizer')
    else:
        print('TF-IDF Vectorizer')

    for i, model in enumerate(models_count):
        count_v = vectorizers[idx].fit(X['tweets'])

        # Transforma tus datos de entrenamiento y prueba utilizando el vectorizador 
        # CountVectorizer ajustado
        X_train_count = count_v.transform(X['tweets'])
        X_test_count = count_v.transform(x['tweets'])

        # Ajusta el modelo PCA con los datos de entrenamiento
        pca.fit(X_train_count.toarray())

        # Transforma tus datos de entrenamiento y prueba utilizando el modelo PCA ajustado
        X_train_pca = pca.transform(X_train_count.toarray())
        X_test_pca = pca.transform(X_test_count.toarray())

        model.fit(X_train_pca, y_train)
        y_pred = model.predict(X_test_pca)
        print(f'Model: {model_names[i]}, f1: {round(f1_score(y_test, y_pred, average="macro"), 4)}')

Count Vectorizer
Model: Logistic Regression, f1: 0.709
Model: Random Forest, f1: 0.3876
Model: Linear SVC, f1: 0.6907
Model: KNN, f1: 0.4101
TF-IDF Vectorizer
Model: Logistic Regression, f1: 0.6455
Model: Random Forest, f1: 0.381
Model: Linear SVC, f1: 0.6858
Model: KNN, f1: 0.506


### Unigramas y bigramas

In [28]:
models_count = [LogisticRegression(C=0.1, max_iter=1000, penalty='l2', random_state=42), 
          RandomForestClassifier(max_depth=50, n_estimators=10,random_state=42), 
          CalibratedClassifierCV(LinearSVC(C=0.1, max_iter=100000, penalty='l2', random_state=42)),  
          KNeighborsClassifier(n_neighbors=3, weights='distance')]
models_tfidf = [LogisticRegression(C=10.0, max_iter=1000, penalty='l2', random_state=42), 
          RandomForestClassifier(max_depth=100, n_estimators=10,random_state=42), 
          CalibratedClassifierCV(LinearSVC(C=1.0, max_iter=1000, penalty='l2', random_state=42)),  
          KNeighborsClassifier(n_neighbors=3, weights='distance')]
model_names = ['Logistic Regression', 'Random Forest', 'Linear SVC', 'KNN']
vectorizers = [CountVectorizer(ngram_range = (1,2), lowercase = False), 
              TfidfVectorizer(ngram_range = (1,2), lowercase = False)]

In [29]:
for idx in range(2):
    if idx == 0:
        print('Count Vectorizer')
    else:
        print('TF-IDF Vectorizer')

    for i, model in enumerate(models_count):
        count_v = vectorizers[idx].fit(X['tweets'])

        # Transforma tus datos de entrenamiento y prueba utilizando el vectorizador 
        # CountVectorizer ajustado
        X_train_count = count_v.transform(X['tweets'])
        X_test_count = count_v.transform(x['tweets'])

        # Ajusta el modelo PCA con los datos de entrenamiento
        pca.fit(X_train_count.toarray())

        # Transforma tus datos de entrenamiento y prueba utilizando el modelo PCA ajustado
        X_train_pca = pca.transform(X_train_count.toarray())
        X_test_pca = pca.transform(X_test_count.toarray())

        model.fit(X_train_pca, y_train)
        y_pred = model.predict(X_test_pca)
        print(f'Model: {model_names[i]}, f1: {round(f1_score(y_test, y_pred, average="macro"), 4)}')

Count Vectorizer
Model: Logistic Regression, f1: 0.6842
Model: Random Forest, f1: 0.39
Model: Linear SVC, f1: 0.6605
Model: KNN, f1: 0.3924
TF-IDF Vectorizer
Model: Logistic Regression, f1: 0.3826
Model: Random Forest, f1: 0.3445
Model: Linear SVC, f1: 0.6054
Model: KNN, f1: 0.4092
