## Imports

In [1]:
!git clone https://github.com/CCogS-Mx/text-preprocessing.git

fatal: destination path 'text-preprocessing' already exists and is not an empty directory.


In [1]:
import sys

# Register the local script folder and the cloned repository's script folder
# on the module search path, in that order, so the imports below can find them.
sys.path.extend(['Scripts/', 'text-preprocessing/script/'])

In [2]:
import os
import pandas as pd
import numpy as np

# Import classes
import text_preprocessing as tp
import feature_extraction as fe
import robust_classic_model as rcm
import cleaning_twitter_data as ctd

# Algoritmos de ML
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split

# Vectorizadores
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\qk_le\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\qk_le\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\qk_le\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Data

In [3]:
# Read the labelled training file and the unlabelled track-1 test file, then
# drop the 'index' / 'Unnamed: 0' column that each file carries.
training_raw = pd.read_csv('Data/homomex_training.csv')
training = training_raw.drop(columns='index')

test_raw = pd.read_csv('Data/track1_test_no_labels.csv')
test = test_raw.drop(columns='Unnamed: 0')

In [4]:
# Preprocessor from the `text_preprocessing` module, constructed with
# 'spanish'; it is reused for both the training and the test frames below.
prep = tp.Preprocessing('spanish')

In [5]:
# Preprocess the 'tweets' column of the training frame.  The tweet, tweet_tags
# and translate_emojis options are switched on; stop-word removal and
# lemmatization are switched off.
training_clean_options = {
    'tweet': True,
    'tweet_tags': True,
    'remove_stop_words': False,
    'lemmatize': False,
    'translate_emojis': True,
}
training_p = prep.main_preprocess(data=training, column='tweets', **training_clean_options)

In [21]:
# Encode the string labels as integer class ids; rows without a label are
# assigned to the 'NR' class first.
# `dictionary_list` keeps its list-of-one-dict shape because later cells index
# into it to turn predicted ids back into label names.
dictionary_list = [{'P': 0, 'NP': 1, 'NR': 2}]

# The encoded column is assigned back to the frame.  Calling
# `training_p.label.replace(..., inplace=True)` acts on an intermediate Series
# and does not update `training_p` when pandas Copy-on-Write is active, which
# would silently leave the string labels in place.
training_p['label'] = training_p['label'].fillna('NR').replace(dictionary_list[0])


In [7]:
# Preprocess the 'content' column of the test frame with the same option
# values that were used for the training tweets.
test_clean_options = {
    'tweet': True,
    'tweet_tags': True,
    'remove_stop_words': False,
    'lemmatize': False,
    'translate_emojis': True,
}
test_p = prep.main_preprocess(data=test, column='content', **test_clean_options)

## Improving best classifiers with training data only

In [13]:
from sklearn.metrics import f1_score
from sklearn.calibration import CalibratedClassifierCV

In [11]:
# Two separate unigram count vectorizers with identical settings (case is
# preserved): one is passed as `vectorizer` and the other as
# `oth_feats_vectorizer` to the feature-extraction cells below.
unigram_count_options = {'ngram_range': (1, 1), 'lowercase': False}
vectorizer = CountVectorizer(**unigram_count_options)
oth_feats_vectorizer = CountVectorizer(**unigram_count_options)

In [12]:
# Hold out 5% of the preprocessed training frame.  The whole frame is split
# (not just the text column), so both parts keep every column; the 'label'
# column is split alongside it as the target.  The seed is fixed so the split
# is reproducible.
split_parts = train_test_split(
    training_p,
    training_p['label'],
    test_size=0.05,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [13]:
# Train
# Feature extraction over the training part of the split, with the lemma, pos,
# tag and other_features switches all enabled.
train_split_options = dict(
    text_column='tweets',
    lemma=True,
    pos=True,
    tag=True,
    other_features=True,
    vectorizer=vectorizer,
    oth_feats_vectorizer=oth_feats_vectorizer,
    language='es',
)
f_train = fe.FeatureExtraction(data=X_train, **train_split_options)

# NOTE(review): the training part also goes through `add_features_to_test()`;
# confirm against feature_extraction that this is the intended entry point.
X = f_train.add_features_to_test()

In [14]:
# Test
# Same feature extraction, applied to the held-out part of the split.
# NOTE(review): the extractor is stored under the name `f_train` again, which
# replaces the extractor built for the training part.
heldout_split_options = dict(
    text_column='tweets',
    lemma=True,
    pos=True,
    tag=True,
    other_features=True,
    vectorizer=vectorizer,
    oth_feats_vectorizer=oth_feats_vectorizer,
    language='es',
)
f_train = fe.FeatureExtraction(data=X_test, **heldout_split_options)

x = f_train.add_features_to_test()

In [15]:
# Turn the target Series into plain Python lists for the model fitting and
# scoring cells.  Going through `np.asarray` keeps this cell re-runnable: a
# second `y_train.tolist()` would raise AttributeError because the name is
# already bound to a list.
y_train = np.asarray(y_train).tolist()
y_test = np.asarray(y_test).tolist()

### Unigramas

#### TP-LPTO

In [20]:
# Estimators evaluated on unigram features, one list per vectorization scheme.
# Both lists follow the order of `model_names`, which the evaluation cell uses
# when printing the scores.
models_count = [
    LogisticRegression(penalty='l2', C=1.0, max_iter=1000, random_state=42),
    RandomForestClassifier(n_estimators=50, max_depth=100, random_state=42),
    CalibratedClassifierCV(
        LinearSVC(penalty='l2', C=0.1, max_iter=10000, random_state=42)
    ),
    MultinomialNB(),
    KNeighborsClassifier(n_neighbors=3, weights='distance'),
]
models_tfidf = [
    LogisticRegression(penalty='l2', C=10.0, max_iter=1000, random_state=42),
    RandomForestClassifier(n_estimators=50, max_depth=100, random_state=42),
    CalibratedClassifierCV(
        LinearSVC(penalty='l2', C=0.1, max_iter=1000, random_state=42)
    ),
    MultinomialNB(),
    KNeighborsClassifier(n_neighbors=3, weights='distance'),
]
model_names = [
    'Logistic Regression',
    'Random Forest',
    'Linear SVC',
    'Multinomial Naive Bayes',
    'KNN',
]


In [21]:
# Score every estimator on the held-out split, once per vectorization scheme,
# and print the macro-averaged F1.
# The vectorizer depends only on the text, so it is fitted once per scheme
# instead of once per model, and the two formerly duplicated branches (whose
# inner loop also shadowed the outer loop variable `i`) are driven by one table.
experiments = [
    ('Count Vectorizer', CountVectorizer, models_count),
    ('TF-IDF Vectorizer', TfidfVectorizer, models_tfidf),
]
for scheme_name, vectorizer_cls, scheme_models in experiments:
    print(scheme_name)
    text_vectorizer = vectorizer_cls(ngram_range=(1, 1), lowercase=False)
    # Vocabulary is learned from the training part only.
    train_matrix = text_vectorizer.fit_transform(X['tweets'])
    test_matrix = text_vectorizer.transform(x['tweets'])
    for model_name, model in zip(model_names, scheme_models):
        model.fit(train_matrix, y_train)
        y_pred = model.predict(test_matrix)
        print(f'Model: {model_name}, f1: {round(f1_score(y_test, y_pred, average="macro"), 4)}')

Count Vectorizer
Model: Logistic Regression, f1: 0.7486
Model: Random Forest, f1: 0.5469




Model: Linear SVC, f1: 0.7566
Model: Multinomial Naive Bayes, f1: 0.5735
Model: KNN, f1: 0.3919
TF-IDF Vectorizer
Model: Logistic Regression, f1: 0.7002
Model: Random Forest, f1: 0.5157
Model: Linear SVC, f1: 0.6962
Model: Multinomial Naive Bayes, f1: 0.3055
Model: KNN, f1: 0.4473


#### TP

In [7]:
# Load the frame stored in task1_preprocessed.csv and encode its labels as
# integer class ids; rows without a label are assigned to the 'NR' class first.
data_prep = pd.read_csv('Data/task1_preprocessed.csv')
dictionary_list = [{'P': 0, 'NP': 1, 'NR': 2}]

# Assign the encoded column back to the frame: `data_prep.label.replace(...,
# inplace=True)` acts on an intermediate Series and does not update
# `data_prep` when pandas Copy-on-Write is active.
data_prep['label'] = data_prep['label'].fillna('NR').replace(dictionary_list[0])

# Hold out 5% of the rows with a fixed seed.
# NOTE: this rebinds X_train / X_test / y_train / y_test, which were previously
# produced by the split of `training_p`.
X_train, X_test, y_train, y_test = train_test_split(data_prep,
                                                    data_prep['label'],
                                                    test_size=0.05,
                                                    random_state=42)

In [9]:
# Turn the target Series into plain Python lists for the model fitting and
# scoring cells.  Going through `np.asarray` keeps this cell re-runnable: a
# second `y_train.tolist()` would raise AttributeError because the name is
# already bound to a list.
y_train = np.asarray(y_train).tolist()
y_test = np.asarray(y_test).tolist()

In [15]:
# Estimators evaluated on unigram features of the preprocessed text, one list
# per vectorization scheme.  Both lists follow the order of `model_names`.
models_count = [
    LogisticRegression(penalty='l2', C=1.0, max_iter=1000, random_state=42),
    RandomForestClassifier(n_estimators=50, max_depth=100, random_state=42),
    CalibratedClassifierCV(
        LinearSVC(penalty='l2', C=0.1, max_iter=10000, random_state=42)
    ),
    KNeighborsClassifier(n_neighbors=3, weights='distance'),
]
models_tfidf = [
    LogisticRegression(penalty='l2', C=10.0, max_iter=1000, random_state=42),
    RandomForestClassifier(n_estimators=100, max_depth=100, random_state=42),
    CalibratedClassifierCV(
        LinearSVC(penalty='l2', C=1.0, max_iter=1000, random_state=42)
    ),
    KNeighborsClassifier(n_neighbors=3, weights='distance'),
]
model_names = [
    'Logistic Regression',
    'Random Forest',
    'Linear SVC',
    'KNN',
]

In [16]:
# Score every estimator on the held-out split, once per vectorization scheme,
# and print the macro-averaged F1.
# The vectorizer depends only on the text, so it is fitted once per scheme
# instead of once per model, and the two formerly duplicated branches (whose
# inner loop also shadowed the outer loop variable `i`) are driven by one table.
experiments = [
    ('Count Vectorizer', CountVectorizer, models_count),
    ('TF-IDF Vectorizer', TfidfVectorizer, models_tfidf),
]
for scheme_name, vectorizer_cls, scheme_models in experiments:
    print(scheme_name)
    text_vectorizer = vectorizer_cls(ngram_range=(1, 1), lowercase=False)
    # Vocabulary is learned from the training part only.
    train_matrix = text_vectorizer.fit_transform(X_train['tweets'])
    test_matrix = text_vectorizer.transform(X_test['tweets'])
    for model_name, model in zip(model_names, scheme_models):
        model.fit(train_matrix, y_train)
        y_pred = model.predict(test_matrix)
        print(f'Model: {model_name}, f1: {round(f1_score(y_test, y_pred, average="macro"), 4)}')

Count Vectorizer
Model: Logistic Regression, f1: 0.7069
Model: Random Forest, f1: 0.6849
Model: Linear SVC, f1: 0.6879
Model: KNN, f1: 0.577
TF-IDF Vectorizer
Model: Logistic Regression, f1: 0.6978
Model: Random Forest, f1: 0.6502
Model: Linear SVC, f1: 0.6924
Model: KNN, f1: 0.6231


### Unigramas y bigramas

#### TP-LPTO

In [22]:
# Estimators evaluated on unigram+bigram features, one list per vectorization
# scheme.  Both lists follow the order of `model_names`, which the evaluation
# cell uses when printing the scores.
models_count = [
    LogisticRegression(penalty='l2', C=10.0, max_iter=1000, random_state=42),
    RandomForestClassifier(n_estimators=10, max_depth=100, random_state=42),
    CalibratedClassifierCV(
        LinearSVC(penalty='l2', C=0.1, max_iter=10000, random_state=42)
    ),
    MultinomialNB(),
    KNeighborsClassifier(n_neighbors=3, weights='distance'),
]
models_tfidf = [
    LogisticRegression(penalty='l2', C=1000.0, max_iter=1000, random_state=42),
    RandomForestClassifier(n_estimators=10, max_depth=100, random_state=42),
    CalibratedClassifierCV(
        LinearSVC(penalty='l2', C=0.1, max_iter=1000, random_state=42)
    ),
    MultinomialNB(),
    KNeighborsClassifier(n_neighbors=3, weights='distance'),
]
model_names = [
    'Logistic Regression',
    'Random Forest',
    'Linear SVC',
    'Multinomial Naive Bayes',
    'KNN',
]

In [23]:
# Score every estimator on the held-out split with unigram+bigram features,
# once per vectorization scheme, and print the macro-averaged F1.
#
# Targets are read from the same frames as the text.  When the notebook runs
# top to bottom, `y_train` / `y_test` have been rebound by the split of
# `data_prep` in the preceding "TP" section, so they no longer belong to
# `X` / `x` (which come from the split of `training_p`).
# Assumes `add_features_to_test()` keeps the 'label' column, as the final
# training cell also relies on — TODO confirm.
train_labels = X['label'].tolist()
test_labels = x['label'].tolist()

# The vectorizer depends only on the text, so it is fitted once per scheme
# instead of once per model, and the two formerly duplicated branches (whose
# inner loop also shadowed the outer loop variable `i`) are driven by one table.
experiments = [
    ('Count Vectorizer', CountVectorizer, models_count),
    ('TF-IDF Vectorizer', TfidfVectorizer, models_tfidf),
]
for scheme_name, vectorizer_cls, scheme_models in experiments:
    print(scheme_name)
    text_vectorizer = vectorizer_cls(ngram_range=(1, 2), lowercase=False)
    # Vocabulary is learned from the training part only.
    train_matrix = text_vectorizer.fit_transform(X['tweets'])
    test_matrix = text_vectorizer.transform(x['tweets'])
    for model_name, model in zip(model_names, scheme_models):
        model.fit(train_matrix, train_labels)
        y_pred = model.predict(test_matrix)
        print(f'Model: {model_name}, f1: {round(f1_score(test_labels, y_pred, average="macro"), 4)}')

Count Vectorizer
Model: Logistic Regression, f1: 0.7197
Model: Random Forest, f1: 0.4927
Model: Linear SVC, f1: 0.698
Model: Multinomial Naive Bayes, f1: 0.465
Model: KNN, f1: 0.4285
TF-IDF Vectorizer
Model: Logistic Regression, f1: 0.7006
Model: Random Forest, f1: 0.4887
Model: Linear SVC, f1: 0.5804
Model: Multinomial Naive Bayes, f1: 0.2537
Model: KNN, f1: 0.464


#### TP

In [19]:
# Estimators evaluated on unigram+bigram features of the preprocessed text,
# one list per vectorization scheme.  Both lists follow the order of
# `model_names`.
models_count = [
    LogisticRegression(penalty='l2', C=10.0, max_iter=1000, random_state=42),
    RandomForestClassifier(n_estimators=10, max_depth=100, random_state=42),
    CalibratedClassifierCV(
        LinearSVC(penalty='l2', C=0.1, max_iter=10000, random_state=42)
    ),
    KNeighborsClassifier(n_neighbors=3, weights='distance'),
]
models_tfidf = [
    LogisticRegression(penalty='l2', C=1000.0, max_iter=1000, random_state=42),
    RandomForestClassifier(n_estimators=10, max_depth=100, random_state=42),
    CalibratedClassifierCV(
        LinearSVC(penalty='l2', C=10.0, max_iter=1000, random_state=42)
    ),
    KNeighborsClassifier(n_neighbors=3, weights='distance'),
]
model_names = [
    'Logistic Regression',
    'Random Forest',
    'Linear SVC',
    'KNN',
]

In [21]:
# Score every estimator on the held-out split with unigram+bigram features,
# once per vectorization scheme, and print the macro-averaged F1.
# The vectorizer depends only on the text, so it is fitted once per scheme
# instead of once per model, and the two formerly duplicated branches (whose
# inner loop also shadowed the outer loop variable `i`) are driven by one table.
experiments = [
    ('Count Vectorizer', CountVectorizer, models_count),
    ('TF-IDF Vectorizer', TfidfVectorizer, models_tfidf),
]
for scheme_name, vectorizer_cls, scheme_models in experiments:
    print(scheme_name)
    text_vectorizer = vectorizer_cls(ngram_range=(1, 2), lowercase=False)
    # Vocabulary is learned from the training part only.
    train_matrix = text_vectorizer.fit_transform(X_train['tweets'])
    test_matrix = text_vectorizer.transform(X_test['tweets'])
    for model_name, model in zip(model_names, scheme_models):
        model.fit(train_matrix, y_train)
        y_pred = model.predict(test_matrix)
        print(f'Model: {model_name}, f1: {round(f1_score(y_test, y_pred, average="macro"), 4)}')

Count Vectorizer
Model: Logistic Regression, f1: 0.7453
Model: Random Forest, f1: 0.5788
Model: Linear SVC, f1: 0.7314
Model: KNN, f1: 0.5276
TF-IDF Vectorizer
Model: Logistic Regression, f1: 0.7169
Model: Random Forest, f1: 0.5984
Model: Linear SVC, f1: 0.7288
Model: KNN, f1: 0.6553


### Unigramas, bigramas y trigramas

#### TP

In [26]:
# Estimators evaluated on unigram+bigram+trigram features of the preprocessed
# text, one list per vectorization scheme.  Both lists follow the order of
# `model_names`.
models_count = [
    LogisticRegression(penalty='l2', C=100.0, max_iter=1000, random_state=42),
    RandomForestClassifier(n_estimators=10, max_depth=100, random_state=42),
    CalibratedClassifierCV(
        LinearSVC(penalty='l2', C=0.1, max_iter=10000, random_state=42)
    ),
    KNeighborsClassifier(n_neighbors=3, weights='distance'),
]
models_tfidf = [
    LogisticRegression(penalty='l2', C=1000.0, max_iter=1000, random_state=42),
    RandomForestClassifier(n_estimators=10, max_depth=100, random_state=42),
    CalibratedClassifierCV(
        LinearSVC(penalty='l2', C=1000.0, max_iter=100000, random_state=42)
    ),
    KNeighborsClassifier(n_neighbors=3, weights='distance'),
]
model_names = [
    'Logistic Regression',
    'Random Forest',
    'Linear SVC',
    'KNN',
]

In [27]:
# Score every estimator on the held-out split with 1- to 3-gram features,
# once per vectorization scheme, and print the macro-averaged F1.
# The vectorizer depends only on the text, so it is fitted once per scheme
# instead of once per model, and the two formerly duplicated branches (whose
# inner loop also shadowed the outer loop variable `i`) are driven by one table.
experiments = [
    ('Count Vectorizer', CountVectorizer, models_count),
    ('TF-IDF Vectorizer', TfidfVectorizer, models_tfidf),
]
for scheme_name, vectorizer_cls, scheme_models in experiments:
    print(scheme_name)
    text_vectorizer = vectorizer_cls(ngram_range=(1, 3), lowercase=False)
    # Vocabulary is learned from the training part only.
    train_matrix = text_vectorizer.fit_transform(X_train['tweets'])
    test_matrix = text_vectorizer.transform(X_test['tweets'])
    for model_name, model in zip(model_names, scheme_models):
        model.fit(train_matrix, y_train)
        y_pred = model.predict(test_matrix)
        print(f'Model: {model_name}, f1: {round(f1_score(y_test, y_pred, average="macro"), 4)}')

Count Vectorizer
Model: Logistic Regression, f1: 0.7248
Model: Random Forest, f1: 0.4825
Model: Linear SVC, f1: 0.7312
Model: KNN, f1: 0.4635
TF-IDF Vectorizer
Model: Logistic Regression, f1: 0.7125
Model: Random Forest, f1: 0.5204
Model: Linear SVC, f1: 0.7449
Model: KNN, f1: 0.6527


## Best classifier

In [8]:
# Final configuration: a calibrated linear SVM, plus two separate unigram
# count vectorizers (case preserved) that are handed to the feature-extraction
# cells below.
model = CalibratedClassifierCV(
    LinearSVC(penalty='l2', C=0.1, max_iter=10000, random_state=42)
)
final_count_options = {'ngram_range': (1, 1), 'lowercase': False}
vectorizer = CountVectorizer(**final_count_options)
oth_feats_vectorizer = CountVectorizer(**final_count_options)

In [9]:
# Train
# Feature extraction over the complete preprocessed training frame, with the
# lemma, pos, tag and other_features switches all enabled.
full_train_options = dict(
    text_column='tweets',
    lemma=True,
    pos=True,
    tag=True,
    other_features=True,
    vectorizer=vectorizer,
    oth_feats_vectorizer=oth_feats_vectorizer,
    language='es',
)
f_train = fe.FeatureExtraction(data=training_p, **full_train_options)

# Targets are taken from the 'label' column of the returned frame so that they
# stay aligned with its rows.
X_train = f_train.add_features_to_test()
y_train = X_train.label.tolist()

In [10]:
# Test
# Same feature extraction for the preprocessed competition test frame, whose
# text lives in the 'content' column.
full_test_options = dict(
    text_column='content',
    lemma=True,
    pos=True,
    tag=True,
    other_features=True,
    vectorizer=vectorizer,
    oth_feats_vectorizer=oth_feats_vectorizer,
    language='es',
)
f_test = fe.FeatureExtraction(data=test_p, **full_test_options)

X_test = f_test.add_features_to_test()

In [11]:
# Vectorize the text columns returned by the two feature-extraction cells with
# a unigram count model (case preserved).  The vocabulary is fitted on the
# training column and then applied to the test column, so these statements
# must run in this order.
count_v = CountVectorizer(ngram_range = (1,1), lowercase = False)
X_Train = count_v.fit_transform(X_train['tweets'])
X_Test = count_v.transform(X_test['content'])
# Fit the selected classifier on the vectorized training data.
model.fit(X_Train, y_train)

In [12]:
# Predict one class id per row of the vectorized test matrix.
# The whole matrix is scored in a single call instead of one `predict_proba`
# call per row.  `predict` returns the entry of `model.classes_` with the
# highest probability, whereas the previous argmax over `predict_proba` gave
# the column position, which only equals the class id when the fitted classes
# are exactly 0..k-1.  Dropping the loop also stops the loop variable `test`
# from overwriting the raw test DataFrame of the same name.
y_pred = model.predict(X_Test).tolist()

In [13]:
# Quick look at the first label name stored in the encoding dictionary.
next(iter(dictionary_list[0]))

'P'

In [22]:
# Build the submission frame with three columns: task name, tweet identifier
# and predicted class label.
# Class ids are decoded through an explicit id -> label mapping instead of
# positional indexing into `list(dictionary_list[0])`, which only works while
# the dictionary's insertion order happens to match its values and rebuilt the
# list for every row.
id_to_label = {class_id: label for label, class_id in dictionary_list[0].items()}
results = pd.DataFrame({
    'TaskName': ['LGBTphobiaDetectionMultiClass'] * len(y_pred),
    # Identifiers are the 1-based row positions, stored as strings.
    'IdentifierOfATweet': [str(position + 1) for position in range(len(y_pred))],
    'Class': [id_to_label[class_id] for class_id in y_pred],
})

In [23]:
# Preview the first rows of the submission frame.
results.head()

Unnamed: 0,TaskName,IdentifierOfATweet,Class
0,LGBTphobiaDetectionMultiClass,1,NP
1,LGBTphobiaDetectionMultiClass,2,NP
2,LGBTphobiaDetectionMultiClass,3,NP
3,LGBTphobiaDetectionMultiClass,4,NP
4,LGBTphobiaDetectionMultiClass,5,


In [26]:
import csv

# Write the submission file: tab-separated, no header row, no index column,
# every field quoted.
# The output directory is created first so the export does not fail on a fresh
# checkout where 'Results/' does not exist yet.
os.makedirs('Results', exist_ok=True)
# NOTE(review): escapechar=" " is kept from the original call; confirm it is
# required by the submission format.
results.to_csv('Results/results_task1.txt', sep='\t', index=False, header=False, quoting=csv.QUOTE_ALL, escapechar=" ")


In [None]:
TP-LPTO-1GC-GS-LR

Text Preprocessing (TP)
We add the following features: Lemmatization, POS, Tags and Other features (LPTO)
We vectorize the text by count for unigrams (1GC)
We search for the best hyperparameters of several models with a Grid Search (GS); the best result was obtained with Logistic Regression (LR)

Centro de Investigación en Computación, Instituto Politécnico Nacional