In [1]:
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WordPunctTokenizer
from bs4 import BeautifulSoup
import re
# Corpus read by stopwords.words('english') inside preprocess_tweet_text.
nltk.download('stopwords')
# NOTE(review): no code visible in this notebook appears to use the 'punkt'
# or 'rslp' resources — confirm before removing these downloads.
nltk.download('punkt')
nltk.download('rslp')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package rslp to /root/nltk_data...
[nltk_data]   Unzipping stemmers/rslp.zip.


True

In [2]:
# Patterns are compiled once so that applying the function to a whole column
# does not redo this work for every tweet.
# NOTE(review): the mention pattern stops at the first character outside
# [A-Za-z0-9], so for '@user_name' only '@user' is removed. The patterns are
# kept unchanged so that inference-time cleaning stays identical to whatever
# produced the training text — confirm before widening them.
_MENTION_OR_URL_RE = re.compile(r'@[A-Za-z0-9]+|https?://[A-Za-z0-9./]+')
_NON_LETTER_RE = re.compile(r'[^a-zA-Z]')

# Filled on first use; holds the tokenizer and the English stop-word set.
_PREPROCESS_RESOURCES = {}


def _preprocess_resources() -> dict:
    """Return the shared tokenizer and stop-word set, creating them only once."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['tokenizer'] = WordPunctTokenizer()
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _PREPROCESS_RESOURCES


def preprocess_tweet_text(text: str) -> str:
    """
    Clean and pre-process a single tweet.

    Steps, in order:
      1. Parse the text with BeautifulSoup (lxml parser) and keep only its text
         content, which drops markup and decodes HTML entities.
      2. Remove @mentions and http(s) URLs.
      3. Replace every character that is not an ASCII letter with a space.
      4. Lower-case, tokenize, and drop NLTK English stop words.

    Args:
        text: The raw tweet.

    Returns:
        The remaining lower-case words joined by single spaces (possibly an
        empty string when nothing survives the cleaning).
    """
    resources = _preprocess_resources()

    # Strip markup / decode HTML entities.
    plain_text = BeautifulSoup(text, 'lxml').get_text()

    # Remove mentions and URLs. The previous `.decode("utf-8-sig")` step was
    # dropped: `str` has no `.decode` in Python 3, so it always fell into a
    # bare `except` and left the text untouched.
    without_handles = _MENTION_OR_URL_RE.sub('', plain_text)

    # Keep ASCII letters only, then normalise case.
    lower_case = _NON_LETTER_RE.sub(' ', without_handles).lower()

    # Tokenize and filter out stop words.
    stop_words = resources['stop_words']
    words = [
        word
        for word in resources['tokenizer'].tokenize(lower_case)
        if word not in stop_words
    ]

    return ' '.join(words).strip()



# Dataset 01: Sentiment140

In [3]:
# Path of the CSV file passed to pd.read_csv in the next cell (Dataset 01).
# The empty string is a placeholder and must be filled in before running.
path_to_dataset_01 = ''

In [None]:
# Load the dataset. The code below reads the columns 'category' (label) and
# 'cleaned_text' (already pre-processed tweet text).
df = pd.read_csv(path_to_dataset_01)

# Keep only the two classes used here (category 4 and category 0) and drop
# rows with missing text: CountVectorizer rejects NaN documents, and empty
# strings written to CSV are read back as NaN.
binary_df = df[df['category'].isin([0, 4])].dropna(subset=['cleaned_text'])

missing_labels = {0, 4} - set(binary_df['category'].unique())
if missing_labels:
    raise ValueError(f"No usable rows for category value(s): {sorted(missing_labels)}")

# Split BEFORE resampling. Sampling with replacement first and splitting
# afterwards puts copies of the same tweet in both train and test, which
# leaks test data into training and inflates the reported accuracy.
train_df, test_df = train_test_split(
    binary_df,
    test_size=0.2,
    stratify=binary_df['category'],
    random_state=42,
)

# Balance the training set only: both classes are brought to the size of the
# larger one; replacement is used only when a class is too small.
positive_df = train_df[train_df['category'] == 4]
negative_df = train_df[train_df['category'] == 0]
n = max(len(positive_df), len(negative_df))

positive_samples = positive_df.sample(n=n, replace=len(positive_df) < n, random_state=42)
negative_samples = negative_df.sample(n=n, replace=len(negative_df) < n, random_state=42)

# Combine the per-class samples into a single training frame.
df_sample = pd.concat([positive_samples, negative_samples])

X_train, y_train = df_sample['cleaned_text'], df_sample['category']
X_test, y_test = test_df['cleaned_text'], test_df['category']

# Bag-of-words counts -> TF-IDF weighting -> linear SVM.
# The pipeline is bound to `text_clf_01`, the name the test cell at the end
# of the notebook uses (previously it was assigned to `text_clf` while
# `text_clf_01.fit` was called, raising NameError).
text_clf_01 = Pipeline([('vect', CountVectorizer()),
                        ('tfidf', TfidfTransformer()),
                        ('clf', LinearSVC())])

text_clf_01.fit(X_train, y_train)

y_pred = text_clf_01.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
sn.heatmap(cm, cmap='Blues', annot=True, fmt='g')

print(accuracy_score(y_test, y_pred))
print(cm)

# Dataset 02: Tweet Emotions

In [5]:
# Path of the CSV file passed to pd.read_csv in the next cell (Dataset 02).
# The empty string is a placeholder and must be filled in before running.
path_to_dataset_02 = ''

In [None]:
# Load the dataset. The code below reads the columns 'category' (label) and
# 'cleaned_text' (already pre-processed tweet text).
df = pd.read_csv(path_to_dataset_02)

# NOTE(review): this assumes the labels of this dataset were already mapped
# to the values 4 (positive) and 0 (negative) — confirm against the CSV.
# Rows with missing text are dropped: CountVectorizer rejects NaN documents,
# and empty strings written to CSV are read back as NaN.
binary_df = df[df['category'].isin([0, 4])].dropna(subset=['cleaned_text'])

missing_labels = {0, 4} - set(binary_df['category'].unique())
if missing_labels:
    raise ValueError(f"No usable rows for category value(s): {sorted(missing_labels)}")

# Split BEFORE resampling. Sampling with replacement first and splitting
# afterwards puts copies of the same tweet in both train and test, which
# leaks test data into training and inflates the reported accuracy.
train_df, test_df = train_test_split(
    binary_df,
    test_size=0.2,
    stratify=binary_df['category'],
    random_state=42,
)

# Balance the training set only. `n` is computed from THIS dataset; it was
# previously left over from the cell that processed the other dataset.
positive_df = train_df[train_df['category'] == 4]
negative_df = train_df[train_df['category'] == 0]
n = max(len(positive_df), len(negative_df))

positive_samples = positive_df.sample(n=n, replace=len(positive_df) < n, random_state=42)
negative_samples = negative_df.sample(n=n, replace=len(negative_df) < n, random_state=42)

# Combine the per-class samples into a single training frame.
df_sample = pd.concat([positive_samples, negative_samples])

X_train, y_train = df_sample['cleaned_text'], df_sample['category']
X_test, y_test = test_df['cleaned_text'], test_df['category']

# Bag-of-words counts -> TF-IDF weighting -> linear SVM.
# The pipeline is bound to `text_clf_02`, the name the test cell at the end
# of the notebook uses (previously it was assigned to `text_clf` while
# `text_clf_02.fit` was called, raising NameError).
text_clf_02 = Pipeline([('vect', CountVectorizer()),
                        ('tfidf', TfidfTransformer()),
                        ('clf', LinearSVC())])

text_clf_02.fit(X_train, y_train)

y_pred = text_clf_02.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
sn.heatmap(cm, cmap='Blues', annot=True, fmt='g')

print(accuracy_score(y_test, y_pred))
print(cm)



# Testes

In [None]:
# Manual sanity check: classify one free-form text with both trained models.
text = 'Paste Any Text Here'

# Clean the text once and reuse it for both classifiers.
cleaned_text = preprocess_tweet_text(text)

# A cell only displays its final expression, so both predictions are gathered
# into one value; previously the first prediction was computed and discarded.
{
    'dataset_01': text_clf_01.predict([cleaned_text])[0],
    'dataset_02': text_clf_02.predict([cleaned_text])[0],
}