# Preprocessing tweets for NLP analysis

### Synchronizing with Team Collaboration Folder in Google Drive

In [None]:
# Mount Google Drive at /content/drive so the CSV paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install emot vader-multi

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Importing fundamental libraries

In [None]:
import pandas as pd
import nltk 
import string
import re
from emot import emo_unicode
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

### Dropping rows with duplicate usernames

In [None]:
# Folder with the raw monthly captures; kept in one place so a moved folder needs one edit.
RAW_CAPTURES_DIR = '/content/drive/MyDrive/Projeto Tera: Eleição/Capturas brutas'

# August
lula_tweets_august = pd.read_csv(f'{RAW_CAPTURES_DIR}/lula_tweets_agosto.csv')
bolsonaro_tweets_august = pd.read_csv(f'{RAW_CAPTURES_DIR}/bolsonaro_tweets_agosto.csv')

# September
lula_tweets_september = pd.read_csv(f'{RAW_CAPTURES_DIR}/lula_tweets_setembro.csv')
bolsonaro_tweets_september = pd.read_csv(f'{RAW_CAPTURES_DIR}/bolsonaro_tweets_setembro.csv')

# October
lula_tweets_october = pd.read_csv(f'{RAW_CAPTURES_DIR}/lula_tweets_outubro.csv')
bolsonaro_tweets_october = pd.read_csv(f'{RAW_CAPTURES_DIR}/bolsonaro_tweets_outubro.csv')

In [None]:
def Data_Transformer(dataframe):
    """Replace the raw ``data`` column with a leading ``date`` datetime column, in place.

    The text before the first space of each ``data`` value is parsed with the
    ``%Y-%m-%d`` format; whatever follows the space is discarded. Values that
    contain no space at all are parsed as they are.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a string column named ``data``. It is modified in place:
        ``data`` is removed and ``date`` becomes the first column (an existing
        ``date`` column is replaced).

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If ``dataframe`` has no ``data`` column (for example when the function
        is run a second time on the same frame).
    ValueError
        If a date part does not match ``%Y-%m-%d``.
    """
    # Take the first piece with .str[0] rather than expand=True: the expanded
    # frame only has a second column when at least one value contains a space.
    date_part = dataframe['data'].str.split(" ", n=1).str[0]
    parsed = pd.to_datetime(date_part, format='%Y-%m-%d')

    # 'date' is listed too so that a pre-existing column is replaced, not duplicated.
    dataframe.drop(columns=['data', 'date'], errors='ignore', inplace=True)
    dataframe.insert(0, 'date', parsed)

In [None]:
# Convert the date column of every monthly capture, in place.
for monthly_capture in (
    lula_tweets_august, bolsonaro_tweets_august,        # August
    lula_tweets_september, bolsonaro_tweets_september,  # September
    lula_tweets_october, bolsonaro_tweets_october,      # October
):
    Data_Transformer(monthly_capture)

In [None]:
# Keep only the first row per username in every monthly capture (modifies the frames in place).
for monthly_capture in (
    lula_tweets_august, bolsonaro_tweets_august,        # August
    lula_tweets_september, bolsonaro_tweets_september,  # September
    lula_tweets_october, bolsonaro_tweets_october,      # October
):
    monthly_capture.drop_duplicates(subset=['username'], inplace=True)

In [None]:
# Write each de-duplicated capture to a CSV in the current working directory.
# NOTE(review): the next section reads files with these same names from a Drive folder
# ('Datasets sem duplicatas'); presumably they are copied there by hand - confirm.
deduplicated_exports = (
    # August
    (lula_tweets_august, 'lula_tweets_august.csv'),
    (bolsonaro_tweets_august, 'bolsonaro_tweets_august.csv'),
    # September
    (lula_tweets_september, 'lula_tweets_september.csv'),
    (bolsonaro_tweets_september, 'bolsonaro_tweets_september.csv'),
    # October
    (lula_tweets_october, 'lula_tweets_october.csv'),
    (bolsonaro_tweets_october, 'bolsonaro_tweets_october.csv'),
)
for capture, csv_name in deduplicated_exports:
    capture.to_csv(csv_name, encoding='utf-8', index=False)


### Datasets - automatic classification

In [None]:
# Folder with the de-duplicated monthly datasets; kept in one place so a moved folder needs one edit.
DEDUPLICATED_DIR = '/content/drive/MyDrive/Projeto Tera: Eleição/Datasets sem duplicatas'

# Lula
lula_august_auto = pd.read_csv(f'{DEDUPLICATED_DIR}/lula_tweets_august.csv')
lula_september_auto = pd.read_csv(f'{DEDUPLICATED_DIR}/lula_tweets_september.csv')
lula_october_auto = pd.read_csv(f'{DEDUPLICATED_DIR}/lula_tweets_october.csv')

# Bolsonaro
bolsonaro_august_auto = pd.read_csv(f'{DEDUPLICATED_DIR}/bolsonaro_tweets_august.csv')
bolsonaro_september_auto = pd.read_csv(f'{DEDUPLICATED_DIR}/bolsonaro_tweets_september.csv')
bolsonaro_october_auto = pd.read_csv(f'{DEDUPLICATED_DIR}/bolsonaro_tweets_october.csv')

In [None]:
# Automatic classifications (August and September combined)
AUTO_CLASSIFIED_DIR = '/content/drive/MyDrive/Projeto Tera: Eleição/Datasets - classificação automática'

# Lula
lula_auto = pd.read_csv(f'{AUTO_CLASSIFIED_DIR}/lula_auto.csv')

# Bolsonaro
bolsonaro_auto = pd.read_csv(f'{AUTO_CLASSIFIED_DIR}/bolsonaro_auto.csv')

### Datasets - manual classification

In [None]:
# Folder with the manually classified datasets.
MANUAL_CLASSIFIED_DIR = '/content/drive/MyDrive/Projeto Tera: Eleição/Datasets - classificação manual'

# Lula
lula_man = pd.read_csv(f'{MANUAL_CLASSIFIED_DIR}/classificacao_manual_lula.csv')

# Bolsonaro
bolsonaro_man = pd.read_csv(f'{MANUAL_CLASSIFIED_DIR}/bolsonaro_man.csv')

### Declaring functions

In [None]:
# Remove emojis
def remove_emoji(text):
    """Return ``text`` with every run of characters from the ranges below deleted.

    The last range runs from U+24C2 to U+1F251, so it also removes many
    non-emoji characters in that span (CJK ideographs, for instance), not
    only pictographs.
    """
    char_class = "".join((
        "[",
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols and pictographs
        u"\U0001F680-\U0001F6FF",  # transport and map symbols
        u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
        u"\U00002702-\U000027B0",
        u"\U000024C2-\U0001F251",
        "]+",
    ))
    return re.sub(char_class, r'', text, flags=re.UNICODE)


# Clean a tweet
def tweets_cleaner(tweet):
    """Normalise the raw text of one tweet for tokenisation.

    Steps, in order: drop URLs, drop the retweet marker ``RT``, lowercase,
    drop @mentions, turn ``#hashtag`` into ``hashtag``, drop ASCII punctuation
    and digits, then collapse whitespace and trim both ends.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned, lowercase text.
    """
    # URLs go first, while '://' is still intact; \S+ consumes the whole link.
    tweet = re.sub(r'https?://\S+', '', tweet, flags=re.IGNORECASE)
    # The retweet marker is upper-case, so it has to be matched before lowercasing.
    tweet = re.sub(r'\bRT\s+', '', tweet)
    tweet = tweet.lower()
    # The underscore is included so a handle such as @user_name is removed whole.
    tweet = re.sub(r'@[a-z0-9_]+', '', tweet)
    tweet = re.sub(r'#(\S+)', r'\1', tweet)  # keep the hashtag word, drop the '#'
    tweet = re.sub(r'https?', '', tweet)  # leftover scheme of a truncated link
    # string.punctuation already covers ':' and '-', so no separate passes are needed.
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    tweet = re.sub(r'[0-9]+', '', tweet)  # remove digits
    # Whitespace is normalised last because the removals above leave gaps behind.
    return re.sub(r'\s+', ' ', tweet).strip()

# Collapse repeated characters
def replace_duplicates(s):
    """Shorten every run of two or more identical characters to exactly two.

    ``re.DOTALL`` makes ``.`` match newlines as well, so repeated line breaks
    are shortened too.
    """
    return re.sub(r"(.)\1{1,}", r"\1\1", s, flags=re.DOTALL)


# Tokenise
def tokenizador(text):
    """Split ``text`` into its runs of word characters.

    Parameters
    ----------
    text : str
        Text to tokenise.

    Returns
    -------
    list of str
        The tokens in order of appearance; an empty list when ``text`` holds
        no word characters.
    """
    # findall(r'\w+') yields the same tokens as splitting on \W+, minus the empty
    # strings that a split produces when the text starts or ends with a separator.
    return re.findall(r'\w+', text)


# Remove stopwords
stopword = nltk.corpus.stopwords.words('portuguese')

def remove_stopwords(text):
    """Return the tokens of ``text`` that are not in the Portuguese ``stopword`` list.

    ``text`` is an iterable of tokens; order and duplicates are preserved.
    """
    kept_tokens = []
    for token in text:
        if token in stopword:
            continue
        kept_tokens.append(token)
    return kept_tokens


# Stemming
# The tweets are Portuguese (the stop-word list above is the Portuguese one), so use the
# Snowball Portuguese stemmer; the Porter algorithm only implements English suffix rules.
ps = nltk.stem.SnowballStemmer('portuguese')

def stemming(text):
    """Stem every token with the module-level stemmer ``ps``.

    Parameters
    ----------
    text : iterable of str
        Tokens to stem.

    Returns
    -------
    list of str
        One stem per input token, in the same order.
    """
    return [ps.stem(word) for word in text]


# Lemmatise
wn = nltk.WordNetLemmatizer()

def lemmatizador(text):
    """Run every token of ``text`` through the WordNet lemmatizer ``wn``.

    Returns a list with one result per token, in the same order.

    NOTE(review): the WordNet lemmatizer presumably targets English, so
    Portuguese tokens may come back unchanged - confirm.
    """
    return list(map(wn.lemmatize, text))


# Sentiment
def sentiment(compound):
    """Map a numeric score to a class label.

    Returns -1 for a negative score, 0 for a score equal to zero and 1 for
    anything else (which includes NaN, since it is neither negative nor zero).
    """
    if compound == 0:
        return 0
    return -1 if compound < 0 else 1


# Build the derived columns
def create_columns(df):
    """Add the preprocessing columns to ``df`` in place, starting from its ``tweet`` column.

    Each step reads the column written by the previous one:
    tweet -> tweet_no_emoji -> cleaned_tweet -> tweet_tokenize ->
    tweet_no_stopwords -> tweet_stemmize -> tweet_lemmatize.
    A ``hashtag`` column with the list of ``#...`` matches found in the
    original tweet is added as well. Nothing is returned.
    """
    # (target column, source column, function applied to each value)
    steps = (
        ('tweet_no_emoji', 'tweet', remove_emoji),
        ('cleaned_tweet', 'tweet_no_emoji', tweets_cleaner),
        ('cleaned_tweet', 'cleaned_tweet', replace_duplicates),
        ('tweet_tokenize', 'cleaned_tweet', tokenizador),
        ('tweet_no_stopwords', 'tweet_tokenize', remove_stopwords),
        ('tweet_stemmize', 'tweet_no_stopwords', stemming),
        ('tweet_lemmatize', 'tweet_stemmize', lemmatizador),
    )
    for target, source, transform in steps:
        df[target] = df[source].apply(transform)

    # Hashtags come from the untouched tweet: each match runs from '#' to the next whitespace.
    df['hashtag'] = df['tweet'].str.findall(r'#.*?(?=\s|$)')

In [None]:
# Remove the two 'Unnamed' columns from the manual Lula dataset, in place.
lula_man.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], inplace=True)

In [None]:
# Give the manual label column the name 'sentiment_manual'.
lula_man = lula_man.rename(columns={'Classificação Manual': 'sentiment_manual'})

In [None]:
# Add the preprocessing columns to the manual Lula dataset (modifies lula_man in place).
create_columns(lula_man)

In [None]:
# Add the preprocessing columns to the manual Bolsonaro dataset (modifies bolsonaro_man in place).
create_columns(bolsonaro_man)

In [None]:
# Flatten the list-valued columns written by create_columns into plain text.
# - Cast to str before replacing: a regex replace only substitutes inside string values,
#   so running it on the raw lists did nothing and left one bracket in the result.
# - The stop-word-free tokens are stored in 'tweet_no_stopwords' (the name create_columns
#   writes); no cell in this notebook creates a 'tweet_sem_stopwords' column.
lula_man['hashtag'] = lula_man['hashtag'].astype(str).str.replace(r"[\[\]'#]", '', regex=True)
lula_man['tweet_no_stopwords'] = lula_man['tweet_no_stopwords'].astype(str).str.replace(r"[\[\]']", '', regex=True)

In [None]:
# Flatten the list-valued columns written by create_columns into plain text.
# - Cast to str before replacing: a regex replace only substitutes inside string values,
#   so running it on the raw lists did nothing and left one bracket in the result.
# - The stop-word-free tokens are stored in 'tweet_no_stopwords' (the name create_columns
#   writes); no cell in this notebook creates a 'tweet_sem_stopwords' column.
bolsonaro_man['hashtag'] = bolsonaro_man['hashtag'].astype(str).str.replace(r"[\[\]'#]", '', regex=True)
bolsonaro_man['tweet_no_stopwords'] = bolsonaro_man['tweet_no_stopwords'].astype(str).str.replace(r"[\[\]']", '', regex=True)

In [None]:
# Add the preprocessing columns to every monthly dataset (each frame is modified in place).
for auto_frame in (
    lula_august_auto, lula_september_auto, lula_october_auto,                 # Lula
    bolsonaro_august_auto, bolsonaro_september_auto, bolsonaro_october_auto,  # Bolsonaro
):
    create_columns(auto_frame)

In [None]:
# Turn each dataset's 'polarity' score into a -1 / 0 / 1 'sentiment' label.
for auto_frame in (
    lula_august_auto, lula_september_auto, lula_october_auto,                 # Lula
    bolsonaro_august_auto, bolsonaro_september_auto, bolsonaro_october_auto,  # Bolsonaro
):
    auto_frame['sentiment'] = auto_frame['polarity'].apply(sentiment)

## Improving automatic classification with VADER

In [None]:
# Score every Lula tweet with VADER. The scores are computed here so the cell runs on a
# fresh kernel; previously 'results_vader_lula' was read before anything assigned it.
analyzer = SentimentIntensityAnalyzer()
results_vader_lula = pd.DataFrame(
    [analyzer.polarity_scores(tweet) for tweet in lula_auto['tweet']]
)

# Copy the compound score into the dataset (positionally, one row per tweet) and classify it.
lula_auto['compound'] = results_vader_lula['compound'].to_numpy()
lula_auto['sentiment_vader'] = lula_auto['compound'].apply(sentiment)

In [None]:
# Score every Bolsonaro tweet with VADER. Building the frame from a single list avoids the
# earlier mismatch between the list that was created and the (undefined) one appended to.
analyzer = SentimentIntensityAnalyzer()
results_vader_bolsonaro = pd.DataFrame(
    [analyzer.polarity_scores(tweet) for tweet in bolsonaro_auto['tweet']]
)

# Copy the compound score into the dataset (positionally, one row per tweet) and classify it.
bolsonaro_auto['compound'] = results_vader_bolsonaro['compound'].to_numpy()
bolsonaro_auto['sentiment_vader'] = bolsonaro_auto['compound'].apply(sentiment)

### Data Cleaning

In [None]:
# Flatten the list-valued columns into plain text. The cast to str comes first because a
# regex replace only touches string values; the token column is 'tweet_no_stopwords', the
# name create_columns writes ('tweet_sem_stopwords' is not created in this notebook).
lula_august_auto['hashtag'] = lula_august_auto['hashtag'].astype(str).str.replace(r"[\[\]'#]", '', regex=True)
lula_august_auto['tweet_no_stopwords'] = lula_august_auto['tweet_no_stopwords'].astype(str).str.replace(r"[\[\]']", '', regex=True)

In [None]:
# Flatten the list-valued columns into plain text. The cast to str comes first because a
# regex replace only touches string values; the token column is 'tweet_no_stopwords', the
# name create_columns writes ('tweet_sem_stopwords' is not created in this notebook).
lula_september_auto['hashtag'] = lula_september_auto['hashtag'].astype(str).str.replace(r"[\[\]'#]", '', regex=True)
lula_september_auto['tweet_no_stopwords'] = lula_september_auto['tweet_no_stopwords'].astype(str).str.replace(r"[\[\]']", '', regex=True)

In [None]:
# Flatten the list-valued columns into plain text. The cast to str comes first because a
# regex replace only touches string values; the token column is 'tweet_no_stopwords', the
# name create_columns writes ('tweet_sem_stopwords' is not created in this notebook).
lula_october_auto['hashtag'] = lula_october_auto['hashtag'].astype(str).str.replace(r"[\[\]'#]", '', regex=True)
lula_october_auto['tweet_no_stopwords'] = lula_october_auto['tweet_no_stopwords'].astype(str).str.replace(r"[\[\]']", '', regex=True)

In [None]:
# Flatten the list-valued columns into plain text. The cast to str comes first because a
# regex replace only touches string values; the token column is 'tweet_no_stopwords', the
# name create_columns writes ('tweet_sem_stopwords' is not created in this notebook).
bolsonaro_august_auto['hashtag'] = bolsonaro_august_auto['hashtag'].astype(str).str.replace(r"[\[\]'#]", '', regex=True)
bolsonaro_august_auto['tweet_no_stopwords'] = bolsonaro_august_auto['tweet_no_stopwords'].astype(str).str.replace(r"[\[\]']", '', regex=True)

In [None]:
# Flatten the list-valued columns into plain text. The cast to str comes first because a
# regex replace only touches string values; the token column is 'tweet_no_stopwords', the
# name create_columns writes ('tweet_sem_stopwords' is not created in this notebook).
bolsonaro_september_auto['hashtag'] = bolsonaro_september_auto['hashtag'].astype(str).str.replace(r"[\[\]'#]", '', regex=True)
bolsonaro_september_auto['tweet_no_stopwords'] = bolsonaro_september_auto['tweet_no_stopwords'].astype(str).str.replace(r"[\[\]']", '', regex=True)

In [None]:
# Flatten the list-valued columns into plain text. The cast to str comes first because a
# regex replace only touches string values; the token column is 'tweet_no_stopwords', the
# name create_columns writes ('tweet_sem_stopwords' is not created in this notebook).
bolsonaro_october_auto['hashtag'] = bolsonaro_october_auto['hashtag'].astype(str).str.replace(r"[\[\]'#]", '', regex=True)
bolsonaro_october_auto['tweet_no_stopwords'] = bolsonaro_october_auto['tweet_no_stopwords'].astype(str).str.replace(r"[\[\]']", '', regex=True)

### Exporting to CSV

In [None]:
# Write every processed monthly dataset to a CSV in the current working directory.
processed_exports = (
    # August
    (lula_august_auto, 'lula_august_auto.csv'),
    (bolsonaro_august_auto, 'bolsonaro_august_auto.csv'),
    # September
    (lula_september_auto, 'lula_september_auto.csv'),
    (bolsonaro_september_auto, 'bolsonaro_september_auto.csv'),
    # October
    (lula_october_auto, 'lula_october_auto.csv'),
    (bolsonaro_october_auto, 'bolsonaro_october_auto.csv'),
)
for processed_frame, csv_name in processed_exports:
    processed_frame.to_csv(csv_name, encoding='utf-8', index=False)

In [None]:
# Write the processed manual Lula dataset to the working directory, without the index column.
lula_man.to_csv('lula_man.csv', encoding = 'utf-8', index = False)

In [None]:
# Write the processed manual Bolsonaro dataset to the working directory, without the index column.
bolsonaro_man.to_csv('bolsonaro_man.csv', encoding = 'utf-8', index = False)