# Ideias
Procurar palavras que aparecem com mais frequência junto com outras (e.g., Bolsonaro, Lula, presidente, etc.)

In [None]:
from nltk.corpus import stopwords
from tqdm import tqdm

import numpy as np
import pandas as pd
import warnings
import spacy

# Silence every warning for the rest of the session (keeps the notebook output clean).
warnings.filterwarnings("ignore")

# Portuguese spaCy pipeline used for tokenization and lemmatization.
nlp = spacy.load("pt_core_news_lg")

# NLTK's Portuguese stopword list, used to drop stopwords during lemmatization.
pt_br_stopwords = stopwords.words("portuguese")

In [None]:
# Filler words (spoken-language noise and very generic lemmas) that are stripped
# from the lemmatized text later in the notebook.
fillers = [
    'tá', 'ir', 'né', 'eis', 'aí', 'ai',
    'lá', 'é', 'ah', 'aqui', 'então', 'dar',
    'assim', 'ficar', 'ter', 'poxa', 'gente', 'blá',
    'olhar', 'tipo', 'nado', 'bom', 'ó',
    'ali', 'to', 'tô',
]

In [None]:
# Read the cleaned, transcribed dataset and show its dimensions plus a preview.
dataset_path = '../data/processed/cleaned_transcripted_dataset.csv'
data = pd.read_csv(dataset_path)
print(data.shape)
data.head()

# Preprocessing

In [None]:
# Build one lowercase text per row from the description, the transcription and
# the on-video stickers, separated by single spaces.
# Every column is NaN-filled before concatenation: a single NaN operand would
# otherwise turn the whole concatenated value into NaN, discarding the other
# columns' text and handing a non-string value to the lemmatization step.
data['full_text'] = (
    data['video_desc'].fillna('')
    + ' ' + data['transcription'].fillna('')
    + ' ' + data['stickers_on_video'].fillna('')
).str.lower()

# Preview one row to sanity-check the new column.
data.iloc[1]

# Lemmatization

In [None]:
# Lemmatize every document with spaCy, dropping punctuation tokens and tokens
# whose surface text is a stopword, then store the space-joined lemmas.

# Build the lookup set once: testing membership against the stopword list
# would rescan the whole list for every single token.
stopword_set = set(pt_br_stopwords)

# NOTE(review): several pipeline components are disabled here for speed;
# confirm that the lemmatizer shipped with the installed pt_core_news_lg
# version does not depend on any of them.
doc_stream = nlp.pipe(
    data['full_text'],
    disable=["tok2vec", "tagger", "parser", "attribute_ruler"],
    n_process=-1,
)

lemmas = [
    ' '.join(
        token.lemma_
        for token in doc
        if not token.is_punct and token.text not in stopword_set
    )
    for doc in tqdm(doc_stream, total=len(data['full_text']))
]

data['full_text_lemmas'] = lemmas

# Removing fillers

In [None]:
# Drop filler words from the lemmatized text.
# Tokens are filtered one by one instead of replacing ' filler ' substrings,
# because the substring approach misses fillers at the very start or end of a
# text and leaves every other occurrence in a run such as 'blá blá blá' (the
# shared space is consumed by the previous match).
# Splitting and re-joining on a single space keeps all other text unchanged.
filler_set = frozenset(fillers)
data['full_text_lemmas'] = data['full_text_lemmas'].map(
    lambda text: ' '.join(tok for tok in text.split(' ') if tok not in filler_set)
    if isinstance(text, str)
    else text  # leave missing values untouched
)

# Saving dataset

In [None]:
# Persist the preprocessed dataset (without the index column).
output_path = '../data/processed/preprocessed_lemma_dataset.csv'
data.to_csv(output_path, index=False)