In [None]:
%%capture
!python -m spacy download pt_core_news_lg
import spacy

In [None]:
import numpy as np
import pandas as pd
from datetime import datetime
import re
from unidecode import unidecode

In [None]:
def dateparser(x):
    """Parse a timestamp string shaped like '2021-03-01T12:30:00.000Z' into a datetime."""
    return datetime.strptime(x, '%Y-%m-%dT%H:%M:%S.000Z')


filename = '../input/twitter-ipca/data_query3.csv'

# NOTE(review): `dateparser` is defined above but is not handed to read_csv;
# the 'created_at' column is parsed by pandas itself via `parse_dates`.
read_options = {
    'index_col': 0,                 # first CSV column becomes the index
    'lineterminator': '\n',         # only LF ends a record
    'parse_dates': ['created_at'],
}
data = pd.read_csv(filename, **read_options)

In [None]:
# Drop tweets whose lower-cased text matches 'n.o tem pre.o'. Each '.' matches
# any single character, so accented and unaccented spellings are both caught.
# Rows with missing text are dropped as well: str.contains() yields NaN for
# them, which cannot be negated with `~`, and the later preprocessing step
# needs a real string anyway.
has_text = data['text'].notna()
matches_pattern = data['text'].str.lower().str.contains(r'n.o tem pre.o', na=False)
# .copy() makes the result an independent frame, so adding columns to it in a
# later cell does not raise pandas' SettingWithCopyWarning.
data = data[has_text & ~matches_pattern].copy()

In [None]:
# Extra stop words removed by text_preprocess *after* lemmatization and ASCII
# normalization, on top of spaCy's own stop-word flag. They look like chat
# abbreviations and very common verb lemmas (presumably chosen by inspecting
# the corpus - confirm before editing).
stop_words = [
    'pra', 'pro', 'ta', 'q', 'd', 'p', 'c', 'n', 'ne', 'vc', 'tb', 'ai', 'so',
    'pq', 'qdo',
    'ser', 'ver', 'ter', 'vir', 'ir', 'ficar', 'haver', 'estar',
]

# spaCy pipeline used by text_preprocess (model downloaded in the first cell).
nlp = spacy.load('pt_core_news_lg')

def text_preprocess(text):
    """Clean one raw tweet and return its lemmas as a space-separated string.

    Steps:
      1. Blank out URLs, @usernames, standalone ``RT`` retweet markers and
         line feeds.
      2. Run the module-level spaCy pipeline ``nlp`` and keep the lemma of
         every token that is alphabetic and is not flagged as a stop word,
         punctuation or whitespace.
      3. Lower-case, transliterate with ``unidecode`` and drop the words
         listed in the module-level ``stop_words``.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        Normalized words joined by single spaces; ``''`` if nothing survives.
    """
    # Filters
    # URLs are removed first so that an '@' inside a link is not treated as a
    # mention. \S+ consumes the whole link, including '-', '?', '=' and '%'.
    text = re.sub(r'https?://\S+', ' ', text)  # urls
    # Only handle characters are consumed. The previous class contained the
    # range '$-_' (every character from '$' to '_'), which also swallowed
    # punctuation and any word glued to the mention.
    text = re.sub(r'@[A-Za-z0-9_]+', ' ', text)  # usernames
    # Word boundaries keep words that merely contain "RT" (e.g. "PORTO") intact.
    text = re.sub(r'\bRT\b', ' ', text)  # retweet marks
    text = text.replace('\n', ' ')  # line-feed marks

    # Spacy pipeline and filters
    doc = nlp(text)
    lemmas = [
        token.lemma_
        for token in doc
        if token.is_alpha
        and not (token.is_stop or token.is_punct or token.is_space)
    ]

    # Join, normalize, and remove additional stop words
    normalized = unidecode(' '.join(lemmas).lower())
    # split() without an argument discards empty words, which appear when a
    # lemma or its transliteration contains extra spaces.
    return ' '.join(word for word in normalized.split() if word not in stop_words)

In [None]:
# Run the cleaning function over every tweet and store the result next to the
# raw text.
processed_column = data['text'].apply(text_preprocess)
data['processed_text'] = processed_column

# Renumber the rows 0..n-1, discarding the previous index.
dataset = data.reset_index(drop=True)

In [None]:
# Save the processed tweets; the row index is written as the first column.
processed_csv_path = 'data_query3_processed.csv'
dataset.to_csv(processed_csv_path)

In [None]:
# Simple bigram maker: one (source, target, time) row for every pair of
# adjacent words in each processed tweet, stamped with the tweet's creation
# time in ISO-8601 form.
edge_rows = []
for processed, created in zip(dataset['processed_text'], dataset['created_at']):
    # split() without an argument ignores repeated spaces, so no edge ever has
    # an empty string as a node; tweets with fewer than two words add nothing.
    words = processed.split()
    stamp = created.isoformat()
    # Pair each word with its successor.
    edge_rows.extend([first, second, stamp] for first, second in zip(words, words[1:]))

edges = pd.DataFrame(edge_rows, columns=['source', 'target', 'time'])

In [None]:
# Write the bigram edge list as CSV, without the row index.
edges_path = 'query3.edges'
edges.to_csv(edges_path, index=False)