In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import sys
sys.path.append('../scripts/')
from lda_preprocessing import mark_peaks, retrieve_peak_dates, nlp_pipeline
from twitter import filter_tweets, get_text, get_hashtags
import re
from tqdm.notebook import tqdm
import glob

In [2]:
# load the hashtag counts and the peak table from disk
hashtag_counts_path = '../../data/BTW17_Twitter/hashtags/hashtag_counts.json'
peaks_path = '../../data/BTW17_Twitter/peaks/peaks.json'
hashtag_df = pd.read_json(hashtag_counts_path)
peak_df = pd.read_json(peaks_path)

In [3]:
# combine hashtag counts with the peak table; `df` is reused by later cells
df = mark_peaks(hashtag_df, peak_df)

# summary statistics for every column, one row per column
peak_summary = df.describe(include='all')
peak_summary.transpose()

  0%|          | 0/30596 [00:00<?, ?it/s]

Unnamed: 0,count,unique,top,freq,first,last,mean,std,min,25%,50%,75%,max
index,99170.0,,,,NaT,NaT,49584.5,28628.057435,0.0,24792.25,49584.5,74376.75,99169.0
date,99170.0,120.0,2017-09-24 00:00:00,1386.0,2017-05-29,2017-09-25,,,,,,,
hashtag,99170.0,30596.0,berlin,120.0,NaT,NaT,,,,,,,
count,99170.0,,,,NaT,NaT,10.305304,84.493636,1.0,1.0,1.0,4.0,9412.0
peak,99170.0,,,,NaT,NaT,0.009307,0.096025,0.0,0.0,0.0,0.0,1.0


In [4]:
# collect the dates per hashtag, then drop rows whose `lda_dates` entry has length 0
peak_dates_all = retrieve_peak_dates(hashtag_df, df)
has_lda_dates = peak_dates_all['lda_dates'].str.len().ne(0)
# reset_index() without drop=True keeps the previous index as an `index` column
dates_df = peak_dates_all[has_lda_dates].reset_index()
dates_df.describe(include='all').transpose()

  0%|          | 0/30596 [00:00<?, ?it/s]

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
index,162.0,,,,1244.006173,2017.581307,9.0,237.25,492.0,1570.75,15246.0
hashtag,162.0,162.0,hütter,1.0,,,,,,,
lda_dates,162.0,38.0,"[2017-09-20, 2017-09-21, 2017-09-22, 2017-09-2...",22.0,,,,,,,


In [5]:
# persist the filtered peak dates as JSON
peak_dates_path = '../../data/BTW17_Twitter/peaks/peak_dates.json'
dates_df.to_json(peak_dates_path)

In [6]:
# set to *.json to load all
path_to_json = '../../data/BTW17_Twitter/tweets/*.json'
# glob.glob returns matches in arbitrary, filesystem-dependent order; sort the
# paths so the files are always handed to filter_tweets in the same sequence
# and the resulting frame is reproducible across runs and machines
file_list = sorted(glob.glob(path_to_json))
lda_tweets = filter_tweets(file_list, dates_df)

  0%|          | 0/1308 [00:00<?, ?it/s]

In [7]:
# persist the filtered tweets as JSON
lda_tweets_path = '../../data/BTW17_Twitter/lda/lda_tweets.json'
lda_tweets.to_json(lda_tweets_path)

In [8]:
# get full tweets text
# Replace missing values with 0 and assign the result back to the frame.
# Calling fillna(inplace=True) on a column selection is chained assignment:
# with pandas copy-on-write it only changes a temporary Series and leaves
# lda_tweets untouched (and the warning is silenced at the top of the notebook).
lda_tweets['retweeted_status'] = lda_tweets['retweeted_status'].fillna(0)
lda_tweets['extended_tweet'] = lda_tweets['extended_tweet'].fillna(0)
lda_tweets['full_text'] = lda_tweets.apply(
    lambda row: get_text(row['extended_tweet'], row['retweeted_status'], row['text']),
    axis=1,
)

# remove mentions first, then digits; patterns are compiled once as raw strings
mention_pattern = re.compile(r'@[A-Za-z0-9_]+')
digit_pattern = re.compile(r'[0-9]')
lda_tweets['full_text_processed'] = lda_tweets['full_text'].apply(
    lambda text: digit_pattern.sub('', mention_pattern.sub('', text))
)

In [10]:
# run every cleaned tweet through nlp_pipeline, with a progress bar
tweet_list = lda_tweets['full_text_processed'].tolist()
tokens = [nlp_pipeline(tweet_text) for tweet_text in tqdm(tweet_list)]

  0%|          | 0/62986 [00:00<?, ?it/s]

In [11]:
# attach the token lists as a new column and persist the result as JSON
preprocessed_path = '../../data/BTW17_Twitter/lda/preprocessed_lda_tweets.json'
lda_tweets['tokens'] = tokens
lda_tweets.to_json(preprocessed_path)

In [12]:
# spot-check preprocessing: show one tweet before and after tokenization
sample_idx = 100
for heading, column in (
    ('original document: ', 'full_text'),
    ('\ntokenized and lemmatized document: ', 'tokens'),
):
    print(heading)
    print(lda_tweets[column][sample_idx])

original document: 
"Wir werden der soziale Oppositionsführer bleiben. Und wir werden den Regierenden Beine machen." @SWagenknecht #btw17

tokenized and lemmatized document: 
['soziale', 'oppositionsführer', 'bleiben', 'regierende', 'bein', 'btw']
