In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import sys
sys.path.append('../scripts/')
from lda_preprocessing import mark_peaks, retrieve_peak_dates, nlp_pipeline
from twitter import filter_tweets, get_text, get_hashtags
import re
from tqdm.notebook import tqdm
import glob

In [2]:
# Load the two JSON inputs for this notebook: the hashtag count table and the
# peak table (both paths are relative to the notebook's directory).
hashtag_df = pd.read_json(path_or_buf='../../data/BTW17_Twitter/hashtags/hashtag_counts.json')
peak_df = pd.read_json(path_or_buf='../../data/BTW17_Twitter/peaks/peaks.json')

In [3]:
# Combine the hashtag counts with the peak table via the project helper
# (presumably this is what adds the `peak` column shown below - confirm in
# lda_preprocessing.mark_peaks).
df = mark_peaks(hashtag_df, peak_df)

# Summary statistics for every column, transposed so each column is one row.
df.describe(include='all').transpose()

  0%|          | 0/30596 [00:00<?, ?it/s]

Unnamed: 0,count,unique,top,freq,first,last,mean,std,min,25%,50%,75%,max
index,99170.0,,,,NaT,NaT,49584.5,28628.057435,0.0,24792.25,49584.5,74376.75,99169.0
date,99170.0,120.0,2017-09-24 00:00:00,1386.0,2017-05-29,2017-09-25,,,,,,,
hashtag,99170.0,30596.0,berlin,120.0,NaT,NaT,,,,,,,
count,99170.0,,,,NaT,NaT,10.305304,84.493636,1.0,1.0,1.0,4.0,9412.0
peak,99170.0,,,,NaT,NaT,0.009307,0.096025,0.0,0.0,0.0,0.0,1.0


In [4]:
# Collect the LDA dates per hashtag, drop hashtags whose `lda_dates` entry is
# empty (length 0), and renumber the rows. reset_index() keeps the previous
# index as an `index` column, exactly as before.
dates_df = (
    retrieve_peak_dates(hashtag_df, df)
    .loc[lambda frame: frame['lda_dates'].str.len().ne(0)]
    .reset_index()
)

# Summary statistics for every column, one row per column.
dates_df.describe(include='all').transpose()

  0%|          | 0/30596 [00:00<?, ?it/s]

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
index,162.0,,,,1244.006173,2017.581307,9.0,237.25,492.0,1570.75,15246.0
hashtag,162.0,162.0,antifa,1.0,,,,,,,
lda_dates,162.0,162.0,"[2017-06-05, 2017-06-06, 2017-06-07, 2017-06-0...",1.0,,,,,,,


In [5]:
# Persist the per-hashtag peak dates as JSON so they can be reloaded without
# recomputing them.
dates_df.to_json(path_or_buf='../../data/BTW17_Twitter/peaks/peak_dates.json')

In [11]:
# Enable IPython's autoreload extension; mode 2 re-imports edited modules
# before each cell executes, so changes to the helper scripts are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2
# NOTE(review): filter_tweets is already imported in the first cell; this
# repeat import was presumably added while editing the twitter module -
# confirm whether it is still needed.
from twitter import filter_tweets

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
# Glob pattern selecting the raw tweet files; `*.json` loads every file in the
# tweets directory.
path_to_json = '../../data/BTW17_Twitter/tweets/*.json'

# glob.glob returns matches in arbitrary, filesystem-dependent order. Sort the
# paths so the files are always handed to filter_tweets in the same sequence
# and re-runs on another machine are reproducible (the row order of the result
# presumably follows the file order - confirm in twitter.filter_tweets).
file_list = sorted(glob.glob(path_to_json))

# Keep only the tweets relevant for the hashtags / dates listed in dates_df.
lda_tweets = filter_tweets(file_list, dates_df)

  0%|          | 0/1308 [00:00<?, ?it/s]

In [18]:
# Persist the filtered tweets as JSON, then preview the first three rows.
lda_tweets.to_json(path_or_buf='../../data/BTW17_Twitter/lda/lda_tweets.json')
lda_tweets.head(n=3)

Unnamed: 0,created_at,text,user,extended_tweet,retweeted_status,tags
0,2017-08-02,RT @AfD_Bund: 6️⃣5️⃣.0️⃣0️⃣0️⃣ Follower auf #T...,"{'id': 738771661915344897, 'id_str': '73877166...",,{'created_at': 'Wed Aug 02 21:28:47 +0000 2017...,btw17
1,2017-08-02,RT @FraukePetry: #OSZE nimmt Sorge der #AfD er...,"{'id': 855324133, 'id_str': '855324133', 'name...",,{'created_at': 'Wed Aug 02 11:37:33 +0000 2017...,btw17
2,2017-08-02,RT @AfD_Bund: 6️⃣5️⃣.0️⃣0️⃣0️⃣ Follower auf #T...,"{'id': 1305881724, 'id_str': '1305881724', 'na...",,{'created_at': 'Wed Aug 02 21:28:47 +0000 2017...,btw17


In [20]:
# Show the rows whose `extended_tweet` field is an empty string.
lda_tweets.loc[lda_tweets['extended_tweet'].eq('')]

Unnamed: 0,created_at,text,user,extended_tweet,retweeted_status,tags
25438,2017-05-29,RT @GrueneBundestag: Die #Mietpreisbremse brem...,"{'id': 236737207, 'id_str': '236737207', 'name...",,{'created_at': 'Mon May 29 08:07:05 +0000 2017...,mietpreisbremse
91140,2017-07-16,RT @Eckleben: #Rentner in #Berlin glücklich + ...,"{'id': 3610063162, 'id_str': '3610063162', 'na...",,{'created_at': 'Sat Jul 15 17:12:15 +0000 2017...,berlin
117607,2017-06-17,"RT @Joerg_Meuthen: Die sogenannten ""#Grünen“ s...","{'id': 2817555377, 'id_str': '2817555377', 'na...",,{'created_at': 'Sun Jun 11 12:45:53 +0000 2017...,grünen
117608,2017-06-17,RT @AfD_Bund: Dr. Alexander #Gauland:\n»Richti...,"{'id': 867849125009645568, 'id_str': '86784912...",,{'created_at': 'Thu Jun 15 15:39:07 +0000 2017...,gauland
117609,2017-06-17,RT @AfD_Bund: Das #BAMF (Bundesamt für Migrati...,"{'id': 749574693636730880, 'id_str': '74957469...",,{'created_at': 'Fri Jun 16 12:27:23 +0000 2017...,heimat


In [21]:
# get full tweets text
# Missing values and empty strings in the two nested tweet columns are replaced
# by 0 before they are passed to get_text (0 is presumably the "not present"
# marker that helper expects - confirm in twitter.get_text).
#
# The results are assigned back to the frame instead of calling
# fillna/replace with inplace=True on `lda_tweets[col]`: that chained in-place
# pattern operates on a temporary Series under pandas Copy-on-Write and would
# silently leave lda_tweets unchanged (the warning about it is hidden by the
# global warnings filter in the first cell).
lda_tweets['retweeted_status'] = lda_tweets['retweeted_status'].fillna(0)
lda_tweets['extended_tweet'] = (
    lda_tweets['extended_tweet']
    .replace(to_replace='', value=0)
    .fillna(0)
)

# Build one text per tweet from the extended tweet, the retweeted status and
# the plain text field (the selection logic lives in get_text).
lda_tweets['full_text'] = lda_tweets.apply(
    lambda row: get_text(row['extended_tweet'], row['retweeted_status'], row['text']),
    axis=1,
)

# remove mentions (an '@' followed by ASCII letters, digits or underscores)
lda_tweets['full_text_processed'] = lda_tweets['full_text'].apply(
    lambda text: re.sub(r'@[A-Za-z0-9_]+', '', text)
)

# remove numbers (every ASCII digit is dropped individually)
lda_tweets['full_text_processed'] = lda_tweets['full_text_processed'].apply(
    lambda text: re.sub(r'[0-9]', '', text)
)

In [36]:
# preprocess tokens
# Pull the cleaned texts into a plain list once, then run each one through the
# project's NLP pipeline; tqdm shows a progress bar over the whole list.
tweet_list = lda_tweets['full_text_processed'].tolist()
tokens = [nlp_pipeline(tweet_text) for tweet_text in tqdm(tweet_list)]

  0%|          | 0/284895 [00:00<?, ?it/s]

In [37]:
# Attach the token lists produced by the preprocessing loop as a new column
# (one entry per tweet, in row order).
lda_tweets['tokens'] = tokens

# Write the fully preprocessed tweets to disk and preview the first three rows.
lda_tweets.to_json(path_or_buf='../../data/BTW17_Twitter/lda/preprocessed_lda_tweets.json')
lda_tweets.head(n=3)

Unnamed: 0,created_at,text,user,extended_tweet,retweeted_status,tags,full_text,full_text_processed,tokens
0,2017-08-02,RT @AfD_Bund: 6️⃣5️⃣.0️⃣0️⃣0️⃣ Follower auf #T...,"{'id': 738771661915344897, 'id_str': '73877166...",0,{'created_at': 'Wed Aug 02 21:28:47 +0000 2017...,btw17,6️⃣5️⃣.0️⃣0️⃣0️⃣ Follower auf #Twitter!\nDie #...,️⃣️⃣.️⃣️⃣️⃣ Follower auf #Twitter!\nDie #AfD-P...,"[follower, twitter, finden, immer, mehr, anklang]"
1,2017-08-02,RT @FraukePetry: #OSZE nimmt Sorge der #AfD er...,"{'id': 855324133, 'id_str': '855324133', 'name...",0,{'created_at': 'Wed Aug 02 11:37:33 +0000 2017...,btw17,#OSZE nimmt Sorge der #AfD ernst und schickt T...,#OSZE nimmt Sorge der #AfD ernst und schickt T...,"[osze, nehmen, sorge, afd, ernst, schicken, te..."
2,2017-08-02,RT @AfD_Bund: 6️⃣5️⃣.0️⃣0️⃣0️⃣ Follower auf #T...,"{'id': 1305881724, 'id_str': '1305881724', 'na...",0,{'created_at': 'Wed Aug 02 21:28:47 +0000 2017...,btw17,6️⃣5️⃣.0️⃣0️⃣0️⃣ Follower auf #Twitter!\nDie #...,️⃣️⃣.️⃣️⃣️⃣ Follower auf #Twitter!\nDie #AfD-P...,"[follower, twitter, finden, immer, mehr, anklang]"


In [53]:
# Spot-check the preprocessing on one example (index label 120): print the
# reconstructed tweet text next to the tokens derived from it.
print('original document: ')
print(lda_tweets.loc[120, 'full_text'])
print('\ntokenized and lemmatized document: ')
print(lda_tweets.loc[120, 'tokens'])

original document: 
Presseerklärung zu Aussagen von Martin Schulz (#SPD) 

#afd #poggenburg #schulz

https://t.co/WMZxoPKicq https://t.co/ibaYniIh59

tokenized and lemmatized document: 
['presseerklärung', 'aussage', 'martin', 'schulz', 'spd', 'afd', 'poggenburg', 'schulz']
