In [1]:
# necessary imports
import os, json
import pandas as pd
import numpy as np
import glob
from tqdm.notebook import tqdm
pd.set_option('display.max_columns', None)
import sys
sys.path.append('../scripts/')
from twitter import get_hashtags, get_text
from lda_preprocessing import nlp_pipeline
import ast
import re

In [3]:
# Load the per-hashtag LDA dates and keep only the hashtags whose
# 'lda_dates' entry is not the empty-list string '[]'.
dates_df = pd.read_csv('../../data/BTW17_Twitter/peaks/hashtags_dates.csv')
has_dates = dates_df['lda_dates'] != '[]'
dates_df = (
    dates_df[has_dates]
    .reset_index(drop=True)          # renumber rows 0..n-1 without keeping the old index
    .drop(columns=['Unnamed: 0'])    # leftover index column written by an earlier to_csv
)
dates_df.head()

Unnamed: 0,hashtag,lda_dates
0,afghanistan,"['2017-08-22', '2017-08-23', '2017-08-24', '20..."
1,armut,"['2017-08-27', '2017-08-28', '2017-08-29', '20..."
2,bayern,"['2017-09-16', '2017-09-17', '2017-09-18', '20..."
3,berlin,"['2017-09-21', '2017-09-22', '2017-09-23', '20..."
4,bildung,"['2017-09-15', '2017-09-16', '2017-09-17', '20..."


In [4]:
# Collect, for every tweet file and every hashtag in dates_df, the tweets that
# carry that hashtag AND were created on one of the hashtag's LDA dates.
# Each match appends one list per column to the *_list accumulators below;
# those lists of lists are flattened into a DataFrame in a later cell.

# set to *.json to load all
path_to_json = '../../data/BTW17_Twitter/tweets/*.json'
# glob returns paths in arbitrary (OS-dependent) order; sorting makes the
# row order of the collected tweets reproducible between runs.
file_list = sorted(glob.glob(path_to_json))
hashtag_list = dates_df['hashtag'].tolist()
lda_dates_list = dates_df['lda_dates'].tolist()
# The date lists are stored as strings ("['2017-08-22', ...]"). Parse each one
# exactly once here instead of once per file inside the loop, and keep them as
# sets for fast membership tests.
lda_date_sets = [set(ast.literal_eval(dates)) for dates in lda_dates_list]

# pd.read_json only creates a column for keys that occur in the file, so these
# two are absent when a file holds no extended tweet / no retweet.
optional_columns = ['extended_tweet', 'retweeted_status']

created_at_list = []
id_list = []
text_list = []
user_list = []
extended_tweet_list = []
retweeted_status_list = []
tags_list = []

# iterate through files and accumulate the matching tweets
for file in tqdm(file_list):
    # read explicitly as UTF-8 so decoding does not depend on the platform default
    with open(file, 'r', encoding='utf-8') as f:
        df = pd.read_json(f)

    # nothing to extract from a file without tweets
    if df.empty:
        continue

    # make sure the optional columns exist so the selections below cannot raise KeyError
    for column in optional_columns:
        if column not in df.columns:
            df[column] = np.nan

    # get_hashtags is the project helper from ../scripts/twitter
    # (presumably returns the list of hashtags of one tweet - confirm there)
    df['tags'] = df['entities'].apply(get_hashtags)

    # one row per (tweet, hashtag); tweets without any hashtag are dropped.
    # .copy() so the column assignments below operate on an independent frame.
    df = df.explode('tags')
    df = df[df['tags'].notna()].copy()

    # reduce the timestamp to a 'YYYY-MM-DD' string and lower-case the hashtags
    # so both can be compared with the values from dates_df
    df['created_at'] = pd.to_datetime(df['created_at']).dt.date.astype(str)
    df['tags'] = df['tags'].astype(str).str.lower()

    for hashtag, lda_dates in zip(hashtag_list, lda_date_sets):
        temp_df = df[(df['tags'] == hashtag) & (df['created_at'].isin(lda_dates))]

        if temp_df.empty:
            continue

        # append data to lists
        created_at_list.append(temp_df['created_at'].tolist())
        id_list.append(temp_df['id'].tolist())
        text_list.append(temp_df['text'].tolist())
        user_list.append(temp_df['user'].tolist())
        extended_tweet_list.append(temp_df['extended_tweet'].tolist())
        retweeted_status_list.append(temp_df['retweeted_status'].tolist())
        tags_list.append(temp_df['tags'].tolist())

  0%|          | 0/1308 [00:00<?, ?it/s]

In [5]:
# Build the output frame: every cell initially holds one list per
# (file, hashtag) match, so each column is exploded to one row per tweet.
list_columns = {
    'created_at': created_at_list,
    'id': id_list,
    'text': text_list,
    'user': user_list,
    'extended_tweet': extended_tweet_list,
    'retweeted_status': retweeted_status_list,
    'tags': tags_list,
}

# All columns of a row hold lists of the same length, so exploding them
# column by column keeps the rows aligned; afterwards renumber rows 0..n-1.
output_df = (
    pd.DataFrame(data=list_columns)
    .apply(pd.Series.explode)
    .reset_index(drop=True)
)

output_df.head()

Unnamed: 0,created_at,id,text,user,extended_tweet,retweeted_status,tags
0,2017-08-02,892868367195025408,RT @Beatrix_vStorch: Ja zum #Diesel. https://t...,"{'id': 2746361571, 'id_str': '2746361571', 'na...",,{'created_at': 'Wed Aug 02 16:23:23 +0000 2017...,diesel
1,2017-08-02,892890742905081856,RT @Beatrix_vStorch: Ja zum #Diesel. https://t...,"{'id': 4379225363, 'id_str': '4379225363', 'na...",,{'created_at': 'Wed Aug 02 16:23:23 +0000 2017...,diesel
2,2017-08-03,892911669633966080,#Diesel #Dobrindt's schützende Hand über Autok...,"{'id': 22900494, 'id_str': '22900494', 'name':...",,,diesel
3,2017-08-25,901109158300315648,"RT @fdp: Selbstbestimmung darf keine Frage, so...","{'id': 22564113, 'id_str': '22564113', 'name':...",,{'created_at': 'Fri Aug 25 14:03:53 +0000 2017...,denkenwirneu
4,2017-08-25,901109469568008192,"RT @fdp: Selbstbestimmung darf keine Frage, so...","{'id': 710897145767927808, 'id_str': '71089714...",,{'created_at': 'Fri Aug 25 14:03:53 +0000 2017...,denkenwirneu


In [6]:
# Persist the collected tweets (before any text processing) as CSV.
# The DataFrame index is written as well (to_csv default).
path_file = '../../data/BTW17_Twitter/lda/lda_tweets.csv'
output_df.to_csv(path_or_buf=path_file)

In [7]:
# Replace missing retweet / extended-tweet payloads with the sentinel 0 that is
# handed to get_text below.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: that chained in-place call acts on a temporary object and does not
# update output_df when pandas Copy-on-Write is active.
output_df['retweeted_status'] = output_df['retweeted_status'].fillna(0)
output_df['extended_tweet'] = output_df['extended_tweet'].fillna(0)

# get_text is the project helper from ../scripts/twitter; it receives the
# extended tweet, the retweeted status and the plain text of each row
# (presumably returning the untruncated tweet text - confirm in that module).
output_df['full_text'] = output_df.apply(
    lambda row: get_text(row['extended_tweet'], row['retweeted_status'], row['text']),
    axis=1,
)

In [8]:
# Clean the full tweet text in two passes: first strip @mentions
# ('@' followed by letters, digits or underscores), then strip every
# remaining digit.
mention_pattern = re.compile('@[A-Za-z0-9_]+')
digit_pattern = re.compile('[0-9]')

output_df['full_text_processed'] = (
    output_df['full_text']
    .apply(lambda text: mention_pattern.sub('', text))
    .apply(lambda text: digit_pattern.sub('', text))
)

In [9]:
# Turn every cleaned tweet into its token list, showing a progress bar.
# NOTE(review): `lemmatize` is not imported in the visible cells of this
# notebook (the import cell only brings in `nlp_pipeline` from
# lda_preprocessing, which is never used) - confirm where `lemmatize` comes
# from, otherwise this cell fails with a NameError on a fresh kernel.
tweet_list = output_df['full_text_processed'].tolist()
tokens = [lemmatize(tweet) for tweet in tqdm(tweet_list)]

  0%|          | 0/62986 [00:00<?, ?it/s]

In [11]:
# Attach the token lists as a new column (one list per tweet, same order as the rows).
output_df = output_df.assign(tokens=tokens)

# Save the preprocessed tweets. CSV stores each token list as its string
# representation, so it has to be parsed again after reading the file back.
output_df.to_csv(path_or_buf='../../data/BTW17_Twitter/lda/preprocessed_lda_tweets.csv')

In [38]:
# Spot-check the preprocessing: print the original text of one tweet next to
# the tokens produced for it.
sample_label = 100
original_text = output_df['full_text'].loc[sample_label]
sample_tokens = output_df['tokens'].loc[sample_label]

print('original document: ')
print(original_text)
print('\ntokenized and lemmatized document: ')
print(sample_tokens)

original document: 
"Wir werden der soziale Oppositionsführer bleiben. Und wir werden den Regierenden Beine machen." @SWagenknecht #btw17

tokenized and lemmatized document: 
['soziale', 'oppositionsführer', 'bleiben', 'regierende', 'bein', 'btw']
