In [1]:
import numpy as np
import pandas as pd
import ast
from bs4 import BeautifulSoup

The processed tweets data is already saved in 'tweets.pkl'. You can get the same result either by reading the pickle file or by running the whole notebook, which may take 5-10 minutes. To run the notebook, keep the data folder 'TwitterTrends_data' in the same parent folder as the project folder.

In [2]:
# read pickle file
# Optional shortcut: uncomment the line below to load the already-processed
# tweets instead of rebuilding them with the remaining cells.
# Only unpickle files that come from a source you trust.
# df = pd.read_pickle('../../data/tweets.pkl')

Read trending topics

In [3]:
# Load the trending-topic table from trends.csv.
# Only CSV columns 1-4 are used, and they are renamed via `names`
# (header=0 tells pandas the first row is a header to be replaced).
trends = pd.read_csv(
    "../../TwitterTrends_data/trends.csv",
    header=0,
    usecols=[1, 2, 3, 4],
    names=['filename', 'trending_date', 'topic', 'type'],
    dtype={'trending_date': str},
)

# trending_date was read as text above; parse every value into a timestamp
trending_dates = trends['trending_date'].map(pd.to_datetime)
trends['trending_date'] = trending_dates

Read all the tweet objects and collect useful information, more specifically:

1. remove non-English tweets, and drop topics where fewer than 60% of the tweets are in English or fewer than 100 English tweets remain
2. keep user_id and user_followers_count from column 'user'
3. keep a list of hashtags from column 'entities'
4. keep id, text and hashtags from column 'retweeted_status'
5. extract text from 'source' column
6. convert created_at to datetime format

In [4]:
# Build one cleaned dataframe per trending topic, then stack them into `df`.
#
# For every file name in `trends.filename` the loop:
#   * skips the topic when fewer than 60% of its language-tagged tweets are English,
#   * keeps only the English tweets and skips the topic if fewer than 100 remain,
#   * flattens the nested 'user', 'entities' and 'retweeted_status' fields,
#   * strips the HTML markup from 'source' and parses 'created_at'.
MIN_ENGLISH_SHARE = 0.6  # minimum share of English tweets for a topic to be kept
MIN_TWEETS = 100         # minimum number of English tweets for a topic to be kept

# columns of the final dataframe (loop-invariant, so defined once up front)
columns_to_keep = ['filename','created_at','hashtags','favorite_count','id','user_id',
                   'user_followers_count','retweet_count', 'in_reply_to_status_id',
                   'in_reply_to_user_id','source','text','original_tweet_id',
                   'original_tweet_text','original_tweet_hashtags']

# create a list for data frames
df_list = []
for filename in trends.filename:
    # read csv file (id columns are read as strings)
    topic_df = pd.read_csv(f"../../TwitterTrends_data/full_tweets/{filename}.csv",lineterminator='\n',
                           dtype={'id': str,'in_reply_to_status_id_str': str,'in_reply_to_user_id_str': str})

    # remove non English tweets (if tweets in English were less than 60%, remove topic)
    is_english = topic_df['lang'] == 'en'
    n_with_lang = topic_df['lang'].count()  # count() ignores rows with a missing lang
    # a topic without any language-tagged tweet has no English tweets either -> skip
    if n_with_lang == 0 or is_english.sum() / n_with_lang < MIN_ENGLISH_SHARE:
        continue

    # keep only English tweets; .copy() gives an independent frame so the column
    # assignments below do not write into a slice of the raw frame
    # (avoids pandas' SettingWithCopyWarning)
    topic_df = topic_df[is_english].copy()

    # remove topics with less than 100 tweets
    if len(topic_df) < MIN_TWEETS:
        continue

    # for user information, keep only user_id and follower_count
    # ('user' holds the string repr of a dict, parsed here with ast.literal_eval)
    users = topic_df['user'].apply(ast.literal_eval)
    topic_df['user_followers_count'] = users.apply(lambda user: user['followers_count'])
    topic_df['user_id'] = users.apply(lambda user: user['id']).astype('str')

    # create a list of hashtags used in the tweet from entities
    topic_df['hashtags'] = topic_df['entities'].apply(
        lambda x: [hashtag['text'] for hashtag in ast.literal_eval(x)['hashtags']])

    # retrieve original tweet text and id from retweeted_status
    if 'retweeted_status' in topic_df.columns:
        # string cells are parsed into dicts; anything else (e.g. NaN) is kept as is
        retweeted = topic_df['retweeted_status'].apply(
            lambda x: ast.literal_eval(x) if isinstance(x,str) else x)
        # NOTE(review): astype('str') turns missing values into the string 'nan',
        # whereas the else-branch below stores a real NaN. Kept as is so the output
        # stays unchanged - confirm which one downstream code expects.
        topic_df['original_tweet_id'] = retweeted.apply(
            lambda x: x['id_str'] if isinstance(x,dict) else x).astype('str')
        topic_df['original_tweet_text'] = retweeted.apply(
            lambda x: x['text'] if isinstance(x,dict) else x)
        # NaN when the tweet is not a retweet or the original tweet has no hashtags
        topic_df['original_tweet_hashtags'] = retweeted.apply(
            lambda x: [hashtag['text'] for hashtag in x['entities']['hashtags']]
            if isinstance(x,dict) and x['entities']['hashtags'] else np.nan)
    # some tweet objects do not have a column of retweeted_status
    else:
        topic_df['original_tweet_id'] = np.nan
        topic_df['original_tweet_text'] = np.nan
        topic_df['original_tweet_hashtags'] = np.nan

    # source: keep only the visible text of the HTML snippet; the parser is named
    # explicitly so the result does not depend on which parsers are installed
    topic_df['source'] = topic_df['source'].apply(
        lambda x: BeautifulSoup(x, 'html.parser').get_text())

    # convert created_at to datetime
    topic_df['created_at'] = topic_df['created_at'].apply(pd.to_datetime)

    # save in_reply_to_status_id and in_reply_to_user_id as string
    topic_df['in_reply_to_status_id'] = topic_df['in_reply_to_status_id_str']
    topic_df['in_reply_to_user_id'] = topic_df['in_reply_to_user_id_str']

    # filename (used later as the key to join with `trends`)
    topic_df['filename'] = filename

    # append to a list of dataframes
    df_list.append(topic_df[columns_to_keep])
df = pd.concat(df_list)

Merge dataframes of tweets and trending topics

In [5]:
# attach the trend columns from `trends` to every tweet via the shared
# 'filename' key, then drop that key from the result
df = pd.merge(df, trends, on='filename').drop('filename', axis=1)

In [6]:
# save as pickle file for fast retrieval
# (uncomment the line below to write the merged dataframe to disk)
# df.to_pickle('../../data/tweets.pkl')