Import libraries

In [None]:
import gzip
import itertools
import json
import pathlib
import time

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns

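Some `nltk` functionality depends on additional resources that are obtained with the NLTK (Natural Language Toolkit) `download` function. If you have not downloaded them yet, set `run_nltk_downloader` to `True` in the following code block before running it.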

In [None]:
# Opt-in switch: leave as False to skip the download step; set to True to
# have this cell call nltk.download().
run_nltk_downloader = False

if run_nltk_downloader:
    # Called with no arguments, so no specific resource is named here.
    nltk.download()

The data is stored in JSON (JavaScript Object Notation) format. The JSON file is compressed to save space. The following code block shows how to read the data.

In [None]:
# Location of the gzip-compressed JSON file holding the tweet sample.
raw_data_filepath = pathlib.Path('relevant_data.json.gz')

# Hold the file handle open only long enough to pull out the decompressed bytes.
with gzip.open(raw_data_filepath, mode = 'rb') as fp:
    json_bytes = fp.read()

# Decode the UTF-8 bytes to text and parse the JSON into Python objects.
json_str = json_bytes.decode('utf-8')
tweet_data = json.loads(json_str)

print(f'The data includes {len(tweet_data)} observations.')

The data is stored as a list of lists. Each sublist has six items:
1. the datetime for the tweet,
2. the username associated with the tweet,
3. the user associated with the tweet,
4. the location associated with the tweet,
5. the language associated with the tweet, and
6. the tweet text.

The following code block prints the first entry as an example.

In [None]:
# Display the first raw record as loaded from the JSON file, before it is
# converted into a DataFrame.
tweet_data[0]

The following code block:
1. creates a Pandas `DataFrame` with the data (named `tweet_df`),
2. converts the `Datetime` column to a datetime format,
3. creates a `Date` column with just the date extracted from the `Datetime` column,
4. creates a `Day_Name` column that specifies the day of the week that the tweet was posted,
5. creates a `Weekend` column that is `True` if the tweet was posted on a weekend, and `False` otherwise, and
6. creates an `Hour` column with the hour of the day extracted from the `Datetime` column.

In [None]:
# Column labels, listed in the same order as the six items of each raw record.
column_names = [
    'Datetime',
    'Username',
    'User',
    'Location',
    'Language',
    'Text',
]

tweet_df = pd.DataFrame(tweet_data, 
                        columns = column_names)

# Parse the timestamp strings using an explicit format; the `%z` directive
# means the UTC offset in each string is read as part of the value.
tweet_df['Datetime'] = pd.to_datetime(tweet_df['Datetime'], 
                                      format = '%a %b %d %H:%M:%S %z %Y')

# Calendar date and weekday name derived from the parsed timestamp.
tweet_df['Date'] = tweet_df['Datetime'].dt.date
tweet_df['Day_Name'] = tweet_df['Datetime'].dt.day_name()

# Weekend flag: True for Saturday/Sunday rows and False for all other rows.
# The mask is assigned directly so the two cases cannot both end up True,
# and the resulting column has a boolean dtype.
weekend_mask = tweet_df['Day_Name'].isin(['Saturday', 'Sunday'])
tweet_df['Weekend'] = weekend_mask

# Hour of day (0-23) derived from the parsed timestamp.
tweet_df['Hour'] = tweet_df['Datetime'].dt.hour

The following code block shows the proportion of missing values in each column (the number of missing values divided by the total number of observations).

In [None]:
# Averaging the boolean missing-value mask gives, for each column, the
# number of missing entries divided by the number of rows.
tweet_df.isna().mean()

The following code block plots the number of tweets collected each day (recall this is just a sample).

In [None]:
# Count the tweets (non-missing `Text` values) recorded on each date.
temp = (
    tweet_df
    .groupby(['Date'])['Text']
    .count()
    .reset_index()
    .rename(columns = {'Text': '# Tweets'})
)

# Draw the daily counts as a line on an explicitly created axes.
fig, ax = plt.subplots(1, 1, figsize = (15, 6))
sns.lineplot(data = temp, x = 'Date', y = '# Tweets', ax = ax)

plt.show()

The following code block generates a plot that shows how the average number of tweets posted a day varies by day of the week.

In [None]:
# Bars are drawn Monday through Sunday rather than in alphabetical order.
weekday_order = [
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday',
    'Sunday',
]

# Two-stage aggregation: count tweets per calendar date (carrying the day
# name along), then average those daily counts within each day of the week.
temp = (
    tweet_df
    .groupby(['Date', 'Day_Name'])['Text']
    .count()
    .reset_index()
    .groupby(['Day_Name'])['Text']
    .mean()
    .reset_index()
    .rename(columns = {'Text': '# Tweets (Avg)'})
)

fig, ax = plt.subplots(1, 1, figsize = (12, 6))
sns.barplot(
    data = temp,
    x = 'Day_Name',
    y = '# Tweets (Avg)',
    order = weekday_order,
    edgecolor = 'k',
    ax = ax,
)

plt.show()

The following code block generates a plot that shows how the average number of tweets posted per hour varies by hour of day (for each hour, the hourly counts are averaged across dates).

In [None]:
# Two-stage aggregation: count tweets for every (date, hour) pair, then
# average those counts across dates for each hour of the day.
temp = (
    tweet_df
    .groupby(['Date', 'Hour'])['Text']
    .count()
    .reset_index()
    .groupby(['Hour'])['Text']
    .mean()
    .reset_index()
    .rename(columns = {'Text': '# Tweets (Avg)'})
)

fig, ax = plt.subplots(1, 1, figsize = (12, 6))
sns.barplot(
    data = temp,
    x = 'Hour',
    y = '# Tweets (Avg)',
    edgecolor = 'k',
    ax = ax,
)

plt.show()

The following code block shows how we can use `nltk` to get a frequency distribution for the data.

In [None]:
# Time the whole pipeline: lowercasing, tokenizing and counting.
start_time = time.time()

# Lowercase the text first so tokens that differ only by case are counted together.
tweets = tweet_df['Text'].str.lower().tolist()

# Tokenize each tweet separately, giving one token list per tweet.
tknzr = nltk.tokenize.TweetTokenizer()
tweet_tokens = list(map(tknzr.tokenize, tweets))

# Flatten the per-tweet lists into a single token sequence and count occurrences.
all_tokens = list(itertools.chain.from_iterable(tweet_tokens))
freq_dist = nltk.FreqDist(all_tokens)

end_time = time.time()
elapsed_seconds = end_time - start_time
print(f'Tokenization and frequency distribution construction took {np.round(elapsed_seconds, 2)} seconds.')

The following code block shows the 30 most frequent tokens.

In [None]:
# Request the 30 most frequent entries from the frequency distribution.
freq_dist.most_common(30)