In [None]:
### importing the libraries

import pandas as pd
import numpy as np
import os

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import datetime

In [None]:
# Directory layout used by this notebook. All paths are relative, so they are
# resolved against the working directory the notebook is run from.
SENTIMENT140_DATA_DIR = 'Sentiment140.data' # Sentiment140 data set saved here
DG_DATA_DIR = 'D_G data' # D&G data set saved here (the CSVs with predictions are read from this folder)
OUTPUT_DIR = 'output' # intermediate output and models saved here
FIGURES_DIR = 'figures' # figures saved here (target folder of the savefig calls in the plotting cells)

# Read in data with predictions

In [None]:
# Locate the two D&G mention exports that already carry the model predictions.
chopsticks_csv = os.path.join(DG_DATA_DIR, "dolcegabbana_chopsticks_mentions_daily_expanded_with_predictions.csv")
all_mentions_csv = os.path.join(DG_DATA_DIR, "dolcegabbana_mentions_daily_all_with_predictions.csv")

# Mentions related to the chopsticks campaign.
df_chopsticks = pd.read_csv(chopsticks_csv)

# All mentions. Only '\n' is treated as the row terminator for this file
# (presumably because some tweet texts contain other line-break characters -- verify).
df_all = pd.read_csv(all_mentions_csv, lineterminator='\n')

In [None]:
# Parse the timestamp columns (plain strings after read_csv) into datetimes.
df_chopsticks['interval_start_date'] = pd.to_datetime(df_chopsticks['interval_start_date'])
df_chopsticks['created_at'] = pd.to_datetime(df_chopsticks['created_at'])

# Calendar day of each tweet as a timezone-naive midnight timestamp.
# normalize() zeroes the time of day; tz_localize(None) drops a timezone if one
# is present, which matches going through Python `date` objects but stays vectorized.
df_chopsticks['date'] = df_chopsticks['created_at'].dt.normalize().dt.tz_localize(None)

# Week start (Monday) used to aggregate over weeks: subtract the weekday offset
# (Monday == 0) from each date. Done column-wise instead of a per-row lambda, so
# it is fast on large frames and missing timestamps simply stay missing (NaT).
days_since_monday = pd.to_timedelta(df_chopsticks['date'].dt.weekday, unit='D')
df_chopsticks['week_start_date'] = (df_chopsticks['date'] - days_since_monday).dt.date  # stored as datetime.date objects

df_chopsticks.head()

In [None]:
# Parse the timestamp columns (plain strings after read_csv) into datetimes.
df_all['interval_start_date'] = pd.to_datetime(df_all['interval_start_date'])
df_all['created_at'] = pd.to_datetime(df_all['created_at'])

# Calendar day of each tweet as a timezone-naive midnight timestamp.
# normalize() zeroes the time of day; tz_localize(None) drops a timezone if one
# is present, which matches going through Python `date` objects but stays vectorized.
df_all['date'] = df_all['created_at'].dt.normalize().dt.tz_localize(None)

# Week start (Monday) used to aggregate over weeks: subtract the weekday offset
# (Monday == 0) from each date. Done column-wise instead of a per-row lambda, so
# it is fast on large frames and missing timestamps simply stay missing (NaT).
days_since_monday = pd.to_timedelta(df_all['date'].dt.weekday, unit='D')
df_all['week_start_date'] = (df_all['date'] - days_since_monday).dt.date  # stored as datetime.date objects

df_all.head()

In [None]:
# Partition each data set by the predicted sentiment label in the 'pred' column
# (per the original note, produced by an ensemble of classifiers: svc, rfc, lr, nb).
# Rows labelled 1 are treated as positive, rows labelled -1 as negative;
# rows with any other label end up in neither frame.
POSITIVE_LABEL, NEGATIVE_LABEL = 1, -1

# All D&G mentions
df_all_pos = df_all.loc[df_all['pred'].eq(POSITIVE_LABEL)]
df_all_neg = df_all.loc[df_all['pred'].eq(NEGATIVE_LABEL)]

# Chopsticks-related mentions
df_chopsticks_pos = df_chopsticks.loc[df_chopsticks['pred'].eq(POSITIVE_LABEL)]
df_chopsticks_neg = df_chopsticks.loc[df_chopsticks['pred'].eq(NEGATIVE_LABEL)]

In [None]:
def _weekly_retweet_stats(tweets):
    """Aggregate a tweet-level frame to one row per week.

    Groups ``tweets`` by ``week_start_date`` and returns a frame with columns:

    * ``date`` -- the week start date (the group key),
    * ``total_retweets`` -- sum of ``retweet_count`` within the week,
    * ``total_tweets`` -- number of rows in the week with a non-null ``retweet_count``.
    """
    weekly = (
        tweets.groupby('week_start_date')['retweet_count']
        .agg(total_retweets='sum', total_tweets='count')
        .reset_index()
    )
    return weekly.rename(columns={'week_start_date': 'date'})


# Weekly statistics for all D&G mentions, split by predicted sentiment.
pos_byweek_all = _weekly_retweet_stats(df_all_pos)
neg_byweek_all = _weekly_retweet_stats(df_all_neg)

# Weekly statistics for the chopsticks-related mentions, split by predicted sentiment.
pos_byweek_chopsticks = _weekly_retweet_stats(df_chopsticks_pos)
neg_byweek_chopsticks = _weekly_retweet_stats(df_chopsticks_neg)

# Plot total retweets over time (positive vs negative)

In [None]:
# Weekly total retweets over time for all D&G mentions: positive vs. negative tweets.
fig, ax = plt.subplots(figsize=(30, 8))
ax.plot(pos_byweek_all["date"], pos_byweek_all['total_retweets'], c='blue', label='Positive tweets')
ax.plot(neg_byweek_all["date"], neg_byweek_all['total_retweets'], c='red', label='Negative tweets')

# x axis: a major tick every 6 months labelled as 'YYYY-mm', a minor tick every month.
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=6))
ax.xaxis.set_minor_locator(mdates.MonthLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))

# Fixed time window, aligned to half-year boundaries.
datemin = np.datetime64(datetime.date(2009, 7, 1), 'm')
datemax = np.datetime64(datetime.date(2022, 1, 1), 'm')
ax.set_xlim(datemin, datemax)

# Cursor read-out in interactive backends: month on x, and a plain count on y
# (the y axis is a retweet count, so no currency formatting).
ax.format_xdata = mdates.DateFormatter('%Y-%m')
ax.format_ydata = lambda y: f'{y:,.0f}'
ax.grid(True)

# Label the axes
ax.set(xlabel='Date', ylabel='Weekly total retweet count')

# Rotate and right-align the x labels, and make room for them at the bottom.
fig.autofmt_xdate()
ax.legend()

# savefig does not create missing folders, so make sure the target exists first.
os.makedirs(FIGURES_DIR, exist_ok=True)
fig.savefig(os.path.join(FIGURES_DIR, 'D&G_all_total_retweets_over_time_pos_vs_neg.jpg'), dpi=300)

plt.show()

In [None]:
# Weekly total retweets over time for the chopsticks-related mentions: positive vs. negative tweets.
fig, ax = plt.subplots(figsize=(30, 8))
ax.plot(pos_byweek_chopsticks["date"], pos_byweek_chopsticks['total_retweets'], c='blue', label='Positive tweets')
ax.plot(neg_byweek_chopsticks["date"], neg_byweek_chopsticks['total_retweets'], c='red', label='Negative tweets')

# x axis: a major tick every 6 months labelled as 'YYYY-mm', a minor tick every month.
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=6))
ax.xaxis.set_minor_locator(mdates.MonthLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))

# Fixed time window, aligned to half-year boundaries.
datemin = np.datetime64(datetime.date(2018, 7, 1), 'm')
datemax = np.datetime64(datetime.date(2022, 1, 1), 'm')
ax.set_xlim(datemin, datemax)

# Cursor read-out in interactive backends: month on x, and a plain count on y
# (the y axis is a retweet count, so no currency formatting).
ax.format_xdata = mdates.DateFormatter('%Y-%m')
ax.format_ydata = lambda y: f'{y:,.0f}'
ax.grid(True)

# Label the axes
ax.set(xlabel='Date', ylabel='Weekly total retweet count')

# Rotate and right-align the x labels, and make room for them at the bottom.
fig.autofmt_xdate()
ax.legend()

# savefig does not create missing folders, so make sure the target exists first.
os.makedirs(FIGURES_DIR, exist_ok=True)
fig.savefig(os.path.join(FIGURES_DIR, 'D&G_chopsticks_total_retweets_over_time_pos_vs_neg.jpg'), dpi=300)

plt.show()

# Plot total tweets over time (positive vs negative)

In [None]:
# Weekly number of tweets over time for all D&G mentions: positive vs. negative tweets.
fig, ax = plt.subplots(figsize=(30, 8))
ax.plot(pos_byweek_all["date"], pos_byweek_all['total_tweets'], c='blue', label='Positive tweets')
ax.plot(neg_byweek_all["date"], neg_byweek_all['total_tweets'], c='red', label='Negative tweets')

# x axis: a major tick every 6 months labelled as 'YYYY-mm', a minor tick every month.
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=6))
ax.xaxis.set_minor_locator(mdates.MonthLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))

# Fixed time window, aligned to half-year boundaries.
datemin = np.datetime64(datetime.date(2009, 7, 1), 'm')
datemax = np.datetime64(datetime.date(2022, 1, 1), 'm')
ax.set_xlim(datemin, datemax)

# Cursor read-out in interactive backends: month on x, and a plain count on y
# (the y axis is a tweet count, so no currency formatting).
ax.format_xdata = mdates.DateFormatter('%Y-%m')
ax.format_ydata = lambda y: f'{y:,.0f}'
ax.grid(True)

# Label the axes
ax.set(xlabel='Date', ylabel='Weekly total tweet count')

# Rotate and right-align the x labels, and make room for them at the bottom.
fig.autofmt_xdate()
ax.legend()

# savefig does not create missing folders, so make sure the target exists first.
os.makedirs(FIGURES_DIR, exist_ok=True)
fig.savefig(os.path.join(FIGURES_DIR, 'D&G_all_total_tweets_over_time_pos_vs_neg.jpg'), dpi=300)

plt.show()

In [None]:
# Weekly number of tweets over time for the chopsticks-related mentions: positive vs. negative tweets.
fig, ax = plt.subplots(figsize=(30, 8))
ax.plot(pos_byweek_chopsticks["date"], pos_byweek_chopsticks['total_tweets'], c='blue', label='Positive tweets')
ax.plot(neg_byweek_chopsticks["date"], neg_byweek_chopsticks['total_tweets'], c='red', label='Negative tweets')

# x axis: a major tick every 6 months labelled as 'YYYY-mm', a minor tick every month.
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=6))
ax.xaxis.set_minor_locator(mdates.MonthLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))

# Fixed time window, aligned to half-year boundaries.
datemin = np.datetime64(datetime.date(2018, 7, 1), 'm')
datemax = np.datetime64(datetime.date(2022, 1, 1), 'm')
ax.set_xlim(datemin, datemax)

# Cursor read-out in interactive backends: month on x, and a plain count on y
# (the y axis is a tweet count, so no currency formatting).
ax.format_xdata = mdates.DateFormatter('%Y-%m')
ax.format_ydata = lambda y: f'{y:,.0f}'
ax.grid(True)

# Label the axes
ax.set(xlabel='Date', ylabel='Weekly total tweet count')

# Rotate and right-align the x labels, and make room for them at the bottom.
fig.autofmt_xdate()
ax.legend()

# savefig does not create missing folders, so make sure the target exists first.
os.makedirs(FIGURES_DIR, exist_ok=True)
fig.savefig(os.path.join(FIGURES_DIR, 'D&G_chopsticks_total_tweets_over_time_pos_vs_neg.jpg'), dpi=300)

plt.show()