In [80]:
import pandas as pd
from plotly.subplots import make_subplots
import plotly.graph_objects as go

df = pd.read_csv('../../data/merged_data_sentiment.csv')

# if a row has no Platform, it's from Twitter
df['Platform'] = df['Platform'].fillna('Twitter')
df['Platform'].value_counts()


Columns (11) have mixed types. Specify dtype option on import or set low_memory=False.



youtube    45089
Reddit     19319
Twitter     3692
Name: Platform, dtype: int64

In [81]:
df.shape

(68100, 14)

In [82]:
# get mean sentiment for each comment
def avg_sent(df):
  for i, row in df.iterrows():
    sent = row['Sentiment'].split(',')
    sent = [float(s) for s in sent]
    df.at[i, 'Avg_Sentiment'] = sum(sent) / len(sent)
avg_sent(df)

In [83]:
# filter out everything before 2019
# df = df[df['Date'] >= '2019-01-01']

In [84]:
# get month from date
print(df.columns)
df['Month'] = pd.to_datetime(df['Date']).dt.to_period('M').astype(str)

# get mean sentiment for each month and number of data points
df = df.groupby(['Month', 'Platform']).agg({'Avg_Sentiment': 'mean', 'ID': 'count'}).reset_index()

Index(['ID', 'Content', 'User', 'Date', 'Location', 'Reactions', 'N_Children',
       'Post Title', 'Platform', 'meta', 'lang', 'Subreddit', 'Unnamed: 0',
       'Sentiment', 'Avg_Sentiment'],
      dtype='object')



Parsing dates in DD/MM/YYYY format when dayfirst=False (the default) was specified. This may lead to inconsistently parsed dates! Specify a format to ensure consistent parsing.



In [85]:
# show line chart of sentiment over time for each platform (by month)

# plot each platform on the same chart, one line for each
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(go.Scatter(x=df[df['Platform'] == 'youtube']['Month'],
                         y=df[df['Platform'] == 'youtube']['Avg_Sentiment'],
                         name='YouTube', line=dict(color='red')),
              secondary_y=True)
fig.add_trace(go.Scatter(x=df[df['Platform'] == 'Twitter']['Month'],
                         y=df[df['Platform'] == 'Twitter']['Avg_Sentiment'],
                         name='Twitter', line=dict(color='blue')),
              secondary_y=False)
fig.add_trace(go.Scatter(x=df[df['Platform'] == 'Reddit']['Month'],
                         y=df[df['Platform'] == 'Reddit']['Avg_Sentiment'],
                         name='Reddit', line=dict(color='green')),
              secondary_y=True)
fig.update_layout(title_text="Sentiment Over Time by Platform")
fig.show()


In [86]:
# slice chart from 2019 onwards
recent_df = df[df['Month'] >= '2019-01']

# plot each platform on the same chart, one line for each
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(go.Scatter(x=recent_df[recent_df['Platform'] == 'youtube']['Month'],
                          y=recent_df[recent_df['Platform'] == 'youtube']['Avg_Sentiment'],
                          name='YouTube', line=dict(color='red')),
              secondary_y=True)
fig.add_trace(go.Scatter(x=recent_df[recent_df['Platform'] == 'Twitter']['Month'],
                          y=recent_df[recent_df['Platform'] == 'Twitter']['Avg_Sentiment'],
                          name='Twitter', line=dict(color='blue')),
              secondary_y=False)
fig.add_trace(go.Scatter(x=recent_df[recent_df['Platform'] == 'Reddit']['Month'],
                          y=recent_df[recent_df['Platform'] == 'Reddit']['Avg_Sentiment'],
                          name='Reddit', line=dict(color='green')),
              secondary_y=True)
fig.update_layout(title_text="Sentiment Over Time by Platform (2019 onwards)")
fig.show()

In [87]:
# pie chart of number of posts by platform
fig = go.Figure(data=[go.Pie(labels=df['Platform'], values=df['ID'])])
fig.update_layout(title_text="Number of Posts by Platform")
fig.show()

In [88]:
# pie chart of how many comments there are from each year
df = pd.read_csv('../../data/merged_data_sentiment.csv')
df['Year'] = pd.to_datetime(df['Date']).dt.to_period('Y').astype(str)
df = df.groupby(['Year']).agg({'ID': 'count'}).reset_index()
fig = go.Figure(data=[go.Pie(labels=df['Year'], values=df['ID'])])
fig.update_layout(title_text="Number of Posts by Year")
fig.show()


Columns (11) have mixed types. Specify dtype option on import or set low_memory=False.


Parsing dates in DD/MM/YYYY format when dayfirst=False (the default) was specified. This may lead to inconsistently parsed dates! Specify a format to ensure consistent parsing.



In [89]:
# how much of the data is from before 2019?
df[df['Year'] < '2019']['ID'].sum() / df['ID'].sum()

0.05600826690286389

In [90]:
# date of the earliest post from Reddit
df = pd.read_csv('../../data/merged_data_sentiment.csv')
df = df[df['Platform'] == 'Reddit']
df['Date'] = pd.to_datetime(df['Date'])
df['Date'].min()


Columns (11) have mixed types. Specify dtype option on import or set low_memory=False.



Timestamp('2022-09-27 00:00:00')

In [91]:
# average sentiment on each platform
df = pd.read_csv('../../data/merged_data_sentiment.csv')
# only data from 27 September 2022 onwards
df = df[df['Date'] >= '2022-09-27']

avg_sent(df)
df['Platform'] = df['Platform'].fillna('Twitter')
df = df.groupby(['Platform']).agg({'Avg_Sentiment': 'mean'}).reset_index()
df


Columns (11) have mixed types. Specify dtype option on import or set low_memory=False.



Unnamed: 0,Platform,Avg_Sentiment
0,Reddit,-0.050162
1,Twitter,0.092659
2,youtube,-0.03461


In [92]:
# line chart of YouTube and Reddit sentiment over 2022 and onwards
df = pd.read_csv('../../data/merged_data_sentiment.csv')
df = df[df['Date'] >= '2022-09-27']
avg_sent(df)
df = df.groupby(['Platform', 'Date']).agg({'Avg_Sentiment': 'mean'}).reset_index()
df['Month'] = pd.to_datetime(df['Date']).dt.to_period('M').astype(str)
df = df.groupby(['Platform', 'Month']).agg({'Avg_Sentiment': 'mean'}).reset_index()

# plot each platform on the same chart, one line for each
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(go.Scatter(x=df[df['Platform'] == 'youtube']['Month'],
                          y=df[df['Platform'] == 'youtube']['Avg_Sentiment'],
                          name='YouTube', line=dict(color='red')),
              secondary_y=True)
fig.add_trace(go.Scatter(x=df[df['Platform'] == 'Reddit']['Month'],
                          y=df[df['Platform'] == 'Reddit']['Avg_Sentiment'],
                          name='Reddit', line=dict(color='green')),
              secondary_y=True)
fig.update_layout(
    title_text="Sentiment Over Time by Platform (27 September 2022 onwards)",
    xaxis_title="Month",
    yaxis_title="Average Sentiment")
fig.show()



Columns (11) have mixed types. Specify dtype option on import or set low_memory=False.


Parsing dates in DD/MM/YYYY format when dayfirst=False (the default) was specified. This may lead to inconsistently parsed dates! Specify a format to ensure consistent parsing.

