In [None]:
!pip install pycaret -q

In [None]:
import warnings
import itertools
import numpy as np
import matplotlib.pyplot as plt
# Silence library warnings before the remaining imports run.
warnings.filterwarnings("ignore")
import pandas as pd
import statsmodels.api as sm
import matplotlib
from pylab import rcParams
import glob
from sklearn.metrics import mean_absolute_error
from pycaret.regression import *
import matplotlib.dates as mdates
import textwrap

# Apply the seaborn look once, after all imports. matplotlib 3.6 renamed the
# bundled style to 'seaborn-v0_8' and later releases dropped the old name, so
# pick whichever name the installed matplotlib provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# matplotlib.rcParams['axes.labelsize'] = 14
# matplotlib.rcParams['xtick.labelsize'] = 12
# matplotlib.rcParams['ytick.labelsize'] = 12
# matplotlib.rcParams['text.color'] = 'k'

In [None]:
def getDailySentiment(dataframe, column):
    """Summarise every group of `column` by its most frequent sentiment.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the column named by `column` plus 'sentiment' and
        'tweet' columns. Only non-null 'tweet' values are counted.
    column : str
        Name of the grouping column (the notebook passes 'date').

    Returns
    -------
    pandas.DataFrame
        One row per group, with the columns:
        'sentiment'            - label with the highest tweet count in the group
        'sentimentTweetCount'  - number of tweets carrying that label
        'totalDailyTweets'     - number of tweets in the group
        'dailySentimentScore'  - sentimentTweetCount / totalDailyTweets rounded
                                 to 3 decimals; positive for 'positive',
                                 negated for 'negative', 0.0 for anything else.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Total number of tweets per group.
    dailyTweets = dataframe.groupby([dataframe[column]])['tweet'].count().to_frame('totalDailyTweets')

    # Number of tweets per (group, sentiment) pair.
    sentiments_per_day = dataframe.groupby([column, 'sentiment'])['tweet'].count().to_frame('sentimentTweetCount')

    # Keep, for every group, the row of the sentiment with the largest count.
    # idxmax returns the first maximum, so ties presumably go to the label that
    # comes first in the grouped index.
    dominant_idx = sentiments_per_day.groupby([column], sort=False)['sentimentTweetCount'].idxmax()
    dailySentiment = sentiments_per_day.loc[dominant_idx][['sentimentTweetCount']]

    # Move 'sentiment' from the index into a regular column.
    dailySentiment = dailySentiment.reset_index(level=['sentiment'])

    # Attach the per-group totals.
    dailySentiment = dailySentiment.merge(dailyTweets, on=column, how='inner')

    # Signed share of the dominant sentiment, computed for all rows at once.
    share = (dailySentiment['sentimentTweetCount'] / dailySentiment['totalDailyTweets']).round(3)
    label = dailySentiment['sentiment']
    dailySentiment['dailySentimentScore'] = np.select(
        [label == 'positive', label == 'negative'],
        [share, -share],
        default=0.0,
    )

    return dailySentiment

In [None]:
# Input folders, one per organisation category, under a shared data root.
_COMBINED_DATA_DIR = '../input/healthjmir/combined data'
PHARMA_PATH = _COMBINED_DATA_DIR + '/pharma companies'
GOVT_INSTITUTES_PATH = _COMBINED_DATA_DIR + '/public health agencies'
NGO_PATH = _COMBINED_DATA_DIR + '/ngo'

In [None]:
# Load every CSV of the public-health-agency folder into one frame. Sorting the
# paths keeps the row order independent of the filesystem's listing order.
csv_paths = sorted(glob.glob(GOVT_INSTITUTES_PATH + "/*.csv"))
if not csv_paths:
    raise FileNotFoundError(f"No CSV files found in {GOVT_INSTITUTES_PATH!r}")
df = pd.concat([pd.read_csv(f, sep=',') for f in csv_paths], ignore_index=True)

# Divide as per dates. 'created_at' is presumably still text at this point (it
# is sliced with .str further down), so these are string comparisons.
# NOTE(review): both filters include 2020-02-26, so tweets from that day land
# in BOTH frames - confirm which side the boundary day belongs to.
# .copy() makes each split an independent frame that is safe to modify later.
pre_covid_df = df.loc[df['created_at'] <= '2020-02-26 23:59:59'].copy()
print(pre_covid_df.shape)

during_covid_df = df.loc[df['created_at'] >= '2020-02-26 00:00:00'].copy()
print(during_covid_df.shape)

In [None]:
# Columns that play no part in the daily sentiment aggregation.
extra_cols = ['id','retweet_count','like_count','reply_count','quote_count','expanded_url','language','possibly_sensitive','in_reply_to_user_id','positive','negative','neutral']

during_covid_df = (
    during_covid_df
    # Cut the last six characters off every timestamp (presumably a UTC offset
    # such as '+00:00' - TODO confirm against the raw data), then parse it.
    .assign(created_at=lambda frame: pd.to_datetime(frame['created_at'].str[:-6]))
    # Calendar day of each tweet, used as the grouping key below.
    .assign(date=lambda frame: frame['created_at'].dt.date)
    # Sort by day, ascending.
    .sort_values(by='date')
    .drop(columns=extra_cols)
)

# Dominant sentiment and signed score per day, indexed by a DatetimeIndex so
# the result can be resampled; unparseable index values become NaT.
dailySentiment_df = getDailySentiment(during_covid_df, 'date')
dailySentiment_df.index = pd.to_datetime(dailySentiment_df.index, errors='coerce')

In [None]:
# Row position 70% of the way through the daily frame; display that row to see
# which day a 70/30 chronological split would fall on.
train_len = int(0.7 * len(dailySentiment_df))
dailySentiment_df.iloc[train_len]

In [None]:
# Average the daily scores per calendar month; 'MS' labels every bin with the
# first day of its month.
monthly_bins = dailySentiment_df['dailySentimentScore'].resample('MS')
y = monthly_bins.mean()
y.plot(figsize=(15, 6))
plt.show()

In [None]:
# Flatten the monthly series into a two-column frame with a default integer index.
monthly_columns = {
    'date': y.index,
    'sentimentScore': y.values,
}
monthlySentiment = pd.DataFrame(monthly_columns)

In [None]:
# Sanity check: show the column names of the monthly frame.
monthlySentiment.columns

In [None]:
# Split each month stamp into its calendar year and month number.
month_stamp = monthlySentiment['date'].dt
monthlySentiment['year'] = month_stamp.year
monthlySentiment['month'] = month_stamp.month

In [None]:
# Chronological hold-out: months up to and including 2021-06-01 are used for
# training, every later month for testing.
in_train_window = monthlySentiment['date'] <= '2021-06-01'
in_test_window = monthlySentiment['date'] > '2021-06-01'
train = monthlySentiment[in_train_window]
test = monthlySentiment[in_test_window]

In [None]:
# Sanity check: (rows, columns) of the training and test splits.
train.shape, test.shape

In [None]:
# Configure the PyCaret regression experiment: explicit train/test frames,
# 'sentimentScore' as the target, 'year' and 'month' declared numeric,
# 5 time-series cross-validation folds and a fixed session id.
s = setup(
    data=train,
    test_data=test,
    target='sentimentScore',
    numeric_features=['year', 'month'],
    fold_strategy='timeseries',
    fold=5,
    session_id=123,
)

In [None]:
# Let PyCaret compare its candidate regressors and keep the one ranked first
# when the results are sorted by MAE.
best = compare_models(sort = 'MAE')

In [None]:
# Score the selected model without passing data; per PyCaret's documented
# default this evaluates on the hold-out set configured in setup().
prediction_holdout = predict_model(best)

In [None]:
# Generate predictions for the held-out test months.
predictions = predict_model(best, data=test)

# PyCaret 2.x names the prediction column 'Label'; PyCaret 3.x renamed it to
# 'prediction_label'. Use whichever one the installed version produced.
prediction_col = 'prediction_label' if 'prediction_label' in predictions.columns else 'Label'

# Create exactly one figure; a separate plt.figure() call before plt.subplots()
# would leave an additional, empty figure in the cell output.
fig, ax = plt.subplots(figsize=(14, 8))
ax.plot(monthlySentiment['date'], monthlySentiment['sentimentScore'], label='Observed')
ax.plot(predictions['date'], predictions[prediction_col], label='One-step ahead Forecast')
ax.legend(loc=2, prop={'size': 20})
ax.set_xlabel('Date', fontsize=23)
ax.set_ylabel('Sentiment Score', fontsize=23)

# Tick labels such as "Jul" over "2021".
myFmt = mdates.DateFormatter('%b\n%Y')
ax.xaxis.set_major_formatter(myFmt)
ax.tick_params(axis='both', labelsize=23)

# Shade July-December 2021.
# NOTE(review): presumably this is meant to cover the test months - confirm
# the hard-coded end date still matches the data.
ax.axvspan(pd.Timestamp('2021-07-01'), pd.Timestamp('2021-12-01'), color='grey', alpha=0.2)

fig.savefig('public-health-agencies.pdf', bbox_inches='tight')
fig.savefig('public-health-agencies.png', bbox_inches='tight')
plt.show()