In [1]:
# import packages
import pandas as pd
import statsmodels
import statsmodels.api as sm
from statsmodels.formula.api import ols
import tweepy
import json
import os
from collections import Counter
import numpy as np
from collections import defaultdict
from matplotlib import pyplot as plt
import time
from datetime import datetime
from dateutil.parser import parse

In [2]:
# load json file of tweets from twitter crawler into list of dicts
def load_json(file_name):
    result=[]
    with open(file_name, 'r') as fp:
        line=fp.readline()
        while line:
            tjson=json.loads(line) #decode json
            result.append({
                "Date":tjson["created_at"],
                "id":tjson["id_str"],
                "user_name":tjson["user"]["name"], 
                "favorite_count":tjson["favorite_count"], #number of favorites
                "retweet_count":tjson["retweet_count"], #get number of retweets
                "user_id_str":tjson["user"]["id_str"], #get user id_str
                "text":tjson["text"]
            })

            line=fp.readline()
    return result

# load tweets from getoldtweets3 into list of dicts
def load_old_json(file_name):
    result=[]
    with open(file_name, 'r') as fp:
        line=fp.readline()
        while line:
            tjson=json.loads(line) #decode json
            result.append({
                "Date":tjson["Date"],
                "id":tjson["id"],
                "favorite_count":tjson["favorite_count"], #number of favorites
                "retweet_count":tjson["retweet_count"], #number of retweets
                "text":tjson["text"]
            })
            line=fp.readline()
    return result


In [3]:
# load tweet data
#data_list=load_json("tweets.json")

#data_list=load_old_json("old_tweets.json")
filename="./../Data/Gatherd Data/old_tweets.json"
data_list=load_old_json(filename)
# create a df
df=pd.DataFrame(data=data_list)
print('done')

#converting date into more readable form 
Date_list= df["Date"].values
dates = []
for date in Date_list:
    dates.append(parse(date).date())
df["Date"]=dates
print(df)

done
              Date                   id  favorite_count  retweet_count  \
0       2020-01-22  1220134040705142789              18             17   
1       2020-01-22  1220134028906463235               0              1   
2       2020-01-22  1220133978046259200               3              2   
3       2020-01-22  1220133958094151680               0              0   
4       2020-01-22  1220133790346956800               3              0   
...            ...                  ...             ...            ...   
273651  2020-03-31  1244776796500955136               1              1   
273652  2020-03-31  1244776600308195328               0              0   
273653  2020-03-31  1244776563297488896               6              1   
273654  2020-03-31  1244776510357151744               1              2   
273655  2020-03-31  1244776509652586499               0              0   

                                                     text  
0       AGW GREEN WARRIORS CENSOR 1st Amendmen

In [4]:
### Sentiment Analysis using an VADER sentimentIntensityAnalyzer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 

SIA = SentimentIntensityAnalyzer()
#list of sentiments
sentiments = []
#on the basis of each text in the dataframe 
for t in df["text"].values:
    sentiment_dict = SIA.polarity_scores(t)
    # add compound sentiment score to list 
    sentiments.append(sentiment_dict['compound']) 
# datafram column for sentiment using sentiment scores      
df["sentiment"]= sentiments

#make a csv file of sentiment data (for testing purposes)
df.to_csv("./../Data/Processed Data/data_with_sent.csv", index=True)
print("done")

In [5]:

# groupby date
group=df.groupby(by=["Date"])
# find aggregate of tweet id, favorite count, retweet coint and sentiment 
new_df=group.aggregate({
    "id":["count"],
    "favorite_count":["sum"],
    "retweet_count":["sum"],
    "sentiment":["mean"]
})
# rename id_str to tweet_count 
new_df.rename(columns={"id_str":"tweet_count"}, inplace=True)

print(new_df)
# new csv file for old_climate data
new_df.to_csv("./../Data/Processed Data/old_climate_data.csv", index=True)
print("done")

              id favorite_count retweet_count sentiment
           count            sum           sum      mean
Date                                                   
2020-01-22  9504          49563         20441  0.233904
2020-01-23  6427          43605         16548  0.073502
2020-01-24  5840          46585         16407  0.091441
2020-01-25  4194          39124         14920  0.080905
2020-01-26  3398          16882          7865  0.064814
...          ...            ...           ...       ...
2020-03-29  1585           7182          4709  0.000616
2020-03-30  2216           9481          4514  0.047642
2020-03-31  2540          14653          7291  0.012201
2020-04-01  2470          12600          5783  0.041370
2020-04-02  2766          11223          5240  0.138431

[72 rows x 4 columns]
done


In [6]:
# df for cornona cases
df_corona=pd.read_csv("./../Data/Processed Data/corona_data.csv")

#cut out first day becasue data is irregular and remove excess data at end
df_corona = df_corona.iloc[1:72]

#rename unnamed columns to Date
df_corona.rename( columns={'Unnamed: 0':'Date'}, inplace=True )

# convert strings to dates 
Date_list= df_corona["Date"].values
dates = []
for date in Date_list:
    dates.append(parse(date).date())
df_corona["Date"]=dates

print(df_corona)

          Date  total_cases  total_deaths  log_total_cases  log_total_deaths
1   2020-01-23          654            18         2.815578          1.255273
2   2020-01-24          941            26         2.973590          1.414973
3   2020-01-25         1434            42         3.156549          1.623249
4   2020-01-26         2118            56         3.325926          1.748188
5   2020-01-27         2927            82         3.466423          1.913814
..         ...          ...           ...              ...               ...
67  2020-03-29       720140         33925         5.857417          4.530520
68  2020-03-30       782389         37582         5.893423          4.574980
69  2020-03-31       857487         42107         5.933228          4.624354
70  2020-04-01       932605         47180         5.969698          4.673758
71  2020-04-02      1013466         52983         6.005809          4.724137

[71 rows x 5 columns]


In [14]:
# df for climate change tweets

df_c_Change=pd.read_csv("./../Data/Processed Data/old_climate_data.csv")
# remove unwanted label cols
df2=df_c_Change.drop([0, 1])
# drop first day to match corona df
df2 = df2.iloc[1:]
# reset indexes
df2.reset_index(drop=True, inplace=True)
# fix date col name
df2.rename( columns={'Unnamed: 0':'Date'}, inplace=True )
# convert strings to dates
Date_list= df2["Date"].values
dates = []
for date in Date_list:
    dates.append(parse(date).date())
df2["Date"]=dates

# rename misnamed columns
df2.rename( columns={'id':'tweet_count'}, inplace=True )

# Merge both dataframes into result dataframe
result = df_corona.merge(df2, on='Date')
print(result)
result.to_csv("./../Data/Processed Data/cleaned_data.csv", index=False)

          Date  total_cases  total_deaths  log_total_cases  log_total_deaths  \
0   2020-01-23          654            18         2.815578          1.255273   
1   2020-01-24          941            26         2.973590          1.414973   
2   2020-01-25         1434            42         3.156549          1.623249   
3   2020-01-26         2118            56         3.325926          1.748188   
4   2020-01-27         2927            82         3.466423          1.913814   
..         ...          ...           ...              ...               ...   
66  2020-03-29       720140         33925         5.857417          4.530520   
67  2020-03-30       782389         37582         5.893423          4.574980   
68  2020-03-31       857487         42107         5.933228          4.624354   
69  2020-04-01       932605         47180         5.969698          4.673758   
70  2020-04-02      1013466         52983         6.005809          4.724137   

   tweet_count favorite_count retweet_c