In [1]:
# import packages
import pandas as pd
import statsmodels
import statsmodels.api as sm
from statsmodels.formula.api import ols
import tweepy
import json
import os
from collections import Counter
import numpy as np
from collections import defaultdict
from matplotlib import pyplot as plt
import time
from datetime import datetime
from dateutil.parser import parse

In [5]:
# load json file of tweets from twitter crawler into list of dicts
def load_json(file_name):
    """Load crawler tweets from a JSON-lines file into a list of dicts.

    Each non-blank line of ``file_name`` is decoded as one JSON object and
    reduced to the fields used by the analysis.

    Parameters
    ----------
    file_name : str or path-like
        Path to a file holding one JSON-encoded tweet per line.

    Returns
    -------
    list of dict
        One dict per tweet with the keys ``Date``, ``id``, ``user_name``,
        ``favorite_count``, ``retweet_count``, ``user_id_str`` and ``text``,
        in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    KeyError
        If a decoded tweet lacks one of the fields read below.
    """
    result = []
    # JSON text is UTF-8; being explicit keeps non-ASCII tweet text intact on
    # platforms whose default encoding is something else.
    with open(file_name, 'r', encoding='utf-8') as fp:
        for line in fp:
            # Blank / whitespace-only lines (e.g. a stray separator or trailing
            # newline) are not JSON documents and would make json.loads raise.
            if not line.strip():
                continue
            tjson = json.loads(line)  # decode json
            user = tjson["user"]
            result.append({
                "Date": tjson["created_at"],
                "id": tjson["id_str"],
                "user_name": user["name"],
                "favorite_count": tjson["favorite_count"],  # number of favorites
                "retweet_count": tjson["retweet_count"],  # number of retweets
                "user_id_str": user["id_str"],  # user id_str
                "text": tjson["text"]
            })
    return result

# load tweets from getoldtweets3 into list of dicts
def load_old_json(file_name):
    """Load previously exported tweets from a JSON-lines file into a list of dicts.

    Each non-blank line of ``file_name`` is decoded as one JSON object and
    reduced to the fields used by the analysis.

    Parameters
    ----------
    file_name : str or path-like
        Path to a file holding one JSON-encoded tweet per line.

    Returns
    -------
    list of dict
        One dict per tweet with the keys ``Date``, ``id``, ``favorite_count``,
        ``retweet_count`` and ``text``, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    KeyError
        If a decoded tweet lacks one of the fields read below.
    """
    # Fields copied verbatim from each record, in output column order.
    wanted = ("Date", "id", "favorite_count", "retweet_count", "text")
    result = []
    # JSON text is UTF-8; being explicit keeps non-ASCII tweet text intact on
    # platforms whose default encoding is something else.
    with open(file_name, 'r', encoding='utf-8') as fp:
        for line in fp:
            # Blank / whitespace-only lines are not JSON documents and would
            # make json.loads raise.
            if not line.strip():
                continue
            tjson = json.loads(line)  # decode json
            result.append({key: tjson[key] for key in wanted})
    return result


In [10]:
# Read the exported tweets and wrap them in a DataFrame (one row per tweet).
# The crawler export can be used instead by switching to the line below.
#data_list=load_json("tweets.json")
data_list = load_old_json("old_tweets.json")
df = pd.DataFrame(data=data_list)
print('done')

# Replace each raw timestamp string with its calendar date so that tweets can
# be grouped per day further down.
Date_list = df["Date"].values
dates = [parse(raw_stamp).date() for raw_stamp in Date_list]
df["Date"] = dates
print(df)

done
              Date                   id  favorite_count  retweet_count  \
0       2020-01-22  1220134040705142789              18             17   
1       2020-01-22  1220134028906463235               0              1   
2       2020-01-22  1220133978046259200               3              2   
3       2020-01-22  1220133958094151680               0              0   
4       2020-01-22  1220133790346956800               3              0   
...            ...                  ...             ...            ...   
273651  2020-03-31  1244776796500955136               1              1   
273652  2020-03-31  1244776600308195328               0              0   
273653  2020-03-31  1244776563297488896               6              1   
273654  2020-03-31  1244776510357151744               1              2   
273655  2020-03-31  1244776509652586499               0              0   

                                                     text  
0       AGW GREEN WARRIORS CENSOR 1st Amendmen

In [12]:
### Sentiment analysis using the VADER SentimentIntensityAnalyzer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

SIA = SentimentIntensityAnalyzer()

# Score every tweet text and keep only the 'compound' entry of the score dict.
sentiments = [
    SIA.polarity_scores(tweet_text)['compound']
    for tweet_text in df["text"].values
]

# Store the scores alongside the tweets as a new DataFrame column.
df["sentiment"] = sentiments

# Write the scored tweets to a CSV file (for testing purposes).
df.to_csv("data_with_sent.csv", index=True)


In [13]:

# Collapse the per-tweet rows into one row per calendar date.
group = df.groupby(by=["Date"])

# Per day: number of tweets (count of ids), total favorites, total retweets
# and the summed sentiment score. Each aggregation is given as a list, so the
# resulting columns are two-level: (source column, aggregation name).
new_df = group.aggregate({
    "id": ["count"],
    "favorite_count": ["sum"],
    "retweet_count": ["sum"],
    # Use the string "sum" rather than the builtin `sum`: pandas >= 2.1 warns
    # that builtin callables passed to agg will stop being mapped to the
    # optimized groupby sum. The result and the column label are unchanged.
    "sentiment": ["sum"]
})

# NOTE(review): the aggregated frame has no "id_str" label (the count column
# is labelled "id"), so this rename currently changes nothing. Renaming "id"
# instead would alter the columns of new_df and of the CSV written below, which
# other code may read -- confirm the intended name before changing it.
new_df.rename(columns={"id_str":"tweet_count"}, inplace=True)

print(new_df)
# Write the daily aggregates to a CSV file, keeping the Date index.
new_df.to_csv("old_climate_data.csv", index=True)


              id favorite_count retweet_count  sentiment
           count            sum           sum        sum
Date                                                    
2020-01-22  9504          49563         20441  2223.0193
2020-01-23  6427          43605         16548   472.3989
2020-01-24  5840          46585         16407   534.0176
2020-01-25  4194          39124         14920   339.3161
2020-01-26  3398          16882          7865   220.2384
...          ...            ...           ...        ...
2020-03-29  1585           7182          4709     0.9771
2020-03-30  2216           9481          4514   105.5750
2020-03-31  2540          14653          7291    30.9918
2020-04-01  2470          12600          5783   102.1828
2020-04-02  2766          11223          5240   382.9010

[72 rows x 4 columns]
