# Frequency distribution of Retweets and Likes for each month until July 2020

We need the following packages:

In [1]:
import pandas as pd
import datetime
from dateutil.parser import parse
import json
import itertools
import altair as alt
import csv
from vega_datasets import data
import pytz
# UTC tzinfo object; later cells call utc.localize(...) to build timezone-aware datetimes.
utc=pytz.UTC

The tweets are stored in a file with one JSON object per line, so we read it line by line and parse each line separately:

In [2]:
# Load the tweet dataset. The file is read one line at a time and every line
# is parsed as its own JSON document; the parsed objects are collected in `data`.
data = []
# The encoding is pinned to UTF-8 (the standard JSON encoding) so that the
# result does not depend on the platform's default locale encoding.
with open('dataset/general_result_translated_full.json', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # json.loads('') raises, so tolerate blank or trailing empty lines.
            continue
        data.append(json.loads(line))

## January 2020

We're interested in the "retweet_count" and "favorite_count" fields:

In [3]:
# Select the tweets created in January 2020 and collect, for each of them,
# the creation day (as 'YYYY-MM-DD'), the retweet count and the like count.
#
# The window is half-open: [Jan 1 00:00 UTC, Feb 1 00:00 UTC). An inclusive
# upper bound of "Jan 31 00:00" would silently drop every tweet posted during
# January 31 after midnight.
# NOTE(review): the comparison below requires `created_at` to parse to a
# timezone-aware datetime - confirm against the dataset.
start = utc.localize(datetime.datetime(2020, 1, 1))
end = utc.localize(datetime.datetime(2020, 2, 1))

# Each list starts with a zero-count placeholder row dated 2020-01-14, so the
# three parallel lists are never empty even if no tweet falls in the window.
dates = ["2020-01-14"]
retweets = [0]
likes = [0]

for element in data:
    d = parse(element['created_at'])

    if start <= d < end:
        dates.append(d.strftime('%Y-%m-%d'))
        retweets.append(element['retweet_count'])
        likes.append(element['favorite_count'])

# Calendar of every day in January, both endpoints included (the `+ 1` keeps
# January 31 in the list). Intended for filling the empty dates with 0.
# NOTE(review): `date_generated` is not consumed by the cells visible here.
start = datetime.datetime.strptime("2020-01-01", "%Y-%m-%d")
end = datetime.datetime.strptime("2020-01-31", "%Y-%m-%d")
date_generated = [start + datetime.timedelta(days=x) for x in range((end - start).days + 1)]

We create the DataFrames that will be used to draw the charts: one for the Likes count, one for the Retweets count, and a combined one (sorted by date) that is used for the normalisation:

In [4]:
# Build the three January frames from the parallel lists `dates`, `likes`
# and `retweets`. In each frame the 'Dates' strings are converted to
# datetimes. Only the combined frame is sorted chronologically; the two
# single-metric frames keep the original row order.

df_likes_jan = pd.DataFrame({'Dates': dates, 'Likes': likes}).assign(
    Dates=lambda frame: pd.to_datetime(frame['Dates'])
)

df_retweets_jan = pd.DataFrame({'Dates': dates, 'Retweets': retweets}).assign(
    Dates=lambda frame: pd.to_datetime(frame['Dates'])
)

df_jan = (
    pd.DataFrame({'Dates': dates, 'Likes': likes, 'Retweets': retweets})
    .assign(Dates=lambda frame: pd.to_datetime(frame['Dates']))
    .sort_values(by='Dates')
)

We normalise the Likes and Retweets counts with Min-Max normalisation, which rescales each column to the range [0, 1].

In [5]:
# Likes and Retweets normalization.
# https://www.geeksforgeeks.org/normalize-a-column-in-pandas/

# Min-max normalisation of the January counts: each of the two columns is
# rescaled independently as (x - min) / (max - min).
# https://www.geeksforgeeks.org/normalize-a-column-in-pandas/
count_columns = ['Likes', 'Retweets']

df_norm_jan = df_jan.copy()
column_min = df_norm_jan[count_columns].min()
column_range = df_norm_jan[count_columns].max() - column_min
# A constant column has a range of 0, and dividing by it would turn the whole
# column into NaN. Dividing by 1 instead leaves such a column at 0.
column_range = column_range.replace(0, 1)
df_norm_jan[count_columns] = (df_norm_jan[count_columns] - column_min) / column_range

# Copy the normalised values into the per-metric frames used for plotting.
# pandas aligns these assignments on index labels, not on row position.
df_likes_jan['Likes'] = df_norm_jan['Likes']
df_retweets_jan['Retweets'] = df_norm_jan['Retweets']

The line chart is created:

In [6]:
# Turn off Altair's maximum-rows check before building the charts.
alt.data_transformers.disable_max_rows()


def _january_line_chart(frame, value_column):
    """Return a 300x200 line chart of `value_column` against the month/day of 'Dates'.

    The x axis is titled 'Dates'; the y axis uses `value_column` as given.
    """
    x_axis = alt.X("monthdate(Dates):T", title='Dates')
    chart = alt.Chart(frame).mark_line().encode(x=x_axis, y=value_column)
    return chart.properties(width=300, height=200)


c1_jan = _january_line_chart(df_likes_jan, "Likes")
c2_jan = _january_line_chart(df_retweets_jan, "Retweets")

# Likes chart on the left, Retweets chart on the right.
alt.hconcat(c1_jan, c2_jan)

For each month until July 2020, we normalised the counts of Retweets and Likes over the whole month and plotted the normalised counts for both false and partially false tweets.

## February 2020

## March 2020

## April 2020

## May 2020

## June 2020

## July 2020