# Frequency distribution of False-Partially False Retweets and Likes for each month until July 2020

We need the following packages:

In [1]:
# Dependencies for the whole notebook: data loading, date handling and plotting.
import pandas as pd
import datetime
from dateutil.parser import parse
import json
import itertools
import altair as alt
import csv
# NOTE(review): the name `data` is rebound to the list of tweets when the
# datasets are loaded below, so this vega_datasets object ends up shadowed.
from vega_datasets import data
import pytz
# Shared UTC timezone object, used to build timezone-aware month boundaries.
utc=pytz.UTC

We load the CSV file with the filtered tweet ids and the JSON file (one tweet per line) that contains all the tweets:

In [2]:
# Table of the filtered tweets. The ids are cast to strings so that they can be
# matched against the 'id_str' field of the tweet objects.
csv_dataframe = pd.read_csv('dataset/FINAL_fakecovid_final_filtered_dataset_clean.csv', sep=";")
csv_dataframe['tweet_id'] = csv_dataframe['tweet_id'].astype(str)

# Flatten the table row by row into a single list:
# [row0 col0, row0 col1, ..., row1 col0, row1 col1, ...].
csv_list = csv_dataframe.values.tolist()
lista_unica_csv = list(itertools.chain.from_iterable(csv_list))

# The tweets file holds one JSON document per line. The encoding is stated
# explicitly because the default of open() depends on the platform locale, and
# blank lines are skipped because json.loads() rejects empty input.
with open('dataset/fakecovid_result_final_translated_full.json', 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

## January 2020

We're interested in the "retweet_count" and "favorite_count" fields:

In [3]:
# Half-open window [start, end) covering the whole of January 2020 in UTC.
# `end` is the first instant of the following month, so tweets posted during
# the last day of the month are counted too (an inclusive bound at midnight of
# 31 January would drop almost all of that day).
start = utc.localize(datetime.datetime(2020, 1, 1))
end = utc.localize(datetime.datetime(2020, 2, 1))

# Parallel lists with one entry per (day, category) pair, seeded with a zero
# row for the first day of the month.
dates = ["2020-01-01"]
retweets = [0]
likes = [0]
category = ["partially false"]
# Maps "<YYYY-MM-DD> <category>" to its position in the lists above.
mix = {"2020-01-01 partially false": 0}

for tweet in data:
    # The bounds are timezone-aware, so the parsed value must be too
    # (assumes 'created_at' carries a UTC offset -- TODO confirm).
    d = parse(tweet['created_at'])
    if not (start <= d < end):
        continue

    # The label is the value stored right after the tweet id in the flattened
    # CSV rows (presumably the category column -- verify against the CSV
    # layout). list.index() is a linear scan, so it only runs for tweets that
    # fall inside the window.
    indice_csv = lista_unica_csv.index(tweet['id_str'])
    label = lista_unica_csv[indice_csv + 1].lower()

    d = d.strftime('%Y-%m-%d')
    a = d + " " + label
    if a in mix:
        # Pair already seen: accumulate into its existing slot.
        i = mix[a]
        retweets[i] += tweet['retweet_count']
        likes[i] += tweet['favorite_count']
    else:
        # First tweet for this (day, category) pair: open a new slot.
        mix[a] = len(dates)
        dates.append(d)
        retweets.append(tweet['retweet_count'])
        likes.append(tweet['favorite_count'])
        category.append(label)

We create the DataFrames used to draw the chart: one for the Retweets count and one for the Likes count.

In [4]:
# Frame behind the Likes line: one row per (day, category) pair, with the
# category label suffixed so it gets its own legend entry.
cl = list(category)
cl_cat = [label + " likes" for label in cl]
df_likes = (
    pd.DataFrame({'Dates': dates, 'Likes': likes, 'Category': cl_cat})
    .assign(Dates=lambda frame: pd.to_datetime(frame['Dates']))
)

# Frame behind the Retweets line, built the same way.
cr = list(category)
cr_cat = [label + " retweets" for label in cr]
df_retweets = (
    pd.DataFrame({'Dates': dates, 'Retweets': retweets, 'Category': cr_cat})
    .assign(Dates=lambda frame: pd.to_datetime(frame['Dates']))
)

# Combined frame with both counts and the plain category label, ordered by day.
df = (
    pd.DataFrame({
        'Dates': dates,
        'Likes': likes,
        'Retweets': retweets,
        'Category': category,
    })
    .assign(Dates=lambda frame: pd.to_datetime(frame['Dates']))
    .sort_values(by=['Dates'], ascending=True)
)

We normalise the Likes and Retweets counts to the range [0, 1] using min-max normalisation.

In [5]:
# Normalise Likes and Retweets (min-max, each column on its own scale).
# https://www.geeksforgeeks.org/normalize-a-column-in-pandas/

df_norm = df.copy()
counts = df_norm[['Likes', 'Retweets']]
lowest = counts.min()
span = counts.max() - lowest
# A constant column has a span of 0 and dividing by it would yield NaN, so the
# divisor is replaced by 1 there, which maps the whole column to 0.
df_norm[['Likes', 'Retweets']] = (counts - lowest) / span.replace(0, 1)
# Assignment aligns on the index, so the rows still match even though `df`
# was sorted by date.
df_likes['Likes'] = df_norm['Likes']
df_retweets['Retweets'] = df_norm['Retweets']

The line chart is created:

In [6]:
# Retweets layer: one line (with point markers) per category.
c = (
    alt.Chart(df_retweets)
    .mark_line(point=True)
    .encode(
        x=alt.X('monthdate(Dates):T', title='January 2020'),
        y=alt.Y('Retweets', title='Normalised count'),
        color="Category",
    )
    .properties(width=700, height=500)
)

# Likes layer, drawn with the same axes and size.
c1 = (
    alt.Chart(df_likes)
    .mark_line(point=True)
    .encode(
        x=alt.X('monthdate(Dates):T', title='January 2020'),
        y=alt.Y('Likes', title='Normalised count'),
        color="Category",
    )
    .properties(width=700, height=500)
)

# Overlay both layers; the last expression is what the cell displays.
c2 = c + c1
c2

For each month until July 2020, we normalise the counts of Retweets and Likes over the whole month and plot the normalised counts for both the false and the partially false categories.

## February 2020

In [7]:
# Half-open window [start, end) covering the whole of February 2020 in UTC.
# `end` is the first instant of the following month, so tweets posted during
# the last day of the month are counted too.
start = utc.localize(datetime.datetime(2020, 2, 1))
end = utc.localize(datetime.datetime(2020, 3, 1))

# Parallel lists with one entry per (day, category) pair, seeded with a zero
# row for the first day of the month.
dates = ["2020-02-01"]
retweets = [0]
likes = [0]
category = ["partially false"]
# Maps "<YYYY-MM-DD> <category>" to its position in the lists above.
mix = {"2020-02-01 partially false": 0}

for tweet in data:
    # The bounds are timezone-aware, so the parsed value must be too
    # (assumes 'created_at' carries a UTC offset -- TODO confirm).
    d = parse(tweet['created_at'])
    if not (start <= d < end):
        continue

    # The label is the value stored right after the tweet id in the flattened
    # CSV rows (presumably the category column -- verify against the CSV
    # layout). The linear scan only runs for tweets inside the window.
    indice_csv = lista_unica_csv.index(tweet['id_str'])
    label = lista_unica_csv[indice_csv + 1].lower()

    d = d.strftime('%Y-%m-%d')
    a = d + " " + label
    if a in mix:
        i = mix[a]
        retweets[i] += tweet['retweet_count']
        likes[i] += tweet['favorite_count']
    else:
        mix[a] = len(dates)
        dates.append(d)
        retweets.append(tweet['retweet_count'])
        likes.append(tweet['favorite_count'])
        category.append(label)

# One frame per plotted line; the category label is suffixed so that likes and
# retweets get separate legend entries.
df_likes = pd.DataFrame({
    'Dates': dates,
    'Likes': likes,
    'Category': [label + " likes" for label in category],
})
df_likes['Dates'] = pd.to_datetime(df_likes['Dates'])

df_retweets = pd.DataFrame({
    'Dates': dates,
    'Retweets': retweets,
    'Category': [label + " retweets" for label in category],
})
df_retweets['Dates'] = pd.to_datetime(df_retweets['Dates'])

# Combined frame with both counts, ordered by day.
df = pd.DataFrame({
    'Dates': dates,
    'Likes': likes,
    'Retweets': retweets,
    'Category': category,
})
df['Dates'] = pd.to_datetime(df['Dates'])
df = df.sort_values(by=['Dates'], ascending=True)

# Min-max normalisation, each column on its own scale. A constant column has a
# span of 0; its divisor is replaced by 1 so the column maps to 0 instead of NaN.
df_norm = df.copy()
counts = df_norm[['Likes', 'Retweets']]
lowest = counts.min()
span = counts.max() - lowest
df_norm[['Likes', 'Retweets']] = (counts - lowest) / span.replace(0, 1)
# Assignment aligns on the index, so rows still match after the sort above.
df_likes['Likes'] = df_norm['Likes']
df_retweets['Retweets'] = df_norm['Retweets']

# Both layers use the same size: layers of one chart should agree on
# width/height (newer Altair versions appear to reject mismatches -- confirm).
c = alt.Chart(df_retweets).mark_line(point=True).encode(
    alt.X('monthdate(Dates):T', title='February 2020'),
    alt.Y('Retweets', title='Normalised count'),
    color="Category"
).properties(
    width=700,
    height=500
)

c1 = alt.Chart(df_likes).mark_line(point=True).encode(
    alt.X('monthdate(Dates):T', title='February 2020'),
    alt.Y('Likes', title='Normalised count'),
    color="Category"
).properties(
    width=700,
    height=500
)

c2 = alt.layer(c, c1)
c2

## March 2020

In [8]:
# Half-open window [start, end) covering the whole of March 2020 in UTC.
# `end` is the first instant of the following month, so tweets posted during
# the last day of the month are counted too.
start = utc.localize(datetime.datetime(2020, 3, 1))
end = utc.localize(datetime.datetime(2020, 4, 1))

# Parallel lists with one entry per (day, category) pair, seeded with a zero
# row for the first day of the month.
dates = ["2020-03-01"]
retweets = [0]
likes = [0]
category = ["partially false"]
# Maps "<YYYY-MM-DD> <category>" to its position in the lists above.
mix = {"2020-03-01 partially false": 0}

for tweet in data:
    # The bounds are timezone-aware, so the parsed value must be too
    # (assumes 'created_at' carries a UTC offset -- TODO confirm).
    d = parse(tweet['created_at'])
    if not (start <= d < end):
        continue

    # The label is the value stored right after the tweet id in the flattened
    # CSV rows (presumably the category column -- verify against the CSV
    # layout). The linear scan only runs for tweets inside the window.
    indice_csv = lista_unica_csv.index(tweet['id_str'])
    label = lista_unica_csv[indice_csv + 1].lower()

    d = d.strftime('%Y-%m-%d')
    a = d + " " + label
    if a in mix:
        i = mix[a]
        retweets[i] += tweet['retweet_count']
        likes[i] += tweet['favorite_count']
    else:
        mix[a] = len(dates)
        dates.append(d)
        retweets.append(tweet['retweet_count'])
        likes.append(tweet['favorite_count'])
        category.append(label)

# One frame per plotted line; the category label is suffixed so that likes and
# retweets get separate legend entries.
df_likes = pd.DataFrame({
    'Dates': dates,
    'Likes': likes,
    'Category': [label + " likes" for label in category],
})
df_likes['Dates'] = pd.to_datetime(df_likes['Dates'])

df_retweets = pd.DataFrame({
    'Dates': dates,
    'Retweets': retweets,
    'Category': [label + " retweets" for label in category],
})
df_retweets['Dates'] = pd.to_datetime(df_retweets['Dates'])

# Combined frame with both counts, ordered by day.
df = pd.DataFrame({
    'Dates': dates,
    'Likes': likes,
    'Retweets': retweets,
    'Category': category,
})
df['Dates'] = pd.to_datetime(df['Dates'])
df = df.sort_values(by=['Dates'], ascending=True)

# Min-max normalisation, each column on its own scale. A constant column has a
# span of 0; its divisor is replaced by 1 so the column maps to 0 instead of NaN.
df_norm = df.copy()
counts = df_norm[['Likes', 'Retweets']]
lowest = counts.min()
span = counts.max() - lowest
df_norm[['Likes', 'Retweets']] = (counts - lowest) / span.replace(0, 1)
# Assignment aligns on the index, so rows still match after the sort above.
df_likes['Likes'] = df_norm['Likes']
df_retweets['Retweets'] = df_norm['Retweets']

# Both layers use the same size: layers of one chart should agree on
# width/height (newer Altair versions appear to reject mismatches -- confirm).
c = alt.Chart(df_retweets).mark_line(point=True).encode(
    alt.X('monthdate(Dates):T', title='March 2020'),
    alt.Y('Retweets', title='Normalised count'),
    color="Category"
).properties(
    width=700,
    height=500
)

c1 = alt.Chart(df_likes).mark_line(point=True).encode(
    alt.X('monthdate(Dates):T', title='March 2020'),
    alt.Y('Likes', title='Normalised count'),
    color="Category"
).properties(
    width=700,
    height=500
)

c2 = alt.layer(c, c1)
c2

## April 2020

In [9]:
# Half-open window [start, end) covering the whole of April 2020 in UTC.
# `end` is the first instant of the following month, so tweets posted during
# the last day of the month are counted too.
start = utc.localize(datetime.datetime(2020, 4, 1))
end = utc.localize(datetime.datetime(2020, 5, 1))

# Parallel lists with one entry per (day, category) pair, seeded with a zero
# row for the first day of the month.
dates = ["2020-04-01"]
retweets = [0]
likes = [0]
category = ["partially false"]
# Maps "<YYYY-MM-DD> <category>" to its position in the lists above.
mix = {"2020-04-01 partially false": 0}

for tweet in data:
    # The bounds are timezone-aware, so the parsed value must be too
    # (assumes 'created_at' carries a UTC offset -- TODO confirm).
    d = parse(tweet['created_at'])
    if not (start <= d < end):
        continue

    # The label is the value stored right after the tweet id in the flattened
    # CSV rows (presumably the category column -- verify against the CSV
    # layout). The linear scan only runs for tweets inside the window.
    indice_csv = lista_unica_csv.index(tweet['id_str'])
    label = lista_unica_csv[indice_csv + 1].lower()

    d = d.strftime('%Y-%m-%d')
    a = d + " " + label
    if a in mix:
        i = mix[a]
        retweets[i] += tweet['retweet_count']
        likes[i] += tweet['favorite_count']
    else:
        mix[a] = len(dates)
        dates.append(d)
        retweets.append(tweet['retweet_count'])
        likes.append(tweet['favorite_count'])
        category.append(label)

# One frame per plotted line; the category label is suffixed so that likes and
# retweets get separate legend entries.
df_likes = pd.DataFrame({
    'Dates': dates,
    'Likes': likes,
    'Category': [label + " likes" for label in category],
})
df_likes['Dates'] = pd.to_datetime(df_likes['Dates'])

df_retweets = pd.DataFrame({
    'Dates': dates,
    'Retweets': retweets,
    'Category': [label + " retweets" for label in category],
})
df_retweets['Dates'] = pd.to_datetime(df_retweets['Dates'])

# Combined frame with both counts, ordered by day.
df = pd.DataFrame({
    'Dates': dates,
    'Likes': likes,
    'Retweets': retweets,
    'Category': category,
})
df['Dates'] = pd.to_datetime(df['Dates'])
df = df.sort_values(by=['Dates'], ascending=True)

# Min-max normalisation, each column on its own scale. A constant column has a
# span of 0; its divisor is replaced by 1 so the column maps to 0 instead of NaN.
df_norm = df.copy()
counts = df_norm[['Likes', 'Retweets']]
lowest = counts.min()
span = counts.max() - lowest
df_norm[['Likes', 'Retweets']] = (counts - lowest) / span.replace(0, 1)
# Assignment aligns on the index, so rows still match after the sort above.
df_likes['Likes'] = df_norm['Likes']
df_retweets['Retweets'] = df_norm['Retweets']

# Both layers use the same size: layers of one chart should agree on
# width/height (newer Altair versions appear to reject mismatches -- confirm).
c = alt.Chart(df_retweets).mark_line(point=True).encode(
    alt.X('monthdate(Dates):T', title='April 2020'),
    alt.Y('Retweets', title='Normalised count'),
    color="Category"
).properties(
    width=700,
    height=500
)

c1 = alt.Chart(df_likes).mark_line(point=True).encode(
    alt.X('monthdate(Dates):T', title='April 2020'),
    alt.Y('Likes', title='Normalised count'),
    color="Category"
).properties(
    width=700,
    height=500
)

c2 = alt.layer(c, c1)
c2

## May 2020

In [10]:
# Half-open window [start, end) covering the whole of May 2020 in UTC.
# `end` is the first instant of the following month, so tweets posted during
# the last day of the month are counted too.
start = utc.localize(datetime.datetime(2020, 5, 1))
end = utc.localize(datetime.datetime(2020, 6, 1))

# Parallel lists with one entry per (day, category) pair, seeded with a zero
# row for the first day of the month.
dates = ["2020-05-01"]
retweets = [0]
likes = [0]
category = ["partially false"]
# Maps "<YYYY-MM-DD> <category>" to its position in the lists above.
mix = {"2020-05-01 partially false": 0}

for tweet in data:
    # The bounds are timezone-aware, so the parsed value must be too
    # (assumes 'created_at' carries a UTC offset -- TODO confirm).
    d = parse(tweet['created_at'])
    if not (start <= d < end):
        continue

    # The label is the value stored right after the tweet id in the flattened
    # CSV rows (presumably the category column -- verify against the CSV
    # layout). The linear scan only runs for tweets inside the window.
    indice_csv = lista_unica_csv.index(tweet['id_str'])
    label = lista_unica_csv[indice_csv + 1].lower()

    d = d.strftime('%Y-%m-%d')
    a = d + " " + label
    if a in mix:
        i = mix[a]
        retweets[i] += tweet['retweet_count']
        likes[i] += tweet['favorite_count']
    else:
        mix[a] = len(dates)
        dates.append(d)
        retweets.append(tweet['retweet_count'])
        likes.append(tweet['favorite_count'])
        category.append(label)

# One frame per plotted line; the category label is suffixed so that likes and
# retweets get separate legend entries.
df_likes = pd.DataFrame({
    'Dates': dates,
    'Likes': likes,
    'Category': [label + " likes" for label in category],
})
df_likes['Dates'] = pd.to_datetime(df_likes['Dates'])

df_retweets = pd.DataFrame({
    'Dates': dates,
    'Retweets': retweets,
    'Category': [label + " retweets" for label in category],
})
df_retweets['Dates'] = pd.to_datetime(df_retweets['Dates'])

# Combined frame with both counts, ordered by day.
df = pd.DataFrame({
    'Dates': dates,
    'Likes': likes,
    'Retweets': retweets,
    'Category': category,
})
df['Dates'] = pd.to_datetime(df['Dates'])
df = df.sort_values(by=['Dates'], ascending=True)

# Min-max normalisation, each column on its own scale. A constant column has a
# span of 0; its divisor is replaced by 1 so the column maps to 0 instead of NaN.
df_norm = df.copy()
counts = df_norm[['Likes', 'Retweets']]
lowest = counts.min()
span = counts.max() - lowest
df_norm[['Likes', 'Retweets']] = (counts - lowest) / span.replace(0, 1)
# Assignment aligns on the index, so rows still match after the sort above.
df_likes['Likes'] = df_norm['Likes']
df_retweets['Retweets'] = df_norm['Retweets']

# Both layers use the same size: layers of one chart should agree on
# width/height (newer Altair versions appear to reject mismatches -- confirm).
c = alt.Chart(df_retweets).mark_line(point=True).encode(
    alt.X('monthdate(Dates):T', title='May 2020'),
    alt.Y('Retweets', title='Normalised count'),
    color="Category"
).properties(
    width=700,
    height=500
)

c1 = alt.Chart(df_likes).mark_line(point=True).encode(
    alt.X('monthdate(Dates):T', title='May 2020'),
    alt.Y('Likes', title='Normalised count'),
    color="Category"
).properties(
    width=700,
    height=500
)

c2 = alt.layer(c, c1)
c2

## June 2020

In [11]:
# Half-open window [start, end) covering the whole of June 2020 in UTC.
# `end` is the first instant of the following month, so tweets posted during
# the last day of the month are counted too.
start = utc.localize(datetime.datetime(2020, 6, 1))
end = utc.localize(datetime.datetime(2020, 7, 1))

# Parallel lists with one entry per (day, category) pair, seeded with a zero
# row for the first day of the month.
dates = ["2020-06-01"]
retweets = [0]
likes = [0]
category = ["partially false"]
# Maps "<YYYY-MM-DD> <category>" to its position in the lists above.
mix = {"2020-06-01 partially false": 0}

for tweet in data:
    # The bounds are timezone-aware, so the parsed value must be too
    # (assumes 'created_at' carries a UTC offset -- TODO confirm).
    d = parse(tweet['created_at'])
    if not (start <= d < end):
        continue

    # The label is the value stored right after the tweet id in the flattened
    # CSV rows (presumably the category column -- verify against the CSV
    # layout). The linear scan only runs for tweets inside the window.
    indice_csv = lista_unica_csv.index(tweet['id_str'])
    label = lista_unica_csv[indice_csv + 1].lower()

    d = d.strftime('%Y-%m-%d')
    a = d + " " + label
    if a in mix:
        i = mix[a]
        retweets[i] += tweet['retweet_count']
        likes[i] += tweet['favorite_count']
    else:
        mix[a] = len(dates)
        dates.append(d)
        retweets.append(tweet['retweet_count'])
        likes.append(tweet['favorite_count'])
        category.append(label)

# One frame per plotted line; the category label is suffixed so that likes and
# retweets get separate legend entries.
df_likes = pd.DataFrame({
    'Dates': dates,
    'Likes': likes,
    'Category': [label + " likes" for label in category],
})
df_likes['Dates'] = pd.to_datetime(df_likes['Dates'])

df_retweets = pd.DataFrame({
    'Dates': dates,
    'Retweets': retweets,
    'Category': [label + " retweets" for label in category],
})
df_retweets['Dates'] = pd.to_datetime(df_retweets['Dates'])

# Combined frame with both counts, ordered by day.
df = pd.DataFrame({
    'Dates': dates,
    'Likes': likes,
    'Retweets': retweets,
    'Category': category,
})
df['Dates'] = pd.to_datetime(df['Dates'])
df = df.sort_values(by=['Dates'], ascending=True)

# Min-max normalisation, each column on its own scale. A constant column has a
# span of 0; its divisor is replaced by 1 so the column maps to 0 instead of NaN.
df_norm = df.copy()
counts = df_norm[['Likes', 'Retweets']]
lowest = counts.min()
span = counts.max() - lowest
df_norm[['Likes', 'Retweets']] = (counts - lowest) / span.replace(0, 1)
# Assignment aligns on the index, so rows still match after the sort above.
df_likes['Likes'] = df_norm['Likes']
df_retweets['Retweets'] = df_norm['Retweets']

# Both layers use the same size: layers of one chart should agree on
# width/height (newer Altair versions appear to reject mismatches -- confirm).
c = alt.Chart(df_retweets).mark_line(point=True).encode(
    alt.X('monthdate(Dates):T', title='June 2020'),
    alt.Y('Retweets', title='Normalised count'),
    color="Category"
).properties(
    width=700,
    height=500
)

c1 = alt.Chart(df_likes).mark_line(point=True).encode(
    alt.X('monthdate(Dates):T', title='June 2020'),
    alt.Y('Likes', title='Normalised count'),
    color="Category"
).properties(
    width=700,
    height=500
)

c2 = alt.layer(c, c1)
c2

## July 2020

In [12]:
# Half-open window [start, end) covering the whole of July 2020 in UTC.
# `end` is the first instant of the following month, so tweets posted during
# the last day of the month are counted too.
start = utc.localize(datetime.datetime(2020, 7, 1))
end = utc.localize(datetime.datetime(2020, 8, 1))

# Parallel lists with one entry per (day, category) pair, seeded with a zero
# row for the first day of the month.
dates = ["2020-07-01"]
retweets = [0]
likes = [0]
category = ["partially false"]
# Maps "<YYYY-MM-DD> <category>" to its position in the lists above.
mix = {"2020-07-01 partially false": 0}

for tweet in data:
    # The bounds are timezone-aware, so the parsed value must be too
    # (assumes 'created_at' carries a UTC offset -- TODO confirm).
    d = parse(tweet['created_at'])
    if not (start <= d < end):
        continue

    # The label is the value stored right after the tweet id in the flattened
    # CSV rows (presumably the category column -- verify against the CSV
    # layout). The linear scan only runs for tweets inside the window.
    indice_csv = lista_unica_csv.index(tweet['id_str'])
    label = lista_unica_csv[indice_csv + 1].lower()

    d = d.strftime('%Y-%m-%d')
    a = d + " " + label
    if a in mix:
        i = mix[a]
        retweets[i] += tweet['retweet_count']
        likes[i] += tweet['favorite_count']
    else:
        mix[a] = len(dates)
        dates.append(d)
        retweets.append(tweet['retweet_count'])
        likes.append(tweet['favorite_count'])
        category.append(label)

# One frame per plotted line; the category label is suffixed so that likes and
# retweets get separate legend entries.
df_likes = pd.DataFrame({
    'Dates': dates,
    'Likes': likes,
    'Category': [label + " likes" for label in category],
})
df_likes['Dates'] = pd.to_datetime(df_likes['Dates'])

df_retweets = pd.DataFrame({
    'Dates': dates,
    'Retweets': retweets,
    'Category': [label + " retweets" for label in category],
})
df_retweets['Dates'] = pd.to_datetime(df_retweets['Dates'])

# Combined frame with both counts, ordered by day.
df = pd.DataFrame({
    'Dates': dates,
    'Likes': likes,
    'Retweets': retweets,
    'Category': category,
})
df['Dates'] = pd.to_datetime(df['Dates'])
df = df.sort_values(by=['Dates'], ascending=True)

# Min-max normalisation, each column on its own scale. A constant column has a
# span of 0; its divisor is replaced by 1 so the column maps to 0 instead of NaN.
df_norm = df.copy()
counts = df_norm[['Likes', 'Retweets']]
lowest = counts.min()
span = counts.max() - lowest
df_norm[['Likes', 'Retweets']] = (counts - lowest) / span.replace(0, 1)
# Assignment aligns on the index, so rows still match after the sort above.
df_likes['Likes'] = df_norm['Likes']
df_retweets['Retweets'] = df_norm['Retweets']

# Both layers use the same size: layers of one chart should agree on
# width/height (newer Altair versions appear to reject mismatches -- confirm).
c = alt.Chart(df_retweets).mark_line(point=True).encode(
    alt.X('monthdate(Dates):T', title='July 2020'),
    alt.Y('Retweets', title='Normalised count'),
    color="Category"
).properties(
    width=700,
    height=500
)

c1 = alt.Chart(df_likes).mark_line(point=True).encode(
    alt.X('monthdate(Dates):T', title='July 2020'),
    alt.Y('Likes', title='Normalised count'),
    color="Category"
).properties(
    width=700,
    height=500
)

c2 = alt.layer(c, c1)
c2