# Popular Words grouped bar chart - Fake Covid-19 dataset

We've used the following packages:

In [1]:
import pandas as pd
import numpy as np
import json
import string
import re
from emot.emo_unicode import UNICODE_EMO, EMOTICONS
import emoji
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
import itertools
import altair as alt

# Turn off Altair's maximum-rows check so that the charts below can be built
# from data frames of any size.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

Then we have defined the following functions to clean the tweets' text:

In [2]:
def remove_emoticons(text):
    """Return ``text`` with every emoticon listed in ``EMOTICONS`` deleted.

    The entries of ``EMOTICONS`` are joined with ``|`` inside one capturing
    group and used, unmodified, as a regular expression; every match is
    replaced by the empty string.
    """
    alternatives = u'|'.join(EMOTICONS)
    matcher = re.compile(u'(' + alternatives + u')')
    return matcher.sub(r'', text)

def remove_emoji(text):
    """Return ``text`` without the characters in the code-point ranges below.

    Every run of one or more characters belonging to any of the listed ranges
    is replaced by the empty string.

    NOTE(review): the last range runs from U+24C2 to U+1F251, so it also
    covers many code points that are not emoji (CJK ideographs, for example).
    """
    code_point_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
        u"\U00002702-\U000027B0",
        u"\U000024C2-\U0001F251",
    )
    character_class = "[" + "".join(code_point_ranges) + "]+"
    return re.sub(character_class, r'', text, flags=re.UNICODE)

def remove_urls(text):
    """Delete every ``http`` prefix together with the non-blank run after it.

    A match needs at least one non-whitespace character after ``http``, so a
    bare ``http`` at the end of a word is left untouched.
    """
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub("", text)

def remove_twitter_urls(text):
    """Delete ``pic.twitter...`` picture links from ``text``.

    Each occurrence of the literal ``pic.twitter`` followed by one or more
    non-whitespace characters is removed. The dot is escaped so that only a
    real ``.`` matches; an unescaped dot would also delete unrelated text
    such as ``epic_twitter...``.
    """
    return re.sub(r"pic\.twitter\S+", "", text)

def give_emoji_free_text(text):
    """Return ``text`` with every emoji known to the ``emoji`` package removed.

    ``emoji.get_emoji_regexp()`` no longer exists in recent releases of the
    package, so ``emoji.replace_emoji`` is used when the installed version
    provides it; older versions fall back to the original regexp-based call.
    """
    if hasattr(emoji, "replace_emoji"):
        return emoji.replace_emoji(text, replace='')
    return emoji.get_emoji_regexp().sub(r'', text)

def noamp(text):
    """Replace each HTML-escaped ampersand in ``text`` with a single space.

    Both ``&amp;`` and a bare ``&amp`` are matched; the optional ``;`` is
    consumed as well so that no stray semicolon is left behind.
    """
    return re.sub(r"&amp;?", " ", text)

In order to do the classification of the tweets, we need to read the csv file and the json file:

In [3]:
# Read the semicolon-separated CSV and flatten all of its cells, row by row,
# into one list. Tweet ids are converted to strings so that they can be
# compared with the 'id_str' field of the JSON records.
csv_dataframe = pd.read_csv('dataset/FINAL_fakecovid_final_filtered_dataset_clean.csv', sep=";")
csv_dataframe['tweet_id'] = csv_dataframe['tweet_id'].astype(str)
csv_list = csv_dataframe.values.tolist()
lista_unica_csv = list(itertools.chain.from_iterable(csv_list))

# The JSON file holds one JSON document per line. The encoding is given
# explicitly so the result does not depend on the platform's locale
# (assumes the file is UTF-8 - TODO confirm), and blank lines are skipped
# instead of making json.loads fail.
data = []
with open('dataset/fakecovid_result_final_translated_full.json', 'r', encoding='utf-8') as f:
    for line in f:
        if line.strip():
            data.append(json.loads(line))

## Terms regarding Covid

We're interested in the "full_text" field, which is cleaned with the functions defined above:

In [4]:
# Clean the 'full_text' of every tweet (the cleaned text is written back into
# `data`) and collect one "<token> <category>" string per non-stop-word token
# in `comment_words`.
comment_words = []
stop_words = stopwords.words('english')
stop_word_set = set(stop_words)  # constant-time membership test per token

# First position of every value in the flattened CSV list. This replaces a
# linear `list.index` scan per tweet; a tweet id that is missing from the CSV
# now raises KeyError instead of ValueError.
first_position = {}
for position, value in enumerate(lista_unica_csv):
    first_position.setdefault(value, position)

punctuation_table = str.maketrans('', '', string.punctuation)

for element in data:
    indice_csv = first_position[element['id_str']]

    text = element['full_text'].lower()                 # Put everything in lowercase
    text = re.sub(r"'\w+", '', text)                    # Remove everything after '
    text = remove_urls(text)
    text = remove_twitter_urls(text)
    text = remove_emoticons(text)
    text = remove_emoji(text)
    text = give_emoji_free_text(text)
    text = noamp(text)                                  # Replace &amp with a space
    text = re.sub(r"#\S+", " ", text)                   # Remove hashtags
    text = re.sub(r"@\S+", " ", text)                   # Remove mentions
    text = text.translate(punctuation_table)            # Remove punctuation
    text = text.encode('ascii', 'ignore').decode()      # Drop non-ASCII characters
    text = re.sub(r"^rt ", " ", text)                   # Remove the leading "rt "
    text = re.sub(r'\s{2,}', " ", text)                 # Collapse runs of whitespace
    element['full_text'] = text

    tokens = [t for t in text.split() if t not in stop_word_set]
    if tokens:
        # The value stored right after the tweet id in the flattened CSV is
        # used as the tweet's category label (lower-cased, spaces removed).
        category = lista_unica_csv[indice_csv + 1].lower().replace(" ", "")
        comment_words.extend(t + " " + category for t in tokens)

Let's create the chart:

In [5]:
# Frequency of every "<word> <category>" string collected above.
fdist = dict(nltk.FreqDist(comment_words))

# Only words whose total count lies in this inclusive range are drawn.
# NOTE(review): hand-picked bounds, presumably tuned to this dataset - confirm.
MIN_COUNT, MAX_COUNT = 79, 212

# Regroup the counts into one entry per word with a slot for each category.
# A dict keeps this linear and yields exactly one row per word, so no
# zero-count duplicate rows are left in the frame.
counts_by_word = {}
for el, occurrences in fdist.items():
    tok = el.split()
    per_category = counts_by_word.setdefault(tok[0], {'False': 0, 'Partially False': 0})
    if tok[1] == "false":
        per_category['False'] = occurrences
    elif tok[1] == "partiallyfalse":
        per_category['Partially False'] = occurrences
    else:
        print("errore count")

df = pd.DataFrame(
    [
        {
            'Words': word,
            'Count': per_category['False'] + per_category['Partially False'],
            'False': per_category['False'],
            'Partially False': per_category['Partially False'],
        }
        for word, per_category in counts_by_word.items()
    ],
    columns=['Words', 'Count', 'False', 'Partially False'],
)

df = df.sort_values(by=['Count'], ascending=[False])

range_ = ["#0C7BDC", "#FFC20A"]

# One facet column per word, with one bar per category inside each facet.
bars = alt.Chart(df).transform_fold(
    ['False', 'Partially False']
).mark_bar().encode(
    x=alt.X('key:N', title=None),
    y=alt.Y('value:Q', title="Tweet count"),
    color=alt.Color('key:N', scale=alt.Scale(range=range_), title="Category"),
    column="Words:N"
).transform_filter(
    alt.FieldRangePredicate(field='Count', range=[MIN_COUNT, MAX_COUNT])
).properties(
    title="The most frequent Words classified by category (False, Partially False)",
    width=75
).configure_title(
    fontSize=17,
    offset=25
).configure_axis(
    labelFontSize=13,
    titleFontSize=15,
    titlePadding=15
).configure_legend(
    titleFontSize=14,
    labelFontSize=12,
    titlePadding=10
).configure_header(
    titleFontSize=15,
    labelFontSize=12
)

bars

## Terms not regarding Covid

We have also created the same grouped bar chart without Covid-related terms, so we filter these words out of the "full_text" field:

In [6]:
# Same cleaning as for the Covid-terms chart, but the Covid-related terms
# listed below are treated as stop words too. The cleaned text is written back
# into `data`; one "<token> <category>" string per remaining token is
# collected in `comment_words_nocovid`.
comment_words_nocovid = []
stopwordss = stopwords.words('english') + ["coronawarriors","covid19australia","vaccine","coronacrisis","coronaviruskenya","covidiots","covid19uk","lockdownsa","covidiot","chinesevirus","pandemic","coronaviruslockdown","quarantinelife","sarscov2","coronalockdown","coronaupdate","covid19us","nomeat_nocoronavirus","covid19india","coronavirusupdate","quarantine","lockdown","chinavirus","coronaviruschina","coronavirusuk","wuhanvirus","coronavirusupdates","covid19pandemic","coronavirususa","covid19nigeria","coronaviruschina","coronarvirus","coronaoutbreak","ncov2019","2019ncov","chinacoronavirus","coronavirussa","wuhancoronavirus","coronarovirus","indiafightscorona","covid19", "coronavirus", "corona", "covid_19", "covid","coronavirusoutbreak","covid2019", "virus", "covid__19","covid19aus", "coronavirusindia","covidー19", "coronaviruspandemic"]
stopword_set_nocovid = set(stopwordss)  # constant-time membership test per token

# First position of every value in the flattened CSV list. This replaces a
# linear `list.index` scan per tweet; a tweet id that is missing from the CSV
# now raises KeyError instead of ValueError.
first_position_nocovid = {}
for position, value in enumerate(lista_unica_csv):
    first_position_nocovid.setdefault(value, position)

punctuation_table_nocovid = str.maketrans('', '', string.punctuation)

for element in data:
    indice_csv = first_position_nocovid[element['id_str']]

    text = element['full_text'].lower()                 # Put everything in lowercase
    text = re.sub(r"'\w+", '', text)                    # Remove everything after '
    text = remove_urls(text)
    text = remove_twitter_urls(text)
    text = remove_emoticons(text)
    text = remove_emoji(text)
    text = give_emoji_free_text(text)
    text = noamp(text)                                  # Replace &amp with a space
    text = re.sub(r"#\S+", " ", text)                   # Remove hashtags
    text = re.sub(r"@\S+", " ", text)                   # Remove mentions
    text = text.translate(punctuation_table_nocovid)    # Remove punctuation
    text = text.encode('ascii', 'ignore').decode()      # Drop non-ASCII characters
    text = re.sub(r"^rt ", " ", text)                   # Remove the leading "rt "
    text = re.sub(r'\s{2,}', " ", text)                 # Collapse runs of whitespace
    element['full_text'] = text

    tokens_nocovid = [t for t in text.split() if t not in stopword_set_nocovid]
    if tokens_nocovid:
        # The value stored right after the tweet id in the flattened CSV is
        # used as the tweet's category label (lower-cased, spaces removed).
        category = lista_unica_csv[indice_csv + 1].lower().replace(" ", "")
        comment_words_nocovid.extend(t + " " + category for t in tokens_nocovid)

Let's create the chart:

In [7]:
# Frequency of every "<word> <category>" string collected above.
fdist_mentions = dict(nltk.FreqDist(comment_words_nocovid))

# Only words whose total count lies in this inclusive range are drawn.
# NOTE(review): hand-picked bounds, presumably tuned to this dataset - confirm.
MIN_COUNT_NOCOVID, MAX_COUNT_NOCOVID = 71, 154

# Regroup the counts into one entry per word with a slot for each category.
# A dict keeps this linear and yields exactly one row per word, so no
# zero-count duplicate rows are left in the frame.
counts_by_word_mentions = {}
for el, occurrences in fdist_mentions.items():
    tokm = el.split()
    per_category = counts_by_word_mentions.setdefault(tokm[0], {'False': 0, 'Partially False': 0})
    if tokm[1] == "false":
        per_category['False'] = occurrences
    elif tokm[1] == "partiallyfalse":
        per_category['Partially False'] = occurrences
    else:
        print("errore count")

df_mentions = pd.DataFrame(
    [
        {
            'Words': word,
            'Count': per_category['False'] + per_category['Partially False'],
            'False': per_category['False'],
            'Partially False': per_category['Partially False'],
        }
        for word, per_category in counts_by_word_mentions.items()
    ],
    columns=['Words', 'Count', 'False', 'Partially False'],
)

df_mentions = df_mentions.sort_values(by=['Count'], ascending=[False])

range_ = ["#40B0A6", "#E1BE6A"]

# One facet column per word, with one bar per category inside each facet.
barsm = alt.Chart(df_mentions).transform_fold(
    ['False', 'Partially False'],
).mark_bar().encode(
    x=alt.X('key:N', title=None),
    y=alt.Y('value:Q', title="Tweet count"),
    color=alt.Color('key:N', scale=alt.Scale(range=range_), title="Category"),
    column="Words:N"
).transform_filter(
    alt.FieldRangePredicate(field='Count', range=[MIN_COUNT_NOCOVID, MAX_COUNT_NOCOVID])
).properties(
    title="The most frequent words classified by category (False, Partially False) WITHOUT COVID-RELATED WORDS",
    width=75
).configure_title(
    fontSize=17,
    offset=25
).configure_axis(
    labelFontSize=13,
    titleFontSize=15,
    titlePadding=15
).configure_legend(
    titleFontSize=14,
    labelFontSize=12,
    titlePadding=10
).configure_header(
    titleFontSize=15,
    labelFontSize=12
)

barsm