# Bigrams and Trigrams grouped bar charts - Fake Covid-19 dataset

In order to create the grouped bar charts, we need the following packages:

In [1]:
from itertools import tee, islice 
import pandas as pd
import numpy as np
import json
import string
import re
from emot.emo_unicode import UNICODE_EMO, EMOTICONS
import emoji
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt 
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder,TrigramCollocationFinder, TrigramAssocMeasures
from operator import itemgetter
import itertools
import contractions
from nltk.corpus import stopwords
import num2words
from PIL import Image
import altair as alt
import itertools
from matplotlib.colors import ListedColormap
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

Then we have defined the following functions to clean the tweets' text:

In [2]:
def remove_emoticons(text):
    """Return ``text`` with every emoticon listed in ``EMOTICONS`` deleted.

    The keys of ``EMOTICONS`` are joined with ``|`` into one capturing group
    and used as a regular expression as-is (they are not passed through
    ``re.escape``); every match is replaced by the empty string.
    """
    alternatives = u'|'.join(EMOTICONS)
    return re.sub(u'(' + alternatives + u')', r'', text)

def remove_emoji(text):
    """Return ``text`` with all characters in the listed code point ranges removed.

    The ranges are combined into a single character class; every run of
    matching characters is replaced by the empty string.
    """
    code_point_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
        u"\U00002702-\U000027B0",
        # NOTE: this last range is very wide (U+24C2 .. U+1F251) and therefore
        # also strips non-emoji characters such as CJK ideographs.
        u"\U000024C2-\U0001F251",
    )
    character_class = u"[" + u"".join(code_point_ranges) + u"]+"
    return re.sub(character_class, r'', text, flags=re.UNICODE)

def remove_urls(text):
    """Return ``text`` without any token that starts with ``http``.

    Each match runs from ``http`` up to (not including) the next whitespace
    character and is replaced by the empty string.
    """
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub("", text)

def remove_twitter_urls(text):
    """Return ``text`` with ``pic.twitter...`` media links removed.

    A match starts at the literal ``pic.twitter`` and extends to the next
    whitespace character. The dot is escaped so that only a real ``.`` matches;
    an unescaped dot would also hit unrelated words such as "epicatwitterfeed".
    """
    clean = re.sub(r"pic\.twitter\S+", "", text)
    return clean

def give_emoji_free_text(text):
    """Return ``text`` with every emoji known to the ``emoji`` package removed.

    ``emoji.get_emoji_regexp()`` no longer exists in emoji >= 2.0, so
    ``emoji.replace_emoji`` is used when the installed version provides it;
    older versions fall back to the original regexp-based removal.
    """
    if hasattr(emoji, "replace_emoji"):
        return emoji.replace_emoji(text, replace='')
    return emoji.get_emoji_regexp().sub(r'', text)

def noamp(text):
    """Return ``text`` with each HTML ampersand entity replaced by a space.

    Both the complete entity ``&amp;`` and a bare ``&amp`` are replaced; the
    trailing semicolon is consumed when present so it is not left behind.
    """
    clean = re.sub(r"&amp;?", " ", text)
    return clean

To read the filtered CSV dataset and the JSON file that has all the tweets (one JSON object per line), it is necessary to do:

In [3]:
# Load the filtered CSV dataset; tweet ids are cast to strings so that they can
# be compared with the 'id_str' field of the JSON records.
csv_dataframe = pd.read_csv('dataset/FINAL_fakecovid_final_filtered_dataset_clean.csv', sep=";")
csv_dataframe['tweet_id'] = csv_dataframe['tweet_id'].astype(str)

# Flatten the rows into one list (row 0 values, then row 1 values, ...), so the
# value that follows a tweet id in this list is the next column of the same row.
csv_list = csv_dataframe.values.tolist()
lista_unica_csv = list(itertools.chain.from_iterable(csv_list))

# The JSON file holds one JSON object per line. The encoding is given
# explicitly so the result does not depend on the platform default, and blank
# lines are skipped instead of crashing json.loads.
data = []
with open('dataset/fakecovid_result_final_translated_full.json', 'r', encoding='utf-8') as f:
    for line in f:
        if line.strip():
            data.append(json.loads(line))

## Bigrams

We're interested in the "full_text" field, which is cleaned with specific functions:

In [4]:
# Build the list of "<word1> <word2> <label>" strings, one per bigram occurrence.
stop_words = stopwords.words('english')
stop_word_set = set(stop_words)      # set membership instead of scanning the list per bigram
new_bigram = []
for element in data:
    token_id = element['id_str']
    indice_csv = lista_unica_csv.index(token_id)

    text = element['full_text'].lower()                          # Put everything in lowercase
    text = contractions.fix(text)
    text = remove_urls(text)
    text = remove_twitter_urls(text)
    text = remove_emoticons(text)
    text = remove_emoji(text)
    text = give_emoji_free_text(text)
    text = noamp(text)                                           # No amp with space
    text = re.sub(r"#\S+", " ", text)                            # Remove hashtags
    text = re.sub(r"@\S+", " ", text)                            # No mentions
    text = text.translate(str.maketrans('', '', string.punctuation))  # No punctuation
    text = text.encode('ascii', 'ignore').decode()               # No unicode
    text = re.sub(r"^rt ", " ", text)                            # No RT
    text = re.sub(r'\s{2,}', " ", text)                          # Remove big spaces

    # NOTE: the cleaned text is written back into `data`, so every later cell
    # that reads data[...]['full_text'] sees the cleaned version.
    element['full_text'] = text

    bigram_tokens = list(nltk.bigrams(nltk.word_tokenize(text)))
    # Keep only the bigrams in which no word is a stop word.
    clean_bigram_tokens = [gram for gram in bigram_tokens if stop_word_set.isdisjoint(gram)]
    if clean_bigram_tokens:
        # The label is the value that follows the tweet id in the flattened CSV list.
        label = lista_unica_csv[indice_csv+1].lower().replace(" ", "")
        new_bigram.extend(' '.join(c) + " " + label for c in clean_bigram_tokens)

The dictionary and the dataframe are created, then the chart is plotted:

In [5]:

# Frequency of every "<word1> <word2> <label>" string built in the previous cell.
fdist_bi = dict(nltk.FreqDist(new_bigram))

df = pd.DataFrame.from_dict(fdist_bi, orient='index').reset_index()
df = df.rename(columns={'index':'Bigrams', 0:'Count'})
col_one_list = df['Bigrams'].tolist()
col_two_list = df['Count'].tolist()
namelist = []

count_false = [0] * len(col_one_list)
count_part = [0] * len(col_one_list)

# Row position of the first occurrence of each bigram (dictionary lookup
# instead of a linear namelist.index() call for every row).
first_row = {}

for index, el in enumerate(col_one_list):
    tok = el.split()                 # [word1, word2, label]
    a = tok[0] + " " + tok[1]
    namelist.append(a)
    # When the same bigram occurs under both labels, both counts are stored on
    # the row of its first occurrence; the later row keeps 0 in both columns
    # (and therefore Count 0), so it is dropped by the chart filter below.
    indx = first_row.setdefault(a, index)
    if tok[2] == "false":
        count_false[indx] = col_two_list[index]
    elif tok[2] == "partiallyfalse":
        count_part[indx] = col_two_list[index]
    else:
        print("errore count")

# Total per row = "false" occurrences + "partially false" occurrences.
col_two_list = [n_false + n_part for n_false, n_part in zip(count_false, count_part)]

df['Bigrams']=namelist
df['False']=count_false
df['Partially False']=count_part
df['Count']=col_two_list

df = df.sort_values(by=['Count'],ascending=[False])

# One colour per category, in the order used by the colour scale.
range_ = ["#0C7BDC","#FFC20A"]

# Grouped bar chart: one column (facet) per bigram, one bar per category.
# Only rows whose total Count lies within the range [12, 30] are drawn.
bars = alt.Chart(df).transform_fold(
    ['False', 'Partially False']
).mark_bar().encode(
    x=alt.X('key:N',title=None),
    y=alt.Y('value:Q',title="Tweet count"),
    color=alt.Color('key:N', scale=alt.Scale(range=range_),title="Category"),
    column="Bigrams:N"
).transform_filter(
    alt.FieldRangePredicate(field='Count', range=[12, 30])
).properties(
    title="The most frequent bigrams classified by category (False, Partially False)", 
    width=75
).configure_title(
    fontSize=17,
    offset=25
).configure_axis(
    labelFontSize=13,
    titleFontSize=15,
    titlePadding=15
).configure_legend(
    titleFontSize=14,
    labelFontSize=12,
    titlePadding=10
).configure_header(
    titleFontSize=15,
    labelFontSize=12
)

bars

## Trigrams

We're interested in the "full_text" field, which is cleaned with specific functions:

In [6]:
# Build the list of "<word1> <word2> <word3> <label>" strings, one per trigram occurrence.
stop_words_tri = stopwords.words('english')
stop_word_set_tri = set(stop_words_tri)   # set membership instead of scanning the list per trigram
new_trigram = []
for element in data:
    token_id = element['id_str']
    indice_csv = lista_unica_csv.index(token_id)

    text = element['full_text'].lower()                          # Put everything in lowercase
    text = contractions.fix(text)
    text = remove_urls(text)
    text = remove_twitter_urls(text)
    text = remove_emoticons(text)
    text = remove_emoji(text)
    text = give_emoji_free_text(text)
    text = noamp(text)                                           # No amp with space
    text = re.sub(r"#\S+", " ", text)                            # Remove hashtags
    text = re.sub(r"@\S+", " ", text)                            # No mentions
    text = text.translate(str.maketrans('', '', string.punctuation))  # No punctuation
    text = text.encode('ascii', 'ignore').decode()               # No unicode
    text = re.sub(r"^rt ", " ", text)                            # No RT
    # Spell out stand-alone single digits (e.g. "5" -> "five").
    text = re.sub(r'\b\d\b', lambda x: num2words.num2words(int(x.group(0))), text)
    text = re.sub(r'\s{2,}', " ", text)                          # Remove big spaces

    # NOTE: the cleaned text is written back into `data`, so every later cell
    # that reads data[...]['full_text'] sees the cleaned version.
    element['full_text'] = text

    trigram_tokens = list(nltk.trigrams(nltk.word_tokenize(text)))
    # Keep only the trigrams in which no word is a stop word.
    clean_trigram_tokens = [gram for gram in trigram_tokens if stop_word_set_tri.isdisjoint(gram)]
    if clean_trigram_tokens:
        # The label is the value that follows the tweet id in the flattened CSV list.
        label = lista_unica_csv[indice_csv+1].lower().replace(" ", "")
        new_trigram.extend(' '.join(c) + " " + label for c in clean_trigram_tokens)

The dictionary and the dataframe are created, then the chart is plotted:

In [7]:
# Frequency of every "<word1> <word2> <word3> <label>" string built in the previous cell.
fdist_tri = dict(nltk.FreqDist(new_trigram))

dft = pd.DataFrame.from_dict(fdist_tri, orient='index').reset_index()
dft = dft.rename(columns={'index':'Trigrams', 0:'Count'})
col_one_list = dft['Trigrams'].tolist()
col_two_list = dft['Count'].tolist()
namelist = []

count_false = [0] * len(col_one_list)
count_part = [0] * len(col_one_list)

# Row position of the first occurrence of each trigram (dictionary lookup
# instead of a linear namelist.index() call for every row).
first_row = {}

for index, el in enumerate(col_one_list):
    tok = el.split()                 # [word1, word2, word3, label]
    a = tok[0] + " " + tok[1] + " " + tok[2]
    namelist.append(a)
    # When the same trigram occurs under both labels, both counts are stored on
    # the row of its first occurrence; the later row keeps 0 in both columns
    # (and therefore Count 0), so it is dropped by the chart filter below.
    indx = first_row.setdefault(a, index)
    if tok[3] == "false":
        count_false[indx] = col_two_list[index]
    elif tok[3] == "partiallyfalse":
        count_part[indx] = col_two_list[index]
    else:
        print("errore count")

# Total per row = "false" occurrences + "partially false" occurrences.
col_two_list = [n_false + n_part for n_false, n_part in zip(count_false, count_part)]

dft['Trigrams']=namelist
dft['False']=count_false
dft['Partially False']=count_part
dft['Count']=col_two_list

dft = dft.sort_values(by=['Count'],ascending=[False])

# One colour per category, in the order used by the colour scale.
range_ = ["#40B0A6","#E1BE6A"]

# Grouped bar chart: one column (facet) per trigram, one bar per category.
# Only rows whose total Count lies within the range [3, 6] are drawn.
bars = alt.Chart(dft).transform_fold(
    ['False', 'Partially False']
).mark_bar().encode(
    x=alt.X('key:N',title=None),
    y=alt.Y('value:Q',title="Tweet count"),
    color=alt.Color('key:N', scale=alt.Scale(range=range_),title="Category"),
    column="Trigrams:N"
).transform_filter(
    alt.FieldRangePredicate(field='Count', range=[3, 6])
).properties(
    title="The most frequent trigrams classified by category (False, Partially False)", 
    width=150
).configure_title(
    fontSize=17,
    offset=25
).configure_axis(
    labelFontSize=13,
    titleFontSize=15,
    titlePadding=15
).configure_legend(
    titleFontSize=14,
    labelFontSize=12,
    titlePadding=10
).configure_header(
    titleFontSize=15,
    labelFontSize=12
)

bars

                             Trigrams  Count  False  Partially False
472                   fake news alert      6      6                0
2010        world health organisation      5      3                2
573       algerian medical delegation      3      3                0
2462                        let us go      3      1                2
2012  organisation protocol procedure      3      1                2
2011     health organisation protocol      3      1                2
924          new coronavirus 2019ncov      3      3                0
574           medical delegation sent      3      3                0
3071              unleashed 500 lions      2      2                0
3342        presented chinas ministry      2      2                0
3340                 dr hala minister      2      2                0
3339       china officially announced      2      2                0
3337                  hours ago china      2      2                0
1626              health care work