In [1]:
import nltk
import json
import pycountry
import re
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

from nltk.corpus import stopwords

### Loading the dataset


In [2]:
def load_tweets(path='dataset_tweets_WHO.txt'):
    """Load tweets from a JSON-lines file into ``[id, full_text, lang]`` records.

    Every non-blank line of the file is parsed as one JSON object. Each key of
    that object is used as the tweet identifier and its value must provide the
    ``full_text`` and ``lang`` entries.

    Args:
        path: Location of the dataset file. Defaults to
            ``'dataset_tweets_WHO.txt'`` (resolved against the current
            working directory).

    Returns:
        A list of three-element lists ``[key, full_text, lang]`` in the order
        the records appear in the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``full_text`` or ``lang``.
    """
    tweets = []
    with open(path, "r", encoding='utf-8') as fp:
        for line in fp:
            if not line.strip():
                # Blank lines carry no record and json.loads would reject them.
                continue
            record = json.loads(line)
            for key, payload in record.items():
                tweets.append([key, payload['full_text'], payload['lang']])
    return tweets

In [3]:
def remove_stopwords(string,language):
    """Return the tokens of ``string`` that are not stopwords for ``language``.

    Args:
        string: Iterable of tokens to filter.
        language: Language name passed to ``stopwords.words`` to obtain the
            stopword list for that language.

    Returns:
        A new list with the surviving tokens in their original order.
    """
    # The stopword list depends on the tweet's language; a set gives fast lookups.
    excluded = set(stopwords.words(language))
    kept = []
    for token in string:
        if token in excluded:
            continue
        kept.append(token)
    return kept


def stemming(string,language):
    """Stem every token in ``string``.

    A ``SnowballStemmer`` is used when ``language`` appears in
    ``SnowballStemmer.languages``; otherwise the ``PorterStemmer`` is used as
    the fallback.

    Args:
        string: Iterable of tokens to stem.
        language: Language name used to select the stemmer.

    Returns:
        A list holding the stemmed form of each token, in input order.
    """
    if language in SnowballStemmer.languages:
        stemmer = SnowballStemmer(language)
    else:
        # Languages Snowball does not list fall back to the Porter stemmer.
        stemmer = PorterStemmer()
    return list(map(stemmer.stem, string))

def clean(string,language):
    """Turn raw tweet text into a list of stemmed, stopword-free tokens.

    Processing steps, in order: lowercase the text, split it on whitespace,
    strip every character other than ``a-z``, ``0-9``, ``#`` and ``@`` from
    each token, drop tokens that became empty, resolve the language code to a
    lowercase language name through ``pycountry``, remove stopwords, and stem.

    Args:
        string: Raw tweet text.
        language: Language code. Two-letter codes are looked up as ``alpha_2``
            and three-letter codes as ``alpha_3``; a value of any other length
            is passed through unchanged to ``remove_stopwords``/``stemming``.

    Returns:
        The list of cleaned tokens.

    Raises:
        ValueError: If a two- or three-letter code is not known to pycountry.
        Errors raised by ``remove_stopwords`` or ``stemming`` propagate as-is.
    """
    tokens = string.lower().split()  # tokenize on whitespace

    # Keep only ASCII letters, digits, '#' and '@'. NOTE: this also strips
    # accented/non-Latin letters and the punctuation inside URLs, so links
    # collapse into a single alphanumeric token.
    tokens = [re.sub("[^a-z0-9#@]","",token) for token in tokens]
    tokens = [token for token in tokens if token != '']  # drop emptied tokens

    if len(language) == 2:
        entry = pycountry.languages.get(alpha_2=language)
    elif len(language) == 3:
        entry = pycountry.languages.get(alpha_3=language)
    else:
        entry = language  # not a 2/3-letter code: use the value as given

    if entry is None:
        # The lookup returned nothing; fail with a clear message instead of
        # an AttributeError on ``None.name``.
        raise ValueError("unknown language code: %r" % (language,))
    if entry is not language:
        language = entry.name.lower()

    tokens = remove_stopwords(tokens,language)  # remove stopwords
    tokens = stemming(tokens,language)  # stemming

    return tokens

In [4]:
tweets = load_tweets()

# Clean every tweet, keeping [id, tokens, language code]. Tweets that cannot be
# cleaned are skipped on a best-effort basis, but the reason is tallied so a
# systematic failure (one that drops most or all tweets) is visible.
filtered_tweets = []
skipped_tweets = {}  # exception type name -> number of tweets skipped for it
for tweet in tweets:
    try:
        filtered_tweets.append([tweet[0],clean(tweet[1],tweet[2]),tweet[2]])
    except Exception as exc:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt/SystemExit still stop the cell.
        reason = type(exc).__name__
        skipped_tweets[reason] = skipped_tweets.get(reason, 0) + 1

if skipped_tweets:
    print("Skipped", sum(skipped_tweets.values()), "of", len(tweets), "tweets:", skipped_tweets)

In [5]:
# Show each cleaned record ([id, tokens, language code]) on its own output line.
for cleaned_record in filtered_tweets:
    print(cleaned_record)

['0', ['intern', 'day', 'disast', 'risk', 'reduct', '#openwho', 'launch', 'multiti', 'core', 'curriculum', 'help', 'equip', 'compet', 'need', 'work', 'within', 'public', 'health', 'emerg', 'respons', 'start', 'learn', 'today', 'amp', '#ready4respons', 'httpstcohbffof0xkl', 'httpstcofgzy22rwus'], 'en']
['1', ['#covid19', 'shown', 'health', 'emerg', 'disast', 'affect', 'entir', 'communiti', 'especi', 'weak', 'health', 'system', 'vulner', 'popul', 'like', 'migrant', 'indigen', 'peopl', 'live', 'fragil', 'humanitarian', 'condit', 'httpstcojpuqpnu0v1'], 'en']
['2', ['intern', 'day', 'disast', 'risk', 'reduct', 'better', 'respond', 'emerg', 'countri', 'must', 'invest', 'health', 'care', 'system', 'achiev', 'gender', 'equiti', 'protect', 'marginalis', 'group', 'ensur', 'readi', 'amp', 'equit', 'access', 'suppli', 'strong', 'amp', 'resili', 'health', 'system', 'httpstco5nalyjiymp'], 'en']
['3', ['rt', '@whoafro', 'congratul', 'algeria', '#algeria', '16th', 'countri', '#africa', 'reach', 'miles

['2273', ['congratul', '#china', 'certifi', '#malariafre', 'remark', 'achiev', 'come', '70', 'yearslong', 'fight', 'diseas', 'world', 'step', 'closer', '#endmalaria', 'httpstcogfrx7ptcvg', 'httpstcoin6pnqbkxa'], 'en']
['2274', ['avoid', 'consum', 'much', 'sugar', 'recommend', '6', 'level', 'teaspoon', 'free', 'sugar', 'day', 'includ', 'sugar', 'ad', 'process', 'food', 'like', 'breakfast', 'cereal', 'sauc', 'sugar', 'snack', 'sweeten', 'drink', '#selfcar', 'month', 'httpstco4pqessoxcq'], 'en']
['2275', ['femal', 'genit', 'mutil', 'harm', 'practic', 'extrem', 'form', '#gender', 'discrimin', 'violat', '#humanright', 'health', 'benefit', 'support', 'effort', '#endfgm', 'httpstcozgcpffxwop', 'httpstcoyx8rnk6erk'], 'en']
['2276', ['#liberia', 'organ', 'interact', 'event', '@ukinliberia', 'fight', 'misinform', 'donat', 'essenti', 'medicin', 'suppli', 'control', 'leprosi', 'enhanc', 'ebola', 'prepared', 'strengthen', 'countri', '#covid19', 'respons', 'httpstco3um2y9qdss', '#whoimpact', 'httpst