In [60]:
import os 
from collections import Counter 
import re 
from shutil import copyfile 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction import text 
import pandas as pd 
import html
from nltk.stem.snowball import SnowballStemmer
import spacy
import en_core_web_sm

In [127]:
tokens = ['humanitarian', 'humanitarianism']
covid_tokens = ['covid','covid19', 'coronavirus', 'virus','lockdown', 'isolate']
covid_token2 = ['covid','covid19', 'coronavirus', 'virus']

root = 'Raw text/'
years = ['2019','2020']
year_month = ['201912','202001','202002','202003','202004','202005','202006','202007','202008',]
countries = ['CN','IR','QA','RU','TR','US','UK','DE','FR','AE','SA','KW']
patterns = [
   (r'won\'t', 'will not'),
   (r'can\'t', 'cannot'),
   (r'i\'m', 'i am'),
   (r'(\w+)\'ll', '\g<1> will'),
   (r'(\w+)n\'t', '\g<1> not'),
   (r'(\w+)\'ve', '\g<1> have'),
   (r'(\w+)\'s', '\g<1> is'),
   (r'(\w+)\'re', '\g<1> are'),
]

#### 1. Cleaning/Normalization Function

In [66]:
def countWords(text, patterns, wordsToCount):
    text = text.lower() # Text normalization: make string lowercase
    text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', text, flags=re.MULTILINE) #Remove URLs
    text = html.unescape(text)   #Remove HTML tags
    patterns = [(re.compile(regex), repl) for (regex, repl) in patterns]
    for (pattern, repl) in patterns:
        text = re.sub(pattern, repl, text)    #Replace contractions
    text = re.sub(r'[^\w\s]','', text) # Text normalization: remove punctuation
    
    splitString = text.split() # Split string into array of words
    counts = Counter(splitString) # Get counts for each word like Counter({'dogs': 3, 'cute': 1})
    count = 0 # Start the counter
    for word in wordsToCount: # Loop through list of words and add the count
        count = count + counts[word]
        print()
    return count

#### Extracting covid relevant articles

In [69]:
def cleanOnWordCount(country, tokens, covid_tokens, root, patterns, year_month):
    print("Processing", country, "...")
    # Directory declarations
    myCorpusRoot = str(root) + str(country)
    myCorpusRootClean = str(root) + str(country) + 'Clean'
    # Create a directory if it doesn't exist
    if not os.path.exists(myCorpusRootClean):
        os.makedirs(myCorpusRootClean) 
    filteredFiles = 0
    
    for root,dirs,files in os.walk(myCorpusRoot):
        for f in files:
            fileRoot = os.path.join(root,f)
            filename = f
            fileOpen = open(fileRoot,"rt")
            fileText = fileOpen.read()
            fileTextCount = countWords(fileText,patterns, tokens)
            fileTextCount2 = countWords(fileText,patterns,covid_tokens)
            if filename[0:6] in year_month:
                if(fileTextCount > 0 and fileTextCount2 > 2):
                    filteredFiles = filteredFiles + 1
                    copyfile(fileRoot,str(myCorpusRootClean)+'/'+str(filename))
                    
    print('Copied', filteredFiles, 'to the new directory for the country', country)

In [70]:
for country in countries:
    cleanOnWordCount(country, tokens, covid_tokens, root, patterns, year_month)

Processing CN ...
Copied 1192 to the new directory for the country CN
Processing IR ...
Copied 1328 to the new directory for the country IR
Processing QA ...
Copied 575 to the new directory for the country QA
Processing RU ...
Copied 1069 to the new directory for the country RU
Processing TR ...
Copied 428 to the new directory for the country TR
Processing US ...
Copied 3752 to the new directory for the country US
Processing UK ...
Copied 2737 to the new directory for the country UK
Processing DE ...
Copied 255 to the new directory for the country DE
Processing FR ...
Copied 601 to the new directory for the country FR
Processing AE ...
Copied 966 to the new directory for the country AE
Processing SA ...
Copied 548 to the new directory for the country SA
Processing KW ...
Copied 61 to the new directory for the country KW


#### Normalization and stemming the text

In [71]:
def textClean(text,root,patterns):
    text = text.lower() # Text normalization: make string lowercase
    s_stemmer = SnowballStemmer(language='english')
    text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', text, flags=re.MULTILINE) #Remove URLs
    text = html.unescape(text)   #Remove HTML tags
    patterns = [(re.compile(regex), repl) for (regex, repl) in patterns]
    for (pattern, repl) in patterns:
        text = re.sub(pattern, repl, text)    #Replace contractions
    text = re.sub(r'[^\w\s]','', text) # Text normalization: remove punctuation
    
    words = text.split() # Split string into array of words
    for word in words:
        word = s_stemmer.stem(word)
    cleantext = ' '.join(words)
    
    return cleantext

#### Output clean dataframe

In [131]:
def getCountryDataFrame(country, covid_tokens, root, patterns):
    print("Working...")
    myCorpusRootClean = str(root) + str(country) + 'Clean'
    filteredFiles = 0
    dfList = []
    
    for root,dirs,files in os.walk(myCorpusRootClean):
        for f in files:
            dfListLib = {}
            fileRoot = os.path.join(root,f)
            filename = f
            t = filename[11:]
            name_regex = re.compile(r'_[a-z]+',re.RegexFlag.IGNORECASE)
            matches = name_regex.findall(t)
            if matches != []:
                network = matches[0]
                network = network.replace('_','')

            fileOpen = open(fileRoot,"rt")
            fileText = fileOpen.read()
            
            cleanText = textClean(fileText, fileRoot, patterns)
            
            fileTextCount = countWords(fileText,patterns,covid_tokens)
            if fileTextCount > 1:
                dfListLib['name'] = filename
                dfListLib['path'] = fileRoot
                dfListLib['country'] = country
                dfListLib['network'] = network
                dfListLib['date'] = filename[0:8]
                dfListLib['token_freq'] = fileTextCount
                dfListLib['text'] = cleanText
                dfList.append(dfListLib)
    print(country + ' dataframe created!')
    return dfList

In [132]:
dfCN = pd.DataFrame(getCountryDataFrame('CN', covid_token2, root, patterns))
dfIR = pd.DataFrame(getCountryDataFrame('IR', covid_token2, root, patterns))
dfQA = pd.DataFrame(getCountryDataFrame('QA', covid_token2, root, patterns))
dfRU = pd.DataFrame(getCountryDataFrame('RU', covid_token2, root, patterns))
dfTR = pd.DataFrame(getCountryDataFrame('TR', covid_token2, root, patterns))
dfSA = pd.DataFrame(getCountryDataFrame('SA', covid_token2, root, patterns))
dfUS = pd.DataFrame(getCountryDataFrame('US', covid_token2, root, patterns))
dfUK = pd.DataFrame(getCountryDataFrame('UK', covid_token2, root, patterns))
dfDE = pd.DataFrame(getCountryDataFrame('DE', covid_token2, root, patterns))
dfFR = pd.DataFrame(getCountryDataFrame('FR', covid_token2, root, patterns))
dfAE = pd.DataFrame(getCountryDataFrame('AE', covid_token2, root, patterns))
dfKW = pd.DataFrame(getCountryDataFrame('KW', covid_token2, root, patterns))

Working...
CN dataframe created!
Working...
IR dataframe created!
Working...
QA dataframe created!
Working...
RU dataframe created!
Working...
TR dataframe created!
Working...
SA dataframe created!
Working...
US dataframe created!
Working...
UK dataframe created!
Working...
DE dataframe created!
Working...
FR dataframe created!
Working...
AE dataframe created!
Working...
KW dataframe created!


In [133]:
frames = [dfCN, dfIR, dfQA, dfRU, dfTR, dfSA, dfUS, dfUK, dfDE, dfFR, dfAE, dfKW]
df = pd.concat(frames)

In [42]:
df.to_csv('covid_clean_df.csv', index=False)

In [143]:
ordered_df = df.sort_values(by="date")
ordered_df.reset_index(drop=True, inplace=True) #Reset the index
ordered_df

Unnamed: 0,name,path,country,network,date,token_freq,text
0,20191201_FR_RFI_GDELT260769.txt,Raw text/FRClean/20191201_FR_RFI_GDELT260769.txt,FR,RFI,20191201,4,an hivaids awareness campaign on the eve of wo...
1,20191204_AE_KhaleejTimes_GDELT75493.txt,Raw text/AEClean/20191204_AE_KhaleejTimes_GDEL...,AE,KhaleejTimes,20191204,16,wknd inspired living kt home videos interactiv...
2,20191214_TR_AnadoluAgency_NEXIS720304.txt,Raw text/TRClean/20191214_TR_AnadoluAgency_NEX...,TR,AnadoluAgency,20191214,4,virus transmitted to people from wild animals ...
3,20191218_US_USAToday_GNAPI57390.txt,Raw text/USClean/20191218_US_USAToday_GNAPI573...,US,USAToday,20191218,7,alabama mobile researchers from the university...
4,20191218_US_TheNewHumanitarian_GNAPI57383.txt,Raw text/USClean/20191218_US_TheNewHumanitaria...,US,TheNewHumanitarian,20191218,4,the humanitarian sector has a trust problem th...
...,...,...,...,...,...,...,...
13426,20200819_GB_DailyMail_NEXIS322650.txt,Raw text/UKClean/20200819_GB_DailyMail_NEXIS32...,UK,DailyMail,20200819,16,jill biden shrugged of president trump is atta...
13427,20200819_KW_KUNA_GDELT162457.txt,Raw text/KWClean/20200819_KW_KUNA_GDELT162457.txt,KW,KUNA,20200819,6,loc23272027 gmt kuwait april 23 kuna un secret...
13428,20200819_KW_KUNA_GDELT169766.txt,Raw text/KWClean/20200819_KW_KUNA_GDELT169766.txt,KW,KUNA,20200819,3,loc15121212 gmt rome april 26 kuna the kuwaiti...
13429,20200819_KW_KUNA_GDELT174322.txt,Raw text/KWClean/20200819_KW_KUNA_GDELT174322.txt,KW,KUNA,20200819,5,loc02532353 gmt kuwait march 6 kuna the kuwait...


In [144]:
ordered_df.to_csv('covid_cleandf.csv', index=False)