In [1]:
from sentence_transformers import SentenceTransformer
import os
import re
import codecs
import pathlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
import datetime
from scipy.spatial.distance import cosine


%matplotlib inline

# !pip install transformers
# !pip install sentence_transformers

In [2]:
# Root of the cleaned corpus, laid out as <cwd>/data/clean/<country>/<publisher>/...
base_dir = os.path.join(os.getcwd(), 'data', 'clean')
countries_list = os.listdir(base_dir)
summary = []

for con in countries_list:
    country_dir = os.path.join(base_dir, con)
    if not os.path.isdir(country_dir):
        # Only directories are treated as countries; stray files are ignored.
        continue
    print(con, end=", ", flush=True)
    publishers = os.listdir(country_dir)
    for pub in publishers:
        subdir = os.path.join(country_dir, pub)
        # Count every file found anywhere below the publisher directory.
        length = sum(len(files) for _root, _dirs, files in os.walk(subdir))
        summary.append([con, pub, length])

# One row per (country, publisher) pair with its total file count.
summary_df = pd.DataFrame(summary, columns=['country', 'publisher', 'count'])

PK, US, HK, GH, NG, KE, NZ, BD, IN, LK, CA, ZA, SG, PH, GB, MY, AU, IE, JM, TZ, 

In [3]:
# summary_df

In [4]:
# For every country keep the publisher(s) whose file count equals that
# country's maximum (ties yield more than one row per country).
# The aggregation is named by string ('max') instead of passing the Python
# builtin `max`, which recent pandas releases deprecate for groupby transforms.
country_max = summary_df.groupby('country')['count'].transform('max')
idx = country_max == summary_df['count']
max_table = summary_df[idx]
max_table

Unnamed: 0,country,publisher,count
202,PK,the-express-tribune,6555
6085,US,npr,6879
13877,HK,ej-insight,2607
14167,GH,ghanaweb,4299
14370,NG,vanguard,3989
15048,KE,freshplaza,4137
15502,NZ,stuff-co-nz,7273
15591,BD,bangladesh-news-24-hours,2786
16411,IN,times-of-india,5557
17300,LK,hiru-news,2627


In [5]:
# Number of articles drawn per country, and the seed that makes the draw repeatable.
SAMPLE_SIZE = 250
SAMPLE_SEED = 1

overall_dict = {}      # country code -> DataFrame with the sampled articles
skipped_files = []     # paths that could not be read; reported once at the end

for _, row in max_table.iterrows():
    # Label-based access: positional `row[0]` on a labelled Series is deprecated in pandas 2.x.
    country, publisher = row['country'], row['publisher']
    print(country, end=", ", flush=True)
    directory = os.path.join(base_dir, country, publisher)
    df_dict = {"id": [], "title": [], "publisher": [], "article_text": [], "url": [], "path": []}

    # scandir() yields entries in an unspecified order; sorting by name keeps
    # the seeded sample below identical across machines and filesystems.
    for entry in sorted(os.scandir(directory), key=lambda e: e.name):
        if not entry.is_dir():
            continue
        for entry_2 in sorted(os.scandir(entry.path), key=lambda e: e.name):
            try:
                # Article layout: id, title, publisher, url, one skipped line, then the body.
                # NOTE(review): UTF-8 is assumed because save_file writes UTF-8 - confirm
                # against the corpus; undecodable files are skipped, not silently mangled.
                with open(entry_2, "r", encoding="utf-8") as f:
                    article_id = f.readline().strip()
                    article_title = f.readline().strip()
                    # Separate name so the loop's `publisher` (the directory name) is not clobbered.
                    article_publisher = f.readline().strip()
                    # Strip like the other header fields so no stray newline travels with the URL.
                    url = f.readline().strip()
                    f.readline()
                    article_text = f.readline().strip()
            except (OSError, UnicodeDecodeError):
                # Best effort: remember the unreadable entry and carry on with the rest.
                skipped_files.append(entry_2.path)
                continue
            df_dict['url'].append(url)
            df_dict['id'].append(article_id)
            df_dict['title'].append(article_title)
            df_dict['publisher'].append(article_publisher)
            df_dict['article_text'].append(article_text)
            # The DirEntry itself is stored; save_file reads its `.path` attribute.
            df_dict['path'].append(entry_2)

    sample_df = pd.DataFrame.from_dict(df_dict)
    # Cap the sample size so publishers with fewer than SAMPLE_SIZE articles do not raise.
    sample_df = sample_df.sample(n=min(SAMPLE_SIZE, len(sample_df)), random_state=SAMPLE_SEED)
    overall_dict[country] = sample_df

if skipped_files:
    print("\nSkipped {} unreadable file(s)".format(len(skipped_files)))

PK, US, HK, GH, NG, KE, NZ, BD, IN, LK, CA, ZA, SG, PH, GB, MY, AU, IE, JM, TZ, 

## Preprocessing


In [6]:
# Hook for additional per-article cleaning; currently it only trims whitespace.
def clean_text(text):
    """Return `text` with leading and trailing whitespace removed."""
    return text.strip()


# Ordered (pattern, replacement, is_regex) rules applied to the article text.
# `regex` is spelled out for every rule because the default of
# `Series.str.replace` changed from True to False in pandas 2.0; relying on
# the default silently disables the three regular-expression rules below.
_TEXT_REPLACEMENTS = [
    ('{', '', False),
    ('}', '', False),
    ("\n", '', False),
    ("@ @ @ @ @ @ @ @ @ @ ", '', False),
    (" @", '', False),
    (" '", "'", False),
    ("\"", "", False),
    (",", "", False),
    ("(", "", False),
    (")", "", False),
    (" <p>", ".", False),
    (" <h>", ".", False),
    ("<p>", "", False),
    ("<h>", "", False),
    ('<', '', False),
    ('>', '', False),
    (":", "", False),
    ("?", ".", False),
    ("!", ".", False),
    (r"\.\s[\.\s]+", ". ", True),   # converting ". ." to ". "
    (r"\.+", ".", True),            # converting "..." to "."
    ("--", "", False),
    ("-", " ", False),
    (" +", " ", True),              # collapsing runs of spaces
    (" n't", "n't", False),
]


def text_initial_preproecss(df):
    """Normalise article text and punctuation; the input frame is not modified.

    Rows whose `id` is not numeric are dropped and `id` is cast to int.
    `article_text` is lower-cased and passed through `clean_text`; the new
    `text` column holds that text after the punctuation/markup rules in
    `_TEXT_REPLACEMENTS` have been applied in order.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `id` and `article_text`.

    Returns
    -------
    pandas.DataFrame
        A new frame with the surviving rows, the normalised `article_text`
        and the added `text` column.
    """
    has_numeric_id = pd.to_numeric(df['id'], errors='coerce').notna()
    # Copy after filtering so the assignments below never write into a view of `df`.
    reports = df.loc[has_numeric_id].copy()
    reports['id'] = reports['id'].astype(int)

    text = reports['article_text'].str.lower().apply(clean_text)
    reports['article_text'] = text

    # Punctuation preprocessing
    for pattern, replacement, is_regex in _TEXT_REPLACEMENTS:
        text = text.str.replace(pattern, replacement, regex=is_regex)
    reports['text'] = text

    return reports

In [7]:
import platform
# Name of the host operating system as reported by platform.system() (e.g. 'Windows').
os_name = platform.system()

def save_file(report, base):
    """Write one processed article to ``Process2/<base>/<path below 'clean'>``.

    The output keeps the layout of the input files: id, title, publisher and
    url on one line each, an empty line, then the article text.

    Parameters
    ----------
    report : pandas.Series (or single-row DataFrame)
        Needs the fields `id`, `title`, `publisher`, `url`, `text` and
        `path`. `path` is the source file location, either an
        `os.DirEntry` / path-like object or a plain string, and must contain
        a `clean` directory component.
    base : str
        Sub-folder of ``Process2`` to write into (e.g. 'orig', 'ngram').

    Raises
    ------
    ValueError
        If the source path has no `clean` directory component.
    """
    report = report.squeeze()

    # os.fspath accepts DirEntry objects as well as plain strings; normpath
    # makes the separators match os.sep so one code path serves every platform.
    source_path = os.path.normpath(os.fspath(report["path"]))
    marker = os.sep + "clean" + os.sep
    # partition keeps everything after the first marker, even when a later
    # directory is also called 'clean' (split(...)[1] would truncate there).
    _, found, con = source_path.partition(marker)
    if not found:
        raise ValueError("no 'clean' directory component in path: {}".format(source_path))

    path = os.path.join("Process2", str(base), con)
    os.makedirs(os.path.dirname(path), exist_ok=True)

    def _as_text(value):
        # Missing values are written as empty fields.
        return "" if pd.isna(value) else str(value)

    lines = [
        _as_text(report["id"]),
        _as_text(report["title"]),
        _as_text(report["publisher"]),
        # The loader may hand over the URL with its trailing newline; strip it
        # so exactly one empty line separates the header from the text.
        _as_text(report["url"]).strip(),
        "",
        _as_text(report["text"]),
    ]
    # newline="" writes "\n" untranslated on every platform.
    with open(path, "w", encoding="utf-8", newline="") as f:
        f.write("\n".join(lines))

In [8]:
def ngram_preprocess(df):
    """Remove sentences that contain a frequently repeated 5-gram.

    Every article is split into sentences and word 5-grams are counted over
    all sentences. A 5-gram counted at least ``len(df) / 4`` times is treated
    as a recurring phrase. When at least one such phrase exists, every
    sentence containing one of them - and every sentence shorter than four
    words - is deleted from all articles by plain substring replacement.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a `text` column; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with the filtered `text` and an extra `text_vect`
        column holding the sentence split of the *unfiltered* text.
    """
    reports_temp = df.copy()
    reports_temp['text_vect'] = reports_temp.text.apply(sent_tokenize)
    flatten = [sent for sents in reports_temp['text_vect'] for sent in sents]

    word_vectorizer = CountVectorizer(ngram_range=(5, 5), stop_words=[])
    try:
        sparse_matrix = word_vectorizer.fit_transform(flatten)
    except ValueError:
        # No 5-gram exists at all (no sentences, or none with five tokens):
        # there is nothing to detect, so hand the texts back untouched.
        return reports_temp

    # Column sums give the corpus-wide count of each 5-gram.
    frequency = np.asarray(sparse_matrix.sum(axis=0)).ravel()
    # get_feature_names() was removed in scikit-learn 1.2; fall back to it
    # only on releases that predate get_feature_names_out().
    if hasattr(word_vectorizer, "get_feature_names_out"):
        features = word_vectorizer.get_feature_names_out()
    else:
        features = word_vectorizer.get_feature_names()

    threshold = len(reports_temp) / 4
    phrase_list = [phrase for phrase, count in zip(features, frequency) if count >= threshold]

    sentences_to_remove = set()
    if phrase_list:
        # Short sentences are dropped only when a recurring phrase was found.
        for sent in set(flatten):
            if len(sent.split()) < 4 or any(phrase in sent for phrase in phrase_list):
                sentences_to_remove.add(sent)
    # Never strip bare full stops: that would delete every sentence boundary.
    sentences_to_remove -= {".", " ."}

    # Longest first, so a short sentence cannot eat part of a longer one
    # before the longer one has had its chance to match.
    ordered = sorted(sentences_to_remove, key=len, reverse=True)

    def _strip_sentences(text):
        text = str(text)
        for sent in ordered:
            text = text.replace(sent, "")
        return text

    reports_temp['text'] = reports_temp['text'].apply(_strip_sentences)

    return reports_temp

In [9]:
def cosine_preprocess(df, model, threshold):
    """Drop sentences whose embedding lies far from their article's embedding.

    Each article is embedded as a whole and sentence by sentence. A sentence
    is removed when the cosine *distance* between its embedding and the
    article embedding exceeds `threshold`; the remaining sentences are
    re-joined with single spaces.

    NOTE(review): `scipy.spatial.distance.cosine` returns a distance
    (1 - similarity), so `threshold=0.95` removes sentences whose similarity
    is below 0.05 - confirm that this is the intended direction.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a `text` column; it is not modified.
    model : object
        Anything with an `encode` method that accepts a string or a list of
        strings and returns the corresponding embedding(s).
    threshold : float
        Cosine distance above which a sentence is discarded.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` whose `text` column holds the filtered articles.
    """
    # Work on a copy so the caller's frame is left alone, as ngram_preprocess does.
    df = df.copy()
    new_txts = []

    for orig_txt in df['text']:
        sentences = nltk.sent_tokenize(orig_txt)  # Sentence Segmentation
        if not sentences:
            # Nothing to score; skip the encoder calls.
            new_txts.append('')
            continue

        docu_embedding = model.encode(orig_txt)        # Document Embedding
        sentence_embeddings = model.encode(sentences)  # Sentence Embedding

        # `not (dist > threshold)` keeps a sentence whose distance is NaN,
        # exactly like the original comparison did.
        kept = [
            sent
            for sent, embedding in zip(sentences, sentence_embeddings)
            if not cosine(docu_embedding, embedding) > threshold
        ]
        new_txts.append(' '.join(kept))

    df['text'] = new_txts

    return df

## Saving the original files for later comparison

In [10]:
# Write the punctuation-normalised articles of every country under the 'orig' folder.
for keys, country_df in overall_dict.items():
    df = text_initial_preproecss(country_df)
    df.apply(save_file, args=('orig',), axis=1)

## N-Gram

In [11]:
# Switches for this run: n-gram filtering only.
cosine_process = False
N_gram_process = True
merged_process = cosine_process and N_gram_process
cosine_threshold = 0.95

time_taken = []
text_list = []

# The output folder is named after the active methodology.
base = 'merged' if merged_process else ('cosine' if cosine_process else 'ngram')

print("Methodology: {}".format(base))

for keys, country_df in overall_dict.items():  # i.e. 'US', "CA"
    started = datetime.datetime.now()
    df = text_initial_preproecss(country_df)
    punct_done = datetime.datetime.now()

    if merged_process:
        # n-gram filtering first, cosine filtering on its result
        df = cosine_preprocess(ngram_preprocess(df), model, cosine_threshold)
    elif cosine_process:
        df = cosine_preprocess(df, model, cosine_threshold)
    elif N_gram_process:
        df = ngram_preprocess(df)

    # saving files
    df.apply(save_file, args=(base,), axis=1)
    finished = datetime.datetime.now()

    print("Running time, country: {}, punct_t: {}, text_proc_t: {}, total_t: {}".format(
        keys, punct_done - started, finished - punct_done, finished - started))
    time_taken.append(finished - started)

# Average and total wall-clock time over all countries.
print(np.mean(time_taken))
print(np.sum(time_taken))

Methodology: ngram
Running time, country: PK, punct_t: 0:00:00.028252, text_proc_t: 0:00:00.161585, total_t: 0:00:00.189837
Running time, country: US, punct_t: 0:00:00.018529, text_proc_t: 0:00:00.129167, total_t: 0:00:00.147696
Running time, country: HK, punct_t: 0:00:00.020740, text_proc_t: 0:00:00.090714, total_t: 0:00:00.111454
Running time, country: GH, punct_t: 0:00:00.020756, text_proc_t: 0:00:00.139624, total_t: 0:00:00.160380
Running time, country: NG, punct_t: 0:00:00.030246, text_proc_t: 0:00:00.182570, total_t: 0:00:00.212816
Running time, country: KE, punct_t: 0:00:00.032731, text_proc_t: 0:00:00.074514, total_t: 0:00:00.107245
Running time, country: NZ, punct_t: 0:00:00.028170, text_proc_t: 0:00:00.103434, total_t: 0:00:00.131604
Running time, country: BD, punct_t: 0:00:00.095937, text_proc_t: 0:00:00.163049, total_t: 0:00:00.258986
Running time, country: IN, punct_t: 0:00:00.047109, text_proc_t: 0:00:00.338201, total_t: 0:00:00.385310
Running time, country: LK, punct_t: 

## Cosine

In [12]:
# Switches for this run: cosine filtering only.
cosine_process = True
N_gram_process = False
merged_process = cosine_process and N_gram_process
cosine_threshold = 0.95

# Sentence encoder used by cosine_preprocess.
model = SentenceTransformer('sentence-transformers/bert-base-nli-stsb-mean-tokens')

time_taken = []
text_list = []

# The output folder is named after the active methodology.
base = 'merged' if merged_process else ('cosine' if cosine_process else 'ngram')

print("Methodology: {}".format(base))

for keys, country_df in overall_dict.items():  # i.e. 'US', "CA"
    started = datetime.datetime.now()
    df = text_initial_preproecss(country_df)
    punct_done = datetime.datetime.now()

    if merged_process:
        # n-gram filtering first, cosine filtering on its result
        df = cosine_preprocess(ngram_preprocess(df), model, cosine_threshold)
    elif cosine_process:
        df = cosine_preprocess(df, model, cosine_threshold)
    elif N_gram_process:
        df = ngram_preprocess(df)

    # saving files
    df.apply(save_file, args=(base,), axis=1)
    finished = datetime.datetime.now()

    print("Running time, country: {}, punct_t: {}, text_proc_t: {}, total_t: {}".format(
        keys, punct_done - started, finished - punct_done, finished - started))
    time_taken.append(finished - started)

# Average and total wall-clock time over all countries.
print(np.mean(time_taken))
print(np.sum(time_taken))

Exception when trying to download https://sbert.net/models/sentence-transformers/bert-base-nli-stsb-mean-tokens.zip. Response 404


Methodology: cosine
Running time, country: PK, punct_t: 0:00:00.025535, text_proc_t: 0:00:29.980857, total_t: 0:00:30.006392
Running time, country: US, punct_t: 0:00:00.033244, text_proc_t: 0:00:21.887256, total_t: 0:00:21.920500
Running time, country: HK, punct_t: 0:00:00.042379, text_proc_t: 0:00:20.013417, total_t: 0:00:20.055796
Running time, country: GH, punct_t: 0:00:00.056402, text_proc_t: 0:00:18.373036, total_t: 0:00:18.429438
Running time, country: NG, punct_t: 0:00:00.038770, text_proc_t: 0:00:19.092112, total_t: 0:00:19.130882
Running time, country: KE, punct_t: 0:00:00.043322, text_proc_t: 0:00:12.386265, total_t: 0:00:12.429587
Running time, country: NZ, punct_t: 0:00:00.033503, text_proc_t: 0:00:14.561457, total_t: 0:00:14.594960
Running time, country: BD, punct_t: 0:00:00.031834, text_proc_t: 0:00:09.279376, total_t: 0:00:09.311210
Running time, country: IN, punct_t: 0:00:00.047133, text_proc_t: 0:00:26.028671, total_t: 0:00:26.075804
Running time, country: LK, punct_t:

## Merged

In [None]:
# Switches for this run: n-gram filtering followed by cosine filtering.
cosine_process = True
N_gram_process = True
merged_process = cosine_process and N_gram_process
cosine_threshold = 0.95

# Sentence encoder used by cosine_preprocess.
model = SentenceTransformer('sentence-transformers/bert-base-nli-stsb-mean-tokens')

time_taken = []
text_list = []

# The output folder is named after the active methodology.
base = 'merged' if merged_process else ('cosine' if cosine_process else 'ngram')

print("Methodology: {}".format(base))

for keys, country_df in overall_dict.items():  # i.e. 'US', "CA"
    started = datetime.datetime.now()
    df = text_initial_preproecss(country_df)
    punct_done = datetime.datetime.now()

    if merged_process:
        # n-gram filtering first, cosine filtering on its result
        df = cosine_preprocess(ngram_preprocess(df), model, cosine_threshold)
    elif cosine_process:
        df = cosine_preprocess(df, model, cosine_threshold)
    elif N_gram_process:
        df = ngram_preprocess(df)

    # saving files
    df.apply(save_file, args=(base,), axis=1)
    finished = datetime.datetime.now()

    print("Running time, country: {}, punct_t: {}, text_proc_t: {}, total_t: {}".format(
        keys, punct_done - started, finished - punct_done, finished - started))
    time_taken.append(finished - started)

# Average and total wall-clock time over all countries.
print(np.mean(time_taken))
print(np.sum(time_taken))

Exception when trying to download https://sbert.net/models/sentence-transformers/bert-base-nli-stsb-mean-tokens.zip. Response 404


Methodology: merged
Running time, country: PK, punct_t: 0:00:00.043334, text_proc_t: 0:00:51.024576, total_t: 0:00:51.067910
Running time, country: US, punct_t: 0:00:00.050589, text_proc_t: 0:00:34.143380, total_t: 0:00:34.193969
Running time, country: HK, punct_t: 0:00:00.042564, text_proc_t: 0:00:30.855597, total_t: 0:00:30.898161
Running time, country: GH, punct_t: 0:00:00.037785, text_proc_t: 0:00:22.952209, total_t: 0:00:22.989994
Running time, country: NG, punct_t: 0:00:00.060841, text_proc_t: 0:00:22.611507, total_t: 0:00:22.672348
Running time, country: KE, punct_t: 0:00:00.049623, text_proc_t: 0:00:17.964988, total_t: 0:00:18.014611
Running time, country: NZ, punct_t: 0:00:00.060705, text_proc_t: 0:00:18.633917, total_t: 0:00:18.694622
