In [1]:
#Generic
from collections import Counter
import os
from os.path import join
import matplotlib.pyplot as plt
%matplotlib inline
import re
import numpy as np
import pandas as pd
from pprint import pprint
import pickle
import itertools
import datetime
from datetime import timedelta


#NLTK Stopword List
import nltk
from nltk.corpus import stopwords
stop_words = stopwords.words('arabic')

#Gensim (LDA-Modelling)
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models import LdaMulticore
from gensim.test.utils import datapath

#PDF-Reader
import pdfplumber

import pyLDAvis.gensim_models
import xml.etree.ElementTree as ETree

# plots
from bidi.algorithm import get_display
import arabic_reshaper
import matplotlib.pyplot as plt

In [2]:
#file_path = 'read_pdfs'
file_path = 'Al-Naba'

  and should_run_async(code)


In [3]:
def prepare_comp_corpus(corpus_path="Alittihad_XML_utf_8/Alittihad_utf_8.xml"):
    # read xml
    parser = ETree.XMLParser(encoding='UTF-8')
    tree = ETree.parse(corpus_path, parser=parser)
    xroot = tree.getroot()

    # extract texts
    texts = []
    for node in xroot:
        texts.append(node.find("Text").text)

    # check for text duplicates
    unique_texts_with_none = list(set(texts))
    unique_texts = []
    for ele in filter(None, unique_texts_with_none): # remove None
        unique_texts.append(ele)

    # join all texts and do naive tokenization
    words = ' '.join(unique_texts).split()

    # keep only unique words
    words = list(set(words))
    print('Unique words: ' + str(len(words)))

    # remove punct
    for pattern in [r"\W", r"\d"]:#[r"\[", r"\]", r"\(", r"\)", r"\|", r"/", r"\.", r"\:", r"\«", r"\"", r"\»", r"\'", r"\d", r"\%"]: 
        words = [re.sub(pattern, "", word) for word in words]
    return words

def get_text(file_path):
    data_list = []
    pages_list = []
    file_list = []

    for index, filename in enumerate(os.listdir(file_path)):
        if filename.endswith(".pdf"):
            with pdfplumber.open(join(file_path,filename)) as stream_pdf:
                pages_content = [i.extract_text() for i in stream_pdf.pages]
                number_pages = len(pages_content)
                data_list.extend(pages_content)
                pages_list.extend(i+1 for i in range(number_pages))
                file_list.extend([filename] * number_pages)
                
    data_df = pd.DataFrame({'content': data_list,
                            'page': pages_list,
                            'file': file_list,
                            'date': np.nan})
    return data_df

# add handselected words to the comparence dict
def load_handselected_words(path):
    with open(path, 'r', encoding='utf-8') as f:
        read_in = f.readlines()
    res = []
    for entry in read_in:
        if ',' in entry:
            res.append(entry.split(',')[0].split('\\')[0])
    return res

#283 issues, missing issue no. 85
def insert_date(df, start_date=datetime.date(2015, 10, 16)):
    res_list = []
    for row_count in range(df['date'].shape[0]):
        issue_no = int(df['file'][row_count][:-4].split('-')[-1]) # get issue no
        if issue_no == 1:
            res_list.append(start_date) # start equals date of first issue
        else:
            res_list.append(start_date + timedelta(days=(issue_no-1)*7)) # add the passed days to the start date using the issue number
    df['date'] = res_list
    return df

def preproc_pagewise(data_df, comp_list, all_words_to_be_processed):
    # preprocess all sentences pagewise
    pages = []
    for issue in set(data_df['file'].values):
        for page in data_df['content'][data_df['file'] == issue].values:
            page_dat = [''.join(page)]
            
            # Remove Emails
            page_dat = [re.sub(r'\S*@\S*\s?', '', sent) for sent in page_dat]
            
            # Remove new line characters
            page_dat = [re.sub(r'\s+', ' ', sent) for sent in page_dat]
            page_dat = [re.sub(r'\n', ' ', sent) for sent in page_dat]

            # Remove distracting single quotes
            page_dat = [re.sub(r"\'", "", sent) for sent in page_dat]
            page_dat = [re.sub(r"/", "", sent) for sent in page_dat]

            # Remove Numbers
            for i in range(10):
                page_dat = [re.sub(str(i), "", sent) for sent in page_dat]

            # Remove Brackets
            reg_pat = [ r"\[", r"\]", r"\(", r"\)", r"\|", r"/", r"\.", r"\:" ]
            for pattern in reg_pat:
                page_dat = [re.sub(pattern, "", sent) for sent in page_dat]

            processed_words_page = remove_stopwords(invert_words(page_dat))[0] # '[0]' to flatten result list of remove_stopwords function
            
            pages.append([word for word in processed_words_page if (word in comp_list) and (word in all_words_to_be_processed)]) #  check which words are in both lists
    return pages

def invert_words(data_words_nostops):
    data_words_nostops_inverted = []
    for i in data_words_nostops:
        #print(i)
        #temp = [j[::-1] for j in i]
        #print(temp)
        #print([i for i in reversed(temp)])
        temp = [i for i in reversed(i)] # reversed
        #print(''.join(temp))
        temp = ''.join(temp)
        #print(temp)
        data_words_nostops_inverted.append(temp)
    return data_words_nostops_inverted

def grammization(data_words):
    # Build the bigram and trigram models
    bigram = gensim.models.Phrases(data_words, min_count=8, threshold=100) # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

    # Faster way to get a sentence clubbed as a trigram/bigram
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    return bigram, trigram, bigram_mod, trigram_mod

# Define functions for stopwords, bigrams, trigrams and lemmatization, rm words smaller than 3 chars
def remove_stopwords(texts):

    # def flatten(lis): # helper to flatten results
    #     return [item for sublist in lis for item in sublist if item != '']

    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words and len(word) > 2] for doc in texts]

# unused
# def make_bigrams(texts):
#     return [bigram_mod[doc] for doc in texts]

# unused
# def make_trigrams(texts):
#     return [trigram_mod[bigram_mod[doc]] for doc in texts]


def gensim_prep(word_list):
    # Create Dictionary
    id2word = corpora.Dictionary(word_list)
    id2word.filter_extremes(no_below=1, keep_n=75000)
    print('Length dictionary: ' + str(len(id2word)))

    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in word_list]

    return id2word, corpus

def get_lda(corpus, id2word, num_topics, n_jobs=3, passes=50, chunksize=100, save=False, model_name='myldamodel', save_path=r'C:\Users\kantg\Documents\Uni\nlp_arabic\saved_models'):

    lda_model = gensim.models.ldamulticore.LdaMulticore(corpus=corpus, 
                                        id2word=id2word,
                                        num_topics=num_topics,
                                        random_state=100,
                                        chunksize=chunksize,
                                        workers=n_jobs, 
                                        passes=passes,
                                        per_word_topics=True)

    try:
        if save:
            # Save model to disk.
            #temp_file = datapath(model_name)
            save_path = save_path + '\\' + model_name
            print(save_path)
            lda_model.save(save_path)
    except:
        print('did not save LDA model!')
        
    return lda_model

def visu(lda_model, corpus, id2word, name):
    vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
    pyLDAvis.save_html(vis, name + '.html')
    return

  and should_run_async(code)


In [4]:
# We need to redo the preprocessing steps with the base data to get the tokens pagewise:

# load most occuring words
with open('most_occuring_words_modified.txt', 'r', encoding='utf-8') as f:
    most_occuring_words_mod = f.readlines()
most_occuring_words_mod = [x.strip() for x in most_occuring_words_mod] # rm newline
most_occuring_words_mod = [x.split(sep=',')[0] for x in most_occuring_words_mod if x != ''] # rm non-needed string parts and empty strings

# load compareance list
with open('comparison_list_alittihad.pkl', 'rb') as f:
    comp_list = pickle.load(f)

# load talibs approved list of real words
with open('all_words_to_be_processed_edited_by_TA.txt', 'r', encoding='utf-8') as f:
    all_words_to_be_processed = f.readlines()
    all_words_to_be_processed = [word.replace('\n','') for word in all_words_to_be_processed if (word != '\n')]

# preprocess data
data_df = get_text(file_path)
data_df = insert_date(data_df) 
data_df = data_df.dropna(subset=['content'])
data_words_nostops_full_comp_plus_most_occ = preproc_pagewise(data_df, comp_list, all_words_to_be_processed)
data_df.reset_index(drop=True, inplace = True)
data_df['cleaned_tokens'] = pd.Series(data_words_nostops_full_comp_plus_most_occ) # append pagewise tokens

  and should_run_async(code)


In [5]:
data_df

  and should_run_async(code)


Unnamed: 0,content,page,file,date,cleaned_tokens
0,ـه 1440 ىلولأا ىدامج 25 سيمخلا ا ةرشاعلا ةنسلا...,1,the-islamic-state-22al-nabacc84e28099-newslett...,2018-12-21,"[تشاد, السويداء, لهالك, بحيرة, تلول, الصفا, بر..."
1,3\nةيحاتتفلاا\n167 ددعلا\nـه 1440 ىلولأا ىدامج...,3,the-islamic-state-22al-nabacc84e28099-newslett...,2018-12-21,"[يرد, كيد, صفر, أمامهم, الفضل, والسيطرة, نقاطه..."
2,4\n167 ددعلا راــبخأ\nـه 1440 ىلولأا ىدامج 25 ...,4,the-islamic-state-22al-nabacc84e28099-newslett...,2018-12-21,"[لتدمريها, وفجروها, تقل, الحسكة, الشدادي, زرع,..."
3,5\nراــبخأ 167 ددعلا\nـه 1440 ىلولأا ىدامج 25 ...,5,the-islamic-state-22al-nabacc84e28099-newslett...,2018-12-21,"[الطيانة, وإيقاع, مدرسة, منطق, دوار, العتال, ت..."
4,6\n167 ددعلا راــبخأ\nـه 1440 ىلولأا ىدامج 25 ...,6,the-islamic-state-22al-nabacc84e28099-newslett...,2018-12-21,"[منديل, لتدمريها, كاسحة, ألغام, بيك, إمام, عسك..."
...,...,...,...,...,...
3417,7\nراــبخأ ٢٢٢ ددعلا\nـه 1441 ةرخلآا ىدامج ٢٦ ...,7,The-Islamic-State-—-al-Nabā’-Newsletter-222.pdf,2020-01-10,"[دخول, وحدهم, وحتى, الشط, تشكل, عادة, مأوى, وع..."
3418,8\n٢٢٢ ددعلا راــبخأ\nـه 1441 ةرخلآا ىدامج ٢٦ ...,8,The-Islamic-State-—-al-Nabā’-Newsletter-222.pdf,2020-01-10,"[زمان, ومكان, يقيم, الوهاب, وحسن, رفيقا, فوجد,..."
3419,9\nراــبخأ ٢٢٢ ددعلا\nـه 1441 ةرخلآا ىدامج ٢٦ ...,9,The-Islamic-State-—-al-Nabā’-Newsletter-222.pdf,2020-01-10,"[خطر, خططهم, الهجومية, بناء, جهاز, هاتف, يتصل,..."
3420,10\n٢٢٢ ددعلا تلااقم\nـه 1441 ةرخلآا ىدامج ٢٦ ...,10,The-Islamic-State-—-al-Nabā’-Newsletter-222.pdf,2020-01-10,"[وعدنا, الخطاب, ونحن, الجليل, خطاب, العزة, الص..."


In [6]:
enemy_groups = {
    'far_enemies':['نصارى','صليبيون', 'صليبيين', 'صليبي', 'يهود', 'يهودي', 'يهودية'],
    'near_enemies':['نصيريون', 'نصيريين', 'نصيري', 'كرد', 'أكراد', 'شيعة', 'روافض', 'رافضة', 'شيعي', 'رافضي', 'كردي', 'يزيديون', 'يزيديين', 'ايزيدي', 'ايزيديون', 'ايزيديين', 'يزيدي', 'ايزيدي']
}

# additional enemy-related words
with open('List of enemies and related words.txt', 'r', encoding='utf-8') as f:
    enemy_related_words = f.readlines()
    enemy_related_words = [x.strip() for x in enemy_related_words] # rm newline
    enemy_related_words = [x.split(sep=',')[0] for x in enemy_related_words if x not in ['#Far Enemies', '#Near Enemies', '#Useful words']] # rm headlines
    
# once more, all enemies in one single list
all_enemies = enemy_groups['far_enemies'] + enemy_groups['near_enemies']
all_enemies_plus_related_words = all_enemies + enemy_related_words

  and should_run_async(code)


In [7]:
# count occurrences of words
def count_enemy_mentions_per_page(token_list, enemy_list):
    res = Counter()
    for enemy_word in enemy_list:
        res[enemy_word] = token_list.count(enemy_word)
    return [res, sum(res.values())] # return the counter and the sum of all enemy_words occurrences

data_df['enemy_words_occurrences_counter'], data_df['enemy_words_occurrences_sum'] = zip(*data_df['cleaned_tokens'].apply(lambda x: count_enemy_mentions_per_page(x, all_enemies_plus_related_words)))            

  and should_run_async(code)


In [8]:
data_df

  and should_run_async(code)


Unnamed: 0,content,page,file,date,cleaned_tokens,enemy_words_occurrences_counter,enemy_words_occurrences_sum
0,ـه 1440 ىلولأا ىدامج 25 سيمخلا ا ةرشاعلا ةنسلا...,1,the-islamic-state-22al-nabacc84e28099-newslett...,2018-12-21,"[تشاد, السويداء, لهالك, بحيرة, تلول, الصفا, بر...","{'نصارى': 0, 'صليبيون': 0, 'صليبيين': 0, 'صليب...",0
1,3\nةيحاتتفلاا\n167 ددعلا\nـه 1440 ىلولأا ىدامج...,3,the-islamic-state-22al-nabacc84e28099-newslett...,2018-12-21,"[يرد, كيد, صفر, أمامهم, الفضل, والسيطرة, نقاطه...","{'نصارى': 0, 'صليبيون': 0, 'صليبيين': 0, 'صليب...",1
2,4\n167 ددعلا راــبخأ\nـه 1440 ىلولأا ىدامج 25 ...,4,the-islamic-state-22al-nabacc84e28099-newslett...,2018-12-21,"[لتدمريها, وفجروها, تقل, الحسكة, الشدادي, زرع,...","{'نصارى': 0, 'صليبيون': 0, 'صليبيين': 0, 'صليب...",0
3,5\nراــبخأ 167 ددعلا\nـه 1440 ىلولأا ىدامج 25 ...,5,the-islamic-state-22al-nabacc84e28099-newslett...,2018-12-21,"[الطيانة, وإيقاع, مدرسة, منطق, دوار, العتال, ت...","{'نصارى': 0, 'صليبيون': 0, 'صليبيين': 0, 'صليب...",0
4,6\n167 ددعلا راــبخأ\nـه 1440 ىلولأا ىدامج 25 ...,6,the-islamic-state-22al-nabacc84e28099-newslett...,2018-12-21,"[منديل, لتدمريها, كاسحة, ألغام, بيك, إمام, عسك...","{'نصارى': 0, 'صليبيون': 0, 'صليبيين': 0, 'صليب...",2
...,...,...,...,...,...,...,...
3417,7\nراــبخأ ٢٢٢ ددعلا\nـه 1441 ةرخلآا ىدامج ٢٦ ...,7,The-Islamic-State-—-al-Nabā’-Newsletter-222.pdf,2020-01-10,"[دخول, وحدهم, وحتى, الشط, تشكل, عادة, مأوى, وع...","{'نصارى': 0, 'صليبيون': 0, 'صليبيين': 0, 'صليب...",0
3418,8\n٢٢٢ ددعلا راــبخأ\nـه 1441 ةرخلآا ىدامج ٢٦ ...,8,The-Islamic-State-—-al-Nabā’-Newsletter-222.pdf,2020-01-10,"[زمان, ومكان, يقيم, الوهاب, وحسن, رفيقا, فوجد,...","{'نصارى': 0, 'صليبيون': 0, 'صليبيين': 0, 'صليب...",0
3419,9\nراــبخأ ٢٢٢ ددعلا\nـه 1441 ةرخلآا ىدامج ٢٦ ...,9,The-Islamic-State-—-al-Nabā’-Newsletter-222.pdf,2020-01-10,"[خطر, خططهم, الهجومية, بناء, جهاز, هاتف, يتصل,...","{'نصارى': 0, 'صليبيون': 0, 'صليبيين': 0, 'صليب...",0
3420,10\n٢٢٢ ددعلا تلااقم\nـه 1441 ةرخلآا ىدامج ٢٦ ...,10,The-Islamic-State-—-al-Nabā’-Newsletter-222.pdf,2020-01-10,"[وعدنا, الخطاب, ونحن, الجليل, خطاب, العزة, الص...","{'نصارى': 0, 'صليبيون': 0, 'صليبيين': 0, 'صليب...",0


In [9]:
# Choose the pages with more than X-occurrences of enemy_related keywords and prepare the tokens for LDA:
pagewise_for_lda_chosen_tokens = data_df[data_df['enemy_words_occurrences_sum'] > 3]['cleaned_tokens']
id2word, corpus = gensim_prep(pagewise_for_lda_chosen_tokens)

save_dict = {'data_df': data_df, 'pagewise_for_lda_chosen_tokens': pagewise_for_lda_chosen_tokens, 'id2word': id2word, 'corpus': corpus }
with open('save_dict_pagewise_full.pkl', 'wb') as f:
    pickle.dump(save_dict, f, pickle.HIGHEST_PROTOCOL)

  and should_run_async(code)


Length dictionary: 14195


In [10]:
# lda estimation

for n in range(5,20,3):
    print('started training model with ' + str(n) + ' topics.')
    name = str(n) + '_topics_pagewise_full'
    lda = get_lda(corpus, id2word, n_jobs=7, num_topics=n, save=True, model_name=name)
    visu(lda, corpus, id2word, name)

  and should_run_async(code)


started training model with 5 topics.
C:\Users\kantg\Documents\Uni\nlp_arabic\saved_models\5_topics_pagewise_full
started training model with 8 topics.
C:\Users\kantg\Documents\Uni\nlp_arabic\saved_models\8_topics_pagewise_full
started training model with 11 topics.
C:\Users\kantg\Documents\Uni\nlp_arabic\saved_models\11_topics_pagewise_full
started training model with 14 topics.
C:\Users\kantg\Documents\Uni\nlp_arabic\saved_models\14_topics_pagewise_full
started training model with 17 topics.
C:\Users\kantg\Documents\Uni\nlp_arabic\saved_models\17_topics_pagewise_full


In [11]:
with open('save_dict_pagewise_full.pkl', 'rb') as f:
    load_dat = pickle.load(f)
load_dat.keys()

  and should_run_async(code)


dict_keys(['data_df', 'pagewise_for_lda_chosen_tokens', 'id2word', 'corpus'])