# NLP - IS Newsletters

In this script the following pipeline is implemented

* read in PDF files via *pdfplumber* --> dataframe format
* pre-preprocess text --> list of lists of strings format
* **invert** the single words in order to make text readable
* preprocess text --> gensim formats
* estimate LDA
* visualize via *pyLDAvis*

Code used from:
* https://github.com/bakrianoo/aravec/tree/master/AraVec%202.0
* https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/

### Specify File Path

In [1]:
# specify file path

file_path = 'read_pdfs'

### Import Packages

Be sure to have the necessary packages installed locally. Be sure to download NLTK stopword list by uncommenting and running the following cell.

In [3]:
#import nltk; nltk.download('stopwords')

In [2]:
#Generic
import os
from os.path import join
import matplotlib.pyplot as plt
%matplotlib inline
import re
import numpy as np
import pandas as pd
from pprint import pprint

#NLTK Stopword List
import nltk
from nltk.corpus import stopwords
stop_words = stopwords.words('arabic')

#Gensim (LDA-Modelling)
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#PDF-Reader
import pdfplumber



### ToDo: Extend Stopword List

In [5]:
'''
stop_words.extend(
'example1',
'example2'
)
'''

"\nstop_words.extend(\n'example1',\n'example2'\n)\n"

## Read PDFs

In [3]:
data_list = []
pages_list = []
file_list = []

for index, filename in enumerate(os.listdir(file_path)):
    if filename.endswith(".pdf"):
        with pdfplumber.open(join(file_path,filename)) as stream_pdf:
            pages_content = [i.extract_text() for i in stream_pdf.pages]
            number_pages = len(pages_content)
            data_list.extend(pages_content)
            pages_list.extend(i+1 for i in range(number_pages))
            file_list.extend([filename] * number_pages)
            
data_df = pd.DataFrame({'content': data_list,
                        'page': pages_list,
                        'file': file_list,
                        'date': np.nan})

### Drop Empty content (to avoid problems in preprocessing)

In [54]:
for index, page in enumerate(data_df.content):
    if page is None:
        data_df.drop(index, inplace = True)
print('Check if None remaining: ')
print(any([page is None for page in data_df.content]))
data_df.head()

Check if None remaining: 
False


Unnamed: 0,content,page,file,date
0,8 ةعباسلا ةنسلا l نوثلاثلاو سداسلا ددعلا\nـه ...,1,the-islamic-state-22al-nabacc84_-newsletter-36...,
2,3\nـه 1437/9/16 | نوثلاثلاو سداسلا ددعلا\n يد...,3,the-islamic-state-22al-nabacc84_-newsletter-36...,
3,4\nـه 1437/9/16 | نوثلاثلاو سداسلا ددعلا\nراـ...,4,the-islamic-state-22al-nabacc84_-newsletter-36...,
4,5\nـه 1437/9/16 | نوثلاثلاو سداسلا ددعلا\nراـ...,5,the-islamic-state-22al-nabacc84_-newsletter-36...,
5,6\nـه 1437/9/16 | نوثلاثلاو سداسلا ددعلا\nراـ...,6,the-islamic-state-22al-nabacc84_-newsletter-36...,


## Preprocessing

In [55]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
        
    
def clean_str(text):
    search = ["أ","إ","آ","ة","_","-","/",".","،"," و "," يا ",'"',"ـ","'","ى","\\",'\n', '\t','&quot;','?','؟','!']
    replace = ["ا","ا","ا","ه"," "," ","","",""," و"," يا","","","","ي","",' ', ' ',' ',' ? ',' ؟ ',' ! ']
    
    #remove tashkeel
    p_tashkeel = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
    text = re.sub(p_tashkeel,"", text)
    
    #remove longation
    p_longation = re.compile(r'(.)\1+')
    subst = r"\1\1"
    text = re.sub(p_longation, subst, text)
    
    text = text.replace('وو', 'و')
    text = text.replace('يي', 'ي')
    text = text.replace('اا', 'ا')
    
    for i in range(0, len(search)):
        text = text.replace(search[i], replace[i])
    
    #trim    
    text = text.strip()

    return text

In [None]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

In [64]:
data = data_df['content'].values.tolist()
data_clean = [clean_str(i) for i in data]

In [67]:
data_words = list(sent_to_words(data_clean))

In [68]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [70]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

In [71]:
# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

### AraVec Word Embeddings

In [81]:
# load the model
model = gensim.models.Word2Vec.load('wiki_cbow_100/wikipedia_cbow_100')

In [86]:
# get a word vector
word_vector = [model.wv[page_content] for page_content in data_words_nostops]

KeyError: "Key 'هعباسلا' not present"

### Continue gensim procedure

In [84]:
# Create Dictionary
id2word = corpora.Dictionary(data_words_nostops)

# Create Corpus
texts = data_words_nostops

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 2), (25, 1), (26, 4), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 3), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1)]]


In [16]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('االستهزاء', 1),
  ('اثريا', 1),
  ('احذر', 1),
  ('اسبوعية', 1),
  ('استعادة', 1),
  ('اطــراف', 1),
  ('اكفاركم', 1),
  ('الثالثاء', 1),
  ('الجاللة', 1),
  ('الجيش', 1),
  ('الخالفــة', 1),
  ('الرقة', 1),
  ('الرمادي', 1),
  ('السابعة', 1),
  ('السادس', 1),
  ('السنة', 1),
  ('السيطرة', 1),
  ('الصحيفة', 1),
  ('العدد', 1),
  ('الفاظ', 1),
  ('الفلوجــة', 1),
  ('النصيري', 1),
  ('اليــة', 1),
  ('اوليــاء', 1),
  ('اوليكم', 1),
  ('بال', 1),
  ('بالدين', 1),
  ('بعضهــم', 1),
  ('بعــض', 1),
  ('تحتوي', 1),
  ('تركها', 1),
  ('تع', 2),
  ('جنــود', 1),
  ('حكمه', 1),
  ('حملة', 1),
  ('خير', 1),
  ('دد', 1),
  ('رافضيــا', 1),
  ('رمضان', 1),
  ('ريف', 1),
  ('صحيفة', 1),
  ('صوره', 1),
  ('طريق', 1),
  ('طوزخورماتو', 1),
  ('عامة', 1),
  ('علــى', 1),
  ('فــي', 1),
  ('قرانية', 1),
  ('قرى', 1),
  ('كسر', 1),
  ('مدينــة', 1),
  ('مقتــل', 1),
  ('مكان', 1),
  ('من', 3),
  ('مناطق', 1),
  ('منبج', 1),
  ('مهين', 1),
  ('هجوم', 1),
  ('هـ', 1),
  ('واحاديث', 1),
  ('واسع', 1),

### Run LDA-Algorithm

In [17]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=4, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [18]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.032*"من" + 0.016*"عىل" + 0.013*"جنود" + 0.011*"اىل" + 0.010*"ان" + '
  '0.009*"االسالمية" + 0.009*"الدولة" + 0.008*"الخالفة" + 0.006*"منطقة" + '
  '0.006*"الجيش"'),
 (1,
  '0.018*"من" + 0.008*"ان" + 0.008*"عىل" + 0.006*"اىل" + 0.005*"ما" + '
  '0.005*"الله" + 0.005*"او" + 0.004*"التي" + 0.004*"الناس" + 0.003*"عن"'),
 (2,
  '0.029*"الله" + 0.026*"من" + 0.013*"ان" + 0.010*"عىل" + 0.009*"عليه" + '
  '0.008*"قال" + 0.006*"ما" + 0.006*"صىل" + 0.006*"وسلم" + 0.005*"كان"'),
 (3,
  '0.018*"من" + 0.008*"الله" + 0.008*"عىل" + 0.008*"ان" + 0.006*"او" + '
  '0.005*"اىل" + 0.004*"عن" + 0.003*"ما" + 0.003*"كام" + 0.003*"الناس"')]


In [19]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_words_nostops_inverted, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.170767777814783

Coherence Score:  0.375725394957954


In [20]:
#inserted import (old version conflict solved)
import pyLDAvis.gensim_models

# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
pyLDAvis.save_html(vis, 'arabic_lda.html')
vis

KeyboardInterrupt: 