In [1]:
import tarfile
import itertools
import gensim
import logging, gensim, bz2
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from nltk.tokenize import RegexpTokenizer
from gensim.parsing.preprocessing import STOPWORDS
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from os import listdir
from os.path import isfile, join
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import re



In [2]:
import os
train_path = './data/train'
test_path = './data/test'
def docs_names(path):
    names=[]
    for file in os.listdir(path):
        for s in os.listdir(path+'/'+file):
            names.append(path+'/'+file+'/'+s)
    return names

In [3]:
STOPWORDS =['vide']
def remove_extra_whitespace(string):
    string = re.sub(r'\s+', ' ', string)
    string = re.sub(r'[^\u0600-\u06FF]', ' ', string)
    return re.sub(r"\s{2,}", " ", string).strip()
def process_message(message): 
    content =remove_extra_whitespace(message)
    return content.split()
def get_corpus_by_names(names):
    corpus=[]
    for file_name in names:
        if isfile(join(file_name)):
            with open(file_name, 'r',encoding='utf8') as f_:
                text = process_message(f_.read())
                corpus.append(text)
    return corpus
def get_num_of_topics():
    return 10

In [4]:
train_names = docs_names(train_path)

train_corpus = get_corpus_by_names(train_names)


In [11]:
del train_corpus
test_names = docs_names(test_path)
test_corpus = get_corpus_by_names(test_names)

In [5]:
"""
id2word: Dictionary of the entire training vocabulary
         with key being an id to get the word in the vocabulary.
         The dictionary also has the statistics of the words in the vacabulary. 
"""
id2word = gensim.corpora.dictionary.Dictionary(train_corpus)
# Remove all words which appear in less than 10 documents and in more than 10% of the documents.
id2word.filter_extremes(no_below=10, no_above=0.1)

2017-06-17 16:35:20,897 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2017-06-17 16:35:34,406 : INFO : adding document #10000 to Dictionary(441744 unique tokens: ['أعجل', 'بألعابه', 'مستحب؛', 'وصيني', 'الزركلي']...)
2017-06-17 16:35:42,545 : INFO : adding document #20000 to Dictionary(539640 unique tokens: ['أعجل', 'بألعابه', 'مستحب؛', 'وصيني', 'الزركلي']...)
2017-06-17 16:35:44,846 : INFO : built Dictionary(598188 unique tokens: ['أعجل', 'أتعينيني', 'بألعابه', 'مستحب؛', 'وصيني']...) from 21429 documents (total 16277388 corpus positions)
2017-06-17 16:35:46,118 : INFO : discarding 515919 tokens: [('من', 21279), ('العام', 7337), ('ذلك', 10258), ('البحث', 6237), ('الثاني', 3961), ('وضع', 2211), ('فابلغه', 1), ('ان', 3998), ('ماساتشوستس', 5), ('اختراعها', 3)]...
2017-06-17 16:35:46,120 : INFO : keeping 82269 tokens which were in no less than 10 and no more than 2142 (=10.0%) documents
2017-06-17 16:35:46,655 : INFO : resulting dictionary: Dictionary(82269 unique tokens: [

In [None]:
id2word[82268]

In [6]:
def get_corpus_bow(corpus, dictionary):
    corpus_bow=[]
    for tokens in corpus:
        corpus_bow.append(dictionary.doc2bow(tokens))
    return corpus_bow

In [7]:
"""
Create a generator which will yield a bag of words 
for each document in all the files belonging to every directory in dir_paths_train.
"""
#corpus_names=get_corpus_names(tr)
train_corpus_bow = get_corpus_bow(train_corpus, id2word)

In [9]:
lsi = gensim.models.lsimodel.LsiModel(corpus=train_corpus_bow, id2word=id2word, num_topics=get_num_of_topics())

2017-06-17 16:36:23,361 : INFO : using serial LSI version on this node
2017-06-17 16:36:23,363 : INFO : updating model with new documents
2017-06-17 16:36:23,372 : INFO : preparing a new chunk of documents
2017-06-17 16:36:25,139 : INFO : using 100 extra samples and 2 power iterations
2017-06-17 16:36:25,155 : INFO : 1st phase: constructing (82269, 110) action matrix
2017-06-17 16:36:26,305 : INFO : orthonormalizing (82269, 110) action matrix
2017-06-17 16:36:33,919 : INFO : 2nd phase: running dense svd on (110, 20000) matrix
2017-06-17 16:36:34,889 : INFO : computing the final decomposition
2017-06-17 16:36:34,889 : INFO : keeping 10 factors (discarding 46.211% of energy spectrum)
2017-06-17 16:36:34,985 : INFO : processed documents up to #20000
2017-06-17 16:36:35,037 : INFO : topic #0(1866.293): 0.487*"باشا" + 0.353*"بنو" + 0.334*"العمال" + 0.221*"أمراء" + 0.176*"أبي" + 0.159*"شاه" + 0.150*"آل" + 0.120*"إبراهيم" + 0.116*"يوسف" + 0.108*"ابن"
2017-06-17 16:36:35,040 : INFO : topic #1(

# Similarity index

In [37]:
index = gensim.similarities.MatrixSimilarity(lsi[train_corpus_bow])
# work with sub set of training set
#index = gensim.similarities.MatrixSimilarity(lsi[train_corpus_bow[11:5855]])

2017-06-17 16:56:31,922 : INFO : creating matrix with 5844 documents and 10 features


In [33]:
def most_similar(doc_name,id2word_model,index_model,lsi_model,top=3):
    # top most similar document in the training set to doc_name
    print(doc_name)
    doc =get_corpus_by_names([doc_name])
    doc = doc[0]
    doc_bow =id2word_model.doc2bow(doc)
    vec_lsi = lsi_model[doc_bow]
    sims =index_model[vec_lsi]
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    print(sims[:top])
    for i in sims[:top]:
        print('document : ',i[0],' similarity: ',i[1])

In [38]:
most_similar(test_names[888],id2word,index,lsi)

./data/test/8/قانون القانون الليبي القانون الليبي index2d5ee-0386.html.txt.txt
[(5330, 0.98044628), (5300, 0.97971916), (5362, 0.97904354)]
document :  5330  similarity:  0.980446
document :  5300  similarity:  0.979719
document :  5362  similarity:  0.979044


In [29]:
# the document at index 1828
train_names[1828]

'./data/train/1/اقتصاد اقتصاد رشيف الاخبار الاقتصادية من مصرف سوريا المركزي news13-ar (36)-1568.htm.txt.txt'

In [16]:
sims = index[vec_lsi]

In [17]:
sims = sorted(enumerate(sims), key=lambda item: -item[1])

In [18]:
sims[:10]

[(3015, 0.99911177),
 (3087, 0.9990747),
 (3282, 0.9989987),
 (3278, 0.9988277),
 (3205, 0.99842346),
 (3228, 0.99833471),
 (3111, 0.99806339),
 (3354, 0.99795169),
 (3326, 0.99792635),
 (3117, 0.99770772)]

In [19]:
train_names[3015]

'./data/train/10/وصفات واكلات فتافيت فتافيت main0988-0021.html.txt.txt'

In [None]:
sims