In [5]:
import os
from gensim.models.ldamulticore import LdaMulticore
from gensim.similarities import MatrixSimilarity
from gensim.models.lsimodel import LsiModel
from gensim.models import TfidfModel

In [9]:
import tarfile
import itertools
import gensim
import logging, gensim, bz2
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from os import listdir
from os.path import isfile, join
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import re

In [2]:
train_path = './data/train'
test_path = './data/test'
def docs_names(path):
    names=[]
    for file in os.listdir(path):
        for s in os.listdir(path+'/'+file):
            names.append(path+'/'+file+'/'+s)
    return names

In [10]:
STOPWORDS =['vide']
def remove_extra_whitespace(string):
    string = re.sub(r'\s+', ' ', string)
    string = re.sub(r'[^\u0600-\u06FF]', ' ', string)
    return re.sub(r"\s{2,}", " ", string).strip()
def process_message(message): 
    content =remove_extra_whitespace(message)
    return content.split()
def get_corpus_by_names(names):
    corpus=[]
    for file_name in names:
        if isfile(join(file_name)):
            with open(file_name, 'r',encoding='utf8') as f_:
                text = process_message(f_.read())
                corpus.append(text)
    return corpus
def get_num_of_topics():
    return 10

In [11]:
train_names = docs_names(train_path)
train_corpus = get_corpus_by_names(train_names)

In [None]:
test_names = docs_names(test_path)
test_corpus = get_corpus_by_names(test_names)

In [12]:
dictionary = gensim.corpora.dictionary.Dictionary(train_corpus)
dictionary.filter_extremes(no_below=10, no_above=0.1)

In [13]:
tfidf_model = TfidfModel(train_corpus,dictionary=dictionary)

In [14]:
def get_corpus_bow(corpus, dictionary):
    corpus_bow=[]
    for tokens in corpus:
        corpus_bow.append(dictionary.doc2bow(tokens))
    return corpus_bow
def get_corpus_tfidf(corpus, tfidf_model,dictionary):
    corpus_tfidf = get_corpus_bow(corpus, dictionary)
    corpus_tfidf = tfidf_model[corpus_tfidf]
    return corpus_tfidf

In [15]:
train_corpus_bow = get_corpus_bow(train_corpus, dictionary)
train_corpus_tfidf = get_corpus_tfidf(train_corpus, tfidf_model,dictionary)

In [17]:
corpus = train_corpus_tfidf

# LSA

In [None]:
lsi_model = LsiModel(corpus=corpus, id2word=dictionary, num_topics=get_num_of_topics())

# LDA

In [22]:
lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=get_num_of_topics(), passes=1)

In [23]:
model = lda_model

# Similarity index

In [24]:
index = MatrixSimilarity(model[corpus])

#  Most similar BOW

In [25]:
def most_similar_bow(doc_name,dict_model,index_model,model,top=5):
    # top most similar document in the training set to doc_name
    print('document name : ', doc_name)
    doc =get_corpus_by_names([doc_name])
    doc = doc[0]
    doc_rep =dict_model.doc2bow(doc)
    doc_vec = model[doc_rep]
    sims =index_model[doc_vec]
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    for i in sims[:top]:
        print('document : ',i[0],' similarity: ',i[1])

In [28]:
pat='./data/train/1/اقتصاد cnn businesscnnAr09biz (690).html.txt.txt'

In [30]:
most_similar_bow(train_names[888],dictionary,index,model,top=15)

document name :  ./data/train/1/اقتصاد cnn businesscnnAr09biz (690).html.txt.txt
document :  733  similarity:  0.999807
document :  779  similarity:  0.999655
document :  15776  similarity:  0.999463
document :  1810  similarity:  0.999334
document :  675  similarity:  0.999296
document :  14567  similarity:  0.999232
document :  13904  similarity:  0.99914
document :  14404  similarity:  0.99906
document :  9825  similarity:  0.999051
document :  14650  similarity:  0.999049
document :  834  similarity:  0.998869
document :  11  similarity:  0.998819
document :  13  similarity:  0.998819
document :  23  similarity:  0.998819
document :  24  similarity:  0.998819


#  Most similar TF-IDF

In [31]:
def most_similar_tfidf(doc_name,dict_model,index_model,tfidf_model,model,top=5):
    # top most similar document in the training set to doc_name
    print('document name : ', doc_name)
    doc =get_corpus_by_names([doc_name])
    doc = doc[0]
    doc_rep =tfidf_model[dict_model.doc2bow(doc)]
    doc_vec = model[doc_rep]
    sims =index_model[doc_vec]
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    for i in sims[:top]:
        print('document : ',i[0],' similarity: ',i[1])

In [33]:
most_similar_tfidf(train_names[888],dictionary,index,tfidf_model,model)

document name :  ./data/train/1/اقتصاد cnn businesscnnAr09biz (690).html.txt.txt
document :  11  similarity:  1.0
document :  13  similarity:  1.0
document :  23  similarity:  1.0
document :  24  similarity:  1.0
document :  25  similarity:  1.0


# TF-IDF alone

In [None]:
#index_tfidf = gensim.similarities.MatrixSimilarity(get_corpus_tfidf(train_corpus, tfidf_model,dictionary))
# train_corpus_tfidf = get_corpus_tfidf(train_corpus, tfidf_model,dictionary)
index_tfidf = gensim.similarities.MatrixSimilarity(train_corpus_tfidf)

In [None]:
def most_similar_tfidf(doc_name,dict_model,index_model,tfidf_model,top=5):
    # top most similar document in the training set to doc_name
    print('document name : ', doc_name)
    doc =get_corpus_by_names([doc_name])
    doc = doc[0]
    doc_rep =tfidf_model[dict_model.doc2bow(doc)]
    sims =index_model[doc_rep]
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    for i in sims[:top]:
        print('document : ',i[0],' similarity: ',i[1])

In [None]:
most_similar_tfidf(test_names[888],dictionary,index,tfidf_model)