In [20]:
#Import Libraries
import os
import numpy as np
import tqdm
from tqdm import tqdm
import pandas as pd
#Machine Learning
from gensim.models import FastText
from gensim.models.phrases import Phrases, Phraser
from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation, strip_non_alphanum
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
#Text Pre-processing Using SpaCy
!python -m spacy download en_core_web_md
import spacy
from spacy.lang.en import English
from spacy import displacy
nlp = spacy.load('en_core_web_md')

Collecting en-core-web-md==3.0.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.0.0/en_core_web_md-3.0.0-py3-none-any.whl (47.1 MB)
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_md')


In [21]:
# Read the three-chapter sample of the Sahih Bukhari dataset (used for a
# first test run) and preview its first rows.
csv_path = r"C:\Users\Acer\anaconda3\activity\Sahih Bukhari Dataset (3 Chapters).csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Source,Volume_No,Chapter_No,Hadith_No,Chapter,Hadith
0,Sahih Bukhari,1,1,1,Revelation,Narrated by 'Umar bin Al-Khattab: I heard All...
1,Sahih Bukhari,1,1,2,Revelation,Narrated by 'Aisha: (the mother of the faithfu...
2,Sahih Bukhari,1,1,3,Revelation,Narrated by 'Aisha: (the mother of the faithfu...
3,Sahih Bukhari,1,1,4,Revelation,Narrated by Said bin Jubair: Ibn 'Abbas in the...
4,Sahih Bukhari,1,1,5,Revelation,Narrated by Ibn 'Abbas: Allah's Apostle was th...


In [22]:
# Build `text`: one list of word tokens per hadith.
# Each hadith is lowercased, then cleaned with Gensim (non-alphanumeric
# characters, punctuation and stop words removed, in that order), and the
# cleaned string is finally tokenised with spaCy.

text = []
for hadith in df.Hadith.values:
    lowered = str(hadith).lower()
    cleaned = remove_stopwords(strip_punctuation(strip_non_alphanum(lowered)))
    text.append([token.text for token in nlp(cleaned)])

In [23]:
# Detect frequent bigrams in the tokenised documents. No connector-word list
# is passed to Phrases here.
phrases = Phrases(text, min_count=5, threshold=10)
# Phraser is the object used from now on to transform sentences.
bigram = Phraser(phrases)
# Run every document through the phraser so detected bigrams are merged
# into single tokens.
tokens = [bigram[document] for document in text]

In [24]:
# Train the fastText model on the phrase-merged token lists.
# Hyper-parameters are collected in one place so they are easy to tweak.
fasttext_params = dict(
    vector_size=100,
    window=3,
    min_count=1,
    epochs=10,
    sorted_vocab=1,
)
model = FastText(tokens, **fasttext_params)

In [25]:
#Create TF-IDF Scores for each Word

# TfidfVectorizer expects one string per document, so every token list is
# joined back into a space-separated string first.
text = [' '.join(doc_tokens) for doc_tokens in tqdm(tokens)]

tf_idf_vect = TfidfVectorizer(stop_words=None)
# Sparse matrix with one row per document and one column per vocabulary term.
final_tf_idf = tf_idf_vect.fit_transform(text)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. The result is kept as a plain
# Python list so that list operations such as tfidf_feat.index(word) keep
# working on either scikit-learn version.
if hasattr(tf_idf_vect, 'get_feature_names_out'):
    tfidf_feat = tf_idf_vect.get_feature_names_out().tolist()
else:
    tfidf_feat = tf_idf_vect.get_feature_names()

# NOTE(review): with the default token_pattern the vectorizer appears to keep
# only tokens of two or more word characters, so some entries of `tokens`
# may have no TF-IDF column - confirm against the scikit-learn docs.

100%|██████████| 137/137 [00:00<00:00, 137173.47it/s]


In [26]:
#Apply the Scores to the Vectors across each Document

# For every document, compute the TF-IDF-weighted average of its fastText
# word vectors:  sum(tfidf(word) * vector(word)) / sum(tfidf(word)).
# Results are stored in `tfidf_sent_vectors`, one vector per document, in the
# same order as `tokens`.

# Map each TF-IDF feature to its column once, instead of scanning the
# feature list with .index() for every single word.
tfidf_col = {feature: col for col, feature in enumerate(tfidf_feat)}
# Take the embedding size from the trained model rather than hard-coding it.
vector_size = model.wv.vector_size

tfidf_sent_vectors = []
errors = 0  # number of words skipped (no TF-IDF column or no word vector)
for row, sent in enumerate(tqdm(tokens)):  # row = index of the document in final_tf_idf
    sent_vec = np.zeros(vector_size)  # accumulator for the weighted vector sum
    weight_sum = 0  # sum of the TF-IDF weights that were actually applied
    for word in sent:
        col = tfidf_col.get(word)
        if col is None:
            # The word is not part of the TF-IDF vocabulary.
            errors += 1
            continue
        try:
            vec = model.wv[word]
        except KeyError:
            # The model has no vector for this word.
            errors += 1
            continue
        # TF-IDF score of this word in this document.
        tfidf = final_tf_idf[row, col]
        sent_vec += vec * tfidf
        weight_sum += tfidf
    # Leave the vector at zero when nothing was weighted, instead of
    # dividing by zero and storing NaNs.
    if weight_sum != 0:
        sent_vec /= weight_sum

    tfidf_sent_vectors.append(sent_vec)
print('errors noted: '+str(errors))

100%|██████████| 137/137 [00:01<00:00, 99.19it/s] 

errors noted: 1





In [27]:
# Join the TF-IDF-weighted fastText vectors back to the dataframe as a new
# 'FT_tfidf' column. The list is assigned in order, so it must hold exactly
# one vector per row of df.
df['FT_tfidf'] = tfidf_sent_vectors

In [42]:
#Find Similar Documents

# Text used to pick the query document: the first hadith containing it is
# compared against every hadith in the dataframe.
hadithText = 'reward'

# na=False treats rows with missing Hadith text as non-matching, so the
# boolean mask never contains NaN.
matches = df.loc[df.Hadith.str.contains(hadithText, na=False)]
if matches.empty:
    # Same exception type as the bare iloc[0] lookup, with a clear message.
    raise IndexError("No hadith contains the text: " + repr(hadithText))

# Document vector of the first match, shaped (1, n_features); NaNs become 0.
query = np.array([matches.iloc[0]['FT_tfidf']])
query = np.nan_to_num(query)

# All document vectors stacked into one 2-D array; NaNs become 0.
vectors = np.array(list(df.FT_tfidf.values))
vectors = np.nan_to_num(vectors)

cosine_similarities = pd.Series(cosine_similarity(query, vectors).flatten())

# Series.iteritems() was removed in pandas 2.0; items() is the equivalent.
# i is the row position of the hadith, j its similarity to the query.
for i, j in cosine_similarities.nlargest(10).items():
    print ("\n")
    # str() keeps the concatenation from failing on missing (NaN) values.
    print ("Chapter: " + str(df.Chapter.iloc[i]))
    print(str(i) + '-' + str(df.Hadith.iloc[i]))
    print("Similarity: " + str(j))



Chapter: Revelation
0-Narrated by 'Umar bin Al-Khattab:  I heard Allah's Apostle saying, "The reward of deeds depends upon the intentions and every person will get the reward according to what he has intended. So whoever emigrated for worldly benefits or for a woman to marry, his emigration was for what he emigrated for."
Similarity: 1.0


Chapter: Belief
50-Narrated by 'Umar bin Al-Khattab: Allah's Apostle said, "The reward of deeds depends upon the intention and every person will get the reward according to what he has intended. So whoever emigrated for Allah and His Apostle, then his emigration was for Allah and His Apostle. And whoever emigrated for worldly benefits or for a woman to marry, his emigration was for what he emigrated for."
Similarity: 0.9999992698543603


Chapter: Belief
21-Narrated by Abu Said Al-Khudri: Allah's Apostle said, "While I was sleeping I saw (in a dream) some people wearing shirts of which some were reaching up to the breasts only while others were even