In [1]:
import pickle
import pandas as pd
import numpy as np
# Text preprocessing
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
import string
# Feature engineering/ML
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
data = pd.read_csv(r"C:\Users\Acer\anaconda3\activity\Sahih Bukhari Dataset (20 Chapters).csv")
data

Unnamed: 0,Source,Volume_No,Chapter_No,Hadith_No,Chapter,Text
0,Sahih Bukhari,1,1,1,Revelation,Narrated by 'Umar bin Al-Khattab: I heard All...
1,Sahih Bukhari,1,1,2,Revelation,Narrated by 'Aisha: (the mother of the faithfu...
2,Sahih Bukhari,1,1,3,Revelation,Narrated by 'Aisha: (the mother of the faithfu...
3,Sahih Bukhari,1,1,4,Revelation,Narrated by Said bin Jubair: Ibn 'Abbas in the...
4,Sahih Bukhari,1,1,5,Revelation,Narrated by Ibn 'Abbas: Allah's Apostle was th...
...,...,...,...,...,...,...
1050,Sahih Bukhari,2,20,216,Shortening Prayers (At-Taqseer),Narrated by 'Imran bin Husain (who had piles):...
1051,Sahih Bukhari,2,20,217,Shortening Prayers (At-Taqseer),Narrated by 'Abdullah bin Buraida: 'Imran bin ...
1052,Sahih Bukhari,2,20,218,Shortening Prayers (At-Taqseer),"Narrated by 'Imran bin Husain had piles, so I ..."
1053,Sahih Bukhari,2,20,219,Shortening Prayers (At-Taqseer),Narrated by 'Aisha (the mother of the faithful...


In [3]:
# Tokenization (split into tokens)

def word_tokenization(text):
    text = word_tokenize(text.lower())
    return " ".join([word for word in str(text).split()])

data['Text'] = data['Text'].apply(lambda text: word_tokenization(text))
data.head()

Unnamed: 0,Source,Volume_No,Chapter_No,Hadith_No,Chapter,Text
0,Sahih Bukhari,1,1,1,Revelation,"['narrated', 'by', ""'umar"", 'bin', 'al-khattab..."
1,Sahih Bukhari,1,1,2,Revelation,"['narrated', 'by', ""'aisha"", ':', '(', 'the', ..."
2,Sahih Bukhari,1,1,3,Revelation,"['narrated', 'by', ""'aisha"", ':', '(', 'the', ..."
3,Sahih Bukhari,1,1,4,Revelation,"['narrated', 'by', 'said', 'bin', 'jubair', ':..."
4,Sahih Bukhari,1,1,5,Revelation,"['narrated', 'by', 'ibn', ""'abbas"", ':', 'alla..."


In [4]:
# Removing punctuations 
def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text

data['Text'] = data['Text'].apply(remove_punctuations)
data.head()

Unnamed: 0,Source,Volume_No,Chapter_No,Hadith_No,Chapter,Text
0,Sahih Bukhari,1,1,1,Revelation,narrated by umar bin alkhattab i heard allah ...
1,Sahih Bukhari,1,1,2,Revelation,narrated by aisha the mother of the faithful...
2,Sahih Bukhari,1,1,3,Revelation,narrated by aisha the mother of the faithful...
3,Sahih Bukhari,1,1,4,Revelation,narrated by said bin jubair ibn abbas in the ...
4,Sahih Bukhari,1,1,5,Revelation,narrated by ibn abbas allah s apostle was the...


In [5]:
#Importing Nltk stopword package
nltk.download('stopwords')
from nltk.corpus import stopwords

#Loading Stopwords into a list
NLTK_stop_words_list=stopwords.words('english')
print(NLTK_stop_words_list)
print("Total numbers of stop words are ")
print(len(NLTK_stop_words_list))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
#1-Create stopwords list (removing sanad)
custom_stopword_list=['narrated','narrator','umar','bin','alkhattab','abu','aisha','etc','ie','us','narrate','narrates','narrating','narration', 
                      'narrations','narrators','said','abbas','aas','abul','aba','amr','abdullah','huraira','ibn','jabir','jubair','alansari','musa',
                      'abul','abulqasim','anas','ubada','assamit','alkhudri','sad','almarur','alahnaf','qais','albara','azib',
                      'talha','ubaidullah','annuman','bashir','jamra','masud','abi','waqqas','jarir','ziyad',
                      'ilaqa','malik','waqid','allaithi','abdurrahman','bakra','father','wail','muawiya','mahmud',
                      'rabia','alaas','asma','mulaika','alansari','zaid','khalid','aljuhani','burda','alas','ali',
                      'azzubair','salama','ashhubi','um','ammar','aswad','attufail','nafi','nuam','almujmir','tamim',
                      'kuraib','usama','ata','yasar','aiyub','qatada','humran','urwa','uthman','affan','muhammad',
                      'juraij','ubaid','umatiya','seereen','hamza','adi','hatim','almughira','shuba','bakr','yahya',
                      'almazini','juhaifa','shihab','assaib','yazid','thabit','jafar','umaiya','addamri','suwaid',
                      'alnuman','maimuna','amir','numan','qais','binti','bint','mihsin','hudhaifa','wail','sulaiman',
                      'yasar','maimun','qilaba','hazim','mutim','alharith','almuntathir','hisham','hani','talib','umsalama',
                      'ubai','kab','alqasim','alaswad','muadha','zainab','samura','jundub','juhaim','abza','imran','shaqiq',
                      'alamash','husain','alkhuzai','dhar','almunkadir','murra','sahl','mughira','shuba','abdul','aziz','uqba',
                      'shaddad','ishaq','maslama','ibrahim','siyah','dinar','mujahid','bara','itban','attaiyah','alaslami',
                      'ikrima','alkhaulani','hassan','rafi','alsaib','abbad','aun','alubaid','busr','ghailan','azzuhri',
                      'alghifar','alminhal','saiyar','hunaif','almahh','salim','khadij','almuzani','barza','malih',
                      'abulminhal','qurra','humaid','isa','hafsa','mughaffal','almuzani','huwairith','buhaina','rabi',
                      'assaidi','utba','khiyar','muadh','jabal','mamar','marwan','alhakam','mutarrif','musab','wahb','rifaa',
                      'azzuraqi','ata','mabad','warrad','ashshaibani','abis','umm','salmanalfarsi','tawus','salman','umama',
                      'fatima','almundhir','taghlib','hummaid','almiswar','makhrama','shuaib','althaqafi','masruq','namir',
                      'sharik','addahhak','zahdam','raja','alutaridi']

In [7]:
final_stopword_list = custom_stopword_list + NLTK_stop_words_list
print(final_stopword_list)
print(len(final_stopword_list))

['narrated', 'narrator', 'umar', 'bin', 'alkhattab', 'abu', 'aisha', 'etc', 'ie', 'us', 'narrate', 'narrates', 'narrating', 'narration', 'narrations', 'narrators', 'said', 'abbas', 'aas', 'abul', 'aba', 'amr', 'abdullah', 'huraira', 'ibn', 'jabir', 'jubair', 'alansari', 'musa', 'abul', 'abulqasim', 'anas', 'ubada', 'assamit', 'alkhudri', 'sad', 'almarur', 'alahnaf', 'qais', 'albara', 'azib', 'talha', 'ubaidullah', 'annuman', 'bashir', 'jamra', 'masud', 'abi', 'waqqas', 'jarir', 'ziyad', 'ilaqa', 'malik', 'waqid', 'allaithi', 'abdurrahman', 'bakra', 'father', 'wail', 'muawiya', 'mahmud', 'rabia', 'alaas', 'asma', 'mulaika', 'alansari', 'zaid', 'khalid', 'aljuhani', 'burda', 'alas', 'ali', 'azzubair', 'salama', 'ashhubi', 'um', 'ammar', 'aswad', 'attufail', 'nafi', 'nuam', 'almujmir', 'tamim', 'kuraib', 'usama', 'ata', 'yasar', 'aiyub', 'qatada', 'humran', 'urwa', 'uthman', 'affan', 'muhammad', 'juraij', 'ubaid', 'umatiya', 'seereen', 'hamza', 'adi', 'hatim', 'almughira', 'shuba', 'bakr'

In [8]:
# Removing stopwords 
stops = set(final_stopword_list)
def remove_stopwords(text):
    return " ".join([word for word in str(text).split()
                    if word not in stops])

data['Text'] = data['Text'].apply(lambda text: remove_stopwords(text))
data.head()

Unnamed: 0,Source,Volume_No,Chapter_No,Hadith_No,Chapter,Text
0,Sahih Bukhari,1,1,1,Revelation,heard allah apostle saying reward deeds depend...
1,Sahih Bukhari,1,1,2,Revelation,mother faithful believers asked allah apostle ...
2,Sahih Bukhari,1,1,3,Revelation,mother faithful believers commencement divine ...
3,Sahih Bukhari,1,1,4,Revelation,explanation statement allah move tongue concer...
4,Sahih Bukhari,1,1,5,Revelation,allah apostle generous people used reach peak ...


In [9]:
# Stemming (valid stems)
porter = PorterStemmer()
def stemming(text):
    token_words=word_tokenize(text)
    token_words
    stem_sentence=[]
    for word in token_words:
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

data['Text'] = data['Text'].apply(lambda text: stemming(text))
data.head()

Unnamed: 0,Source,Volume_No,Chapter_No,Hadith_No,Chapter,Text
0,Sahih Bukhari,1,1,1,Revelation,heard allah apostl say reward deed depend upon...
1,Sahih Bukhari,1,1,2,Revelation,mother faith believ ask allah apostl allah apo...
2,Sahih Bukhari,1,1,3,Revelation,mother faith believ commenc divin inspir allah...
3,Sahih Bukhari,1,1,4,Revelation,explan statement allah move tongu concern qura...
4,Sahih Bukhari,1,1,5,Revelation,allah apostl gener peopl use reach peak genero...


In [10]:
# Create a dictionary with the label codification (Classify hadith text by chapter)
# Example: Revelation = 0, Belief = 1, Knowledge = 2

data['Label'] = data['Chapter'].factorize()[0]
from io import StringIO
category_id_df = data[['Chapter', 'Label']].drop_duplicates().sort_values('Label')
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['Label', 'Chapter']].values)

data.head()
#data.tail()

Unnamed: 0,Source,Volume_No,Chapter_No,Hadith_No,Chapter,Text,Label
0,Sahih Bukhari,1,1,1,Revelation,heard allah apostl say reward deed depend upon...,0
1,Sahih Bukhari,1,1,2,Revelation,mother faith believ ask allah apostl allah apo...,0
2,Sahih Bukhari,1,1,3,Revelation,mother faith believ commenc divin inspir allah...,0
3,Sahih Bukhari,1,1,4,Revelation,explan statement allah move tongu concern qura...,0
4,Sahih Bukhari,1,1,5,Revelation,allah apostl gener peopl use reach peak genero...,0


In [11]:
# Train Test Split

X = data['Text']
y = data['Label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=8)

In [12]:
# Feature Engineering (TF-IDF vectors)
# fitted and then transformed the training set, but only transformed the test set
# Parameter election
ngram_range = (1,2)
min_df = 10
max_df = 1.
max_features = 300

tfidf = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        max_features=max_features,
                        norm='l2',
                        sublinear_tf=True)
                        
features_train = tfidf.fit_transform(X_train).toarray()
labels_train = y_train
print(features_train.shape)

features_test = tfidf.transform(X_test).toarray()
labels_test = y_test
print(features_test.shape)
# Answer: each of 844 hadith texts (train) is represented by 300 features, each of 211 hadith texts (test) is represented by 300 features
# representing the TF-IDF score for different unigrams and bigrams

(844, 300)
(211, 300)


In [13]:
# use the Chi squared test in order to see what unigrams and bigrams are most correlated with each category

from sklearn.feature_selection import chi2
import numpy as np

for Chapter, Label in sorted(category_to_id.items()):
    features_chi2 = chi2(features_train, labels_train == Label)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(tfidf.get_feature_names_out())[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}' Category:".format(Chapter))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-5:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-2:])))
    print("")

# 'Ablution (Wudu')' Category:
  . Most correlated unigrams:
. wet
. answer
. wash
. water
. ablut
  . Most correlated bigrams:
. prophet ask
. perform ablut

# 'Ablution with dust' Category:
  . Most correlated unigrams:
. hand
. back
. suffici
. statement
. earth
  . Most correlated bigrams:
. allah apostl
. prophet use

# 'Bathing (Ghusl)' Category:
  . Most correlated unigrams:
. pot
. bodi
. wash
. pour
. bath
  . Most correlated bigrams:
. take bath
. pour water

# 'Belief' Category:
  . Most correlated unigrams:
. islam
. muslim
. reward
. religion
. faith
  . Most correlated bigrams:
. allah allah
. worship allah

# 'Call to Prayer (Adhaan)' Category:
  . Most correlated unigrams:
. congreg
. lead
. adhan
. pronounc
. iqama
  . Most correlated bigrams:
. lead prayer
. pronounc adhan

# 'Characteristics of Prayer' Category:
  . Most correlated unigrams:
. rais
. rak
. prostrat
. bow
. takbir
  . Most correlated bigrams:
. rais head
. say takbir

# 'Dua' for Rain (Istisqaa)' Cate

In [14]:
# Save the files we'll need in the next steps

# X_train
with open('X_train.pickle', 'wb') as output:
    pickle.dump(X_train, output)
    
# X_test    
with open('X_test.pickle', 'wb') as output:
    pickle.dump(X_test, output)
    
# y_train
with open('y_train.pickle', 'wb') as output:
    pickle.dump(y_train, output)
    
# y_test
with open('y_test.pickle', 'wb') as output:
    pickle.dump(y_test, output)
    
# data
with open('data.pickle', 'wb') as output:
    pickle.dump(data, output)
    
# features_train
with open('features_train.pickle', 'wb') as output:
    pickle.dump(features_train, output)

# labels_train
with open('labels_train.pickle', 'wb') as output:
    pickle.dump(labels_train, output)

# features_test
with open('features_test.pickle', 'wb') as output:
    pickle.dump(features_test, output)
    
# labels_test
with open('labels_test.pickle', 'wb') as output:
    pickle.dump(labels_test, output)
    
# TF-IDF object
with open('tfidf.pickle', 'wb') as output:
    pickle.dump(tfidf, output)