In [1]:
# Title: Text Pre Processing - (Test on 3 chapters first - Revelation, Belief & Knowledge)
# Activity: 1-Create custom stopwords list | 2-Read data | 3-Tokenization | 4-Removing punctuations 
#         | 5-Removing stopwords | 6-Stemming   

In [2]:
#import modules
import numpy as np
from collections import Counter
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.stem.snowball import SnowballStemmer
import string

In [3]:
#Importing Nltk stopword package
nltk.download('stopwords')
from nltk.corpus import stopwords

#Loading Stopwords into a list
NLTK_stop_words_list=stopwords.words('english')
print(NLTK_stop_words_list)
print("Total numbers of stop words are ")
print(len(NLTK_stop_words_list))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
#1-Create stopwords list (removing sanad)
custom_stopword_list=['narrated','narrator','umar','bin','alkhattab','abu','aisha','etc','ie','us','narrate','narrates','narrating','narration', 
                      'narrations','narrators','said','abbas','aas','abul','aba','amr','abdullah','huraira','ibn','jabir','jubair','alansari','musa',
                      'abul','abulqasim','anas','ubada','assamit','alkhudri','sad','almarur','alahnaf','qais','albara','azib',
                      'talha','ubaidullah','annuman','bashir','jamra','masud','abi','waqqas','jarir','ziyad',
                      'ilaqa','malik','waqid','allaithi','abdurrahman','bakra','father','wail','muawiya','mahmud',
                      'rabia','alaas','asma','mulaika','alansari','zaid','khalid','aljuhani','burda','alas','ali',
                      'azzubair','salama','ashhubi','um','ammar','aswad','attufail','nafi','nuam','almujmir','tamim',
                      'kuraib','usama','ata','yasar','aiyub','qatada','humran','urwa','uthman','affan','muhammad',
                      'juraij','ubaid','umatiya','seereen','hamza','adi','hatim','almughira','shuba','bakr','yahya',
                      'almazini','juhaifa','shihab','assaib','yazid','thabit','jafar','umaiya','addamri','suwaid',
                      'alnuman','maimuna','amir','numan','qais','binti','bint','mihsin','hudhaifa','wail','sulaiman',
                      'yasar','maimun','qilaba','hazim','mutim','alharith','almuntathir','hisham','hani','talib','umsalama',
                      'ubai','kab','alqasim','alaswad','muadha','zainab','samura','jundub','juhaim','abza','imran','shaqiq',
                      'alamash','husain','alkhuzai','dhar','almunkadir','murra','sahl','mughira','shuba','abdul','aziz','uqba',
                      'shaddad','ishaq','maslama','ibrahim','siyah','dinar','mujahid','bara','itban','attaiyah','alaslami',
                      'ikrima','alkhaulani','hassan','rafi','alsaib','abbad','aun','alubaid','busr','ghailan','azzuhri',
                      'alghifar','alminhal','saiyar','hunaif','almahh','salim','khadij','almuzani','barza','malih',
                      'abulminhal','qurra','humaid','isa','hafsa','mughaffal','almuzani','huwairith','buhaina','rabi',
                      'assaidi','utba','khiyar','muadh','jabal','mamar','marwan','alhakam','mutarrif','musab','wahb','rifaa',
                      'azzuraqi','ata','mabad','warrad','ashshaibani','abis','umm','salmanalfarsi','tawus','salman','umama',
                      'fatima','almundhir','taghlib','hummaid','almiswar','makhrama','shuaib','althaqafi','masruq','namir',
                      'sharik','addahhak','zahdam','raja','alutaridi']

In [5]:
final_stopword_list = custom_stopword_list + NLTK_stop_words_list
print(final_stopword_list)
print(len(final_stopword_list))

['narrated', 'narrator', 'umar', 'bin', 'alkhattab', 'abu', 'aisha', 'etc', 'ie', 'us', 'narrate', 'narrates', 'narrating', 'narration', 'narrations', 'narrators', 'said', 'abbas', 'aas', 'abul', 'aba', 'amr', 'abdullah', 'huraira', 'ibn', 'jabir', 'jubair', 'alansari', 'musa', 'abul', 'abulqasim', 'anas', 'ubada', 'assamit', 'alkhudri', 'sad', 'almarur', 'alahnaf', 'qais', 'albara', 'azib', 'talha', 'ubaidullah', 'annuman', 'bashir', 'jamra', 'masud', 'abi', 'waqqas', 'jarir', 'ziyad', 'ilaqa', 'malik', 'waqid', 'allaithi', 'abdurrahman', 'bakra', 'father', 'wail', 'muawiya', 'mahmud', 'rabia', 'alaas', 'asma', 'mulaika', 'alansari', 'zaid', 'khalid', 'aljuhani', 'burda', 'alas', 'ali', 'azzubair', 'salama', 'ashhubi', 'um', 'ammar', 'aswad', 'attufail', 'nafi', 'nuam', 'almujmir', 'tamim', 'kuraib', 'usama', 'ata', 'yasar', 'aiyub', 'qatada', 'humran', 'urwa', 'uthman', 'affan', 'muhammad', 'juraij', 'ubaid', 'umatiya', 'seereen', 'hamza', 'adi', 'hatim', 'almughira', 'shuba', 'bakr'

In [6]:
#Try to implement into Hadith content (Test on 3 chapter dulu - Revelation, Belief & Knowledge)

#2-Read data into pandas dataframe
df = pd.read_csv(r"C:\Users\Acer\anaconda3\activity\Sahih_Bukhari.csv")
df

Unnamed: 0,id,hadith_id,source,chapter_no,hadith_no,chapter,text_en
0,0,1,Sahih Bukhari,1,1,Revelation,Narrated 'Umar bin Al-Khattab: ...
1,1,2,Sahih Bukhari,1,2,Revelation,Narrated 'Aisha: ...
2,2,3,Sahih Bukhari,1,3,Revelation,Narrated 'Aisha: (the m...
3,3,4,Sahih Bukhari,1,4,Revelation,Narrated Jabir bin 'Abdullah Al-Ansari while ...
4,4,5,Sahih Bukhari,1,5,Revelation,Narrated Said bin Jubair: ...
...,...,...,...,...,...,...,...
129,71,130,Sahih Bukhari,3,130,Knowledge,Narrated Um Salama: Um-Su...
130,72,131,Sahih Bukhari,3,131,Knowledge,Narrated `Abdullah bin `Umar: ...
131,73,132,Sahih Bukhari,3,132,Knowledge,Narrated `Ali: I used to ...
132,74,133,Sahih Bukhari,3,133,Knowledge,Narrated Nafi`: `Abdullah...


In [7]:
#3-Tokenization (split into tokens)

def word_tokenization(text):
    text = word_tokenize(text.lower())
    return " ".join([word for word in str(text).split()])

df["text_token"] = df["text_en"].apply(lambda text: word_tokenization(text))
df

Unnamed: 0,id,hadith_id,source,chapter_no,hadith_no,chapter,text_en,text_token
0,0,1,Sahih Bukhari,1,1,Revelation,Narrated 'Umar bin Al-Khattab: ...,"['narrated', ""'umar"", 'bin', 'al-khattab', ':'..."
1,1,2,Sahih Bukhari,1,2,Revelation,Narrated 'Aisha: ...,"['narrated', ""'aisha"", ':', '(', 'the', 'mothe..."
2,2,3,Sahih Bukhari,1,3,Revelation,Narrated 'Aisha: (the m...,"['narrated', ""'aisha"", ':', '(', 'the', 'mothe..."
3,3,4,Sahih Bukhari,1,4,Revelation,Narrated Jabir bin 'Abdullah Al-Ansari while ...,"['narrated', 'jabir', 'bin', ""'abdullah"", 'al-..."
4,4,5,Sahih Bukhari,1,5,Revelation,Narrated Said bin Jubair: ...,"['narrated', 'said', 'bin', 'jubair', ':', 'ib..."
...,...,...,...,...,...,...,...,...
129,71,130,Sahih Bukhari,3,130,Knowledge,Narrated Um Salama: Um-Su...,"['narrated', 'um', 'salama', ':', 'um-sulaim',..."
130,72,131,Sahih Bukhari,3,131,Knowledge,Narrated `Abdullah bin `Umar: ...,"['narrated', '`', 'abdullah', 'bin', '`', 'uma..."
131,73,132,Sahih Bukhari,3,132,Knowledge,Narrated `Ali: I used to ...,"['narrated', '`', 'ali', ':', 'i', 'used', 'to..."
132,74,133,Sahih Bukhari,3,133,Knowledge,Narrated Nafi`: `Abdullah...,"['narrated', 'nafi', '`', ':', '`', 'abdullah'..."


In [8]:
#4-Removing punctuations 
def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text

df["text_punc"] = df['text_token'].apply(remove_punctuations)
df

Unnamed: 0,id,hadith_id,source,chapter_no,hadith_no,chapter,text_en,text_token,text_punc
0,0,1,Sahih Bukhari,1,1,Revelation,Narrated 'Umar bin Al-Khattab: ...,"['narrated', ""'umar"", 'bin', 'al-khattab', ':'...",narrated umar bin alkhattab i heard allah s a...
1,1,2,Sahih Bukhari,1,2,Revelation,Narrated 'Aisha: ...,"['narrated', ""'aisha"", ':', '(', 'the', 'mothe...",narrated aisha the mother of the faithful be...
2,2,3,Sahih Bukhari,1,3,Revelation,Narrated 'Aisha: (the m...,"['narrated', ""'aisha"", ':', '(', 'the', 'mothe...",narrated aisha the mother of the faithful be...
3,3,4,Sahih Bukhari,1,4,Revelation,Narrated Jabir bin 'Abdullah Al-Ansari while ...,"['narrated', 'jabir', 'bin', ""'abdullah"", 'al-...",narrated jabir bin abdullah alansari while tal...
4,4,5,Sahih Bukhari,1,5,Revelation,Narrated Said bin Jubair: ...,"['narrated', 'said', 'bin', 'jubair', ':', 'ib...",narrated said bin jubair ibn abbas in the exp...
...,...,...,...,...,...,...,...,...,...
129,71,130,Sahih Bukhari,3,130,Knowledge,Narrated Um Salama: Um-Su...,"['narrated', 'um', 'salama', ':', 'um-sulaim',...",narrated um salama umsulaim came to allah s a...
130,72,131,Sahih Bukhari,3,131,Knowledge,Narrated `Abdullah bin `Umar: ...,"['narrated', '`', 'abdullah', 'bin', '`', 'uma...",narrated abdullah bin umar once allah s apo...
131,73,132,Sahih Bukhari,3,132,Knowledge,Narrated `Ali: I used to ...,"['narrated', '`', 'ali', ':', 'i', 'used', 'to...",narrated ali i used to get the emotional ure...
132,74,133,Sahih Bukhari,3,133,Knowledge,Narrated Nafi`: `Abdullah...,"['narrated', 'nafi', '`', ':', '`', 'abdullah'...",narrated nafi abdullah bin umar said a m...


In [9]:
#5-Removing stopwords 
stops = set(final_stopword_list)
def remove_stopwords(text):
    return " ".join([word for word in str(text).split()
                    if word not in stops])

df["text_removestopword"] = df["text_punc"].apply(lambda text: remove_stopwords(text))
df

Unnamed: 0,id,hadith_id,source,chapter_no,hadith_no,chapter,text_en,text_token,text_punc,text_removestopword
0,0,1,Sahih Bukhari,1,1,Revelation,Narrated 'Umar bin Al-Khattab: ...,"['narrated', ""'umar"", 'bin', 'al-khattab', ':'...",narrated umar bin alkhattab i heard allah s a...,heard allah apostle saying reward deeds depend...
1,1,2,Sahih Bukhari,1,2,Revelation,Narrated 'Aisha: ...,"['narrated', ""'aisha"", ':', '(', 'the', 'mothe...",narrated aisha the mother of the faithful be...,mother faithful believers asked allah apostle ...
2,2,3,Sahih Bukhari,1,3,Revelation,Narrated 'Aisha: (the m...,"['narrated', ""'aisha"", ':', '(', 'the', 'mothe...",narrated aisha the mother of the faithful be...,mother faithful believers commencement divine ...
3,3,4,Sahih Bukhari,1,4,Revelation,Narrated Jabir bin 'Abdullah Al-Ansari while ...,"['narrated', 'jabir', 'bin', ""'abdullah"", 'al-...",narrated jabir bin abdullah alansari while tal...,talking period pause revelation reporting spee...
4,4,5,Sahih Bukhari,1,5,Revelation,Narrated Said bin Jubair: ...,"['narrated', 'said', 'bin', 'jubair', ':', 'ib...",narrated said bin jubair ibn abbas in the exp...,explanation statement allah move tongue concer...
...,...,...,...,...,...,...,...,...,...,...
129,71,130,Sahih Bukhari,3,130,Knowledge,Narrated Um Salama: Um-Su...,"['narrated', 'um', 'salama', ':', 'um-sulaim',...",narrated um salama umsulaim came to allah s a...,umsulaim came allah apostle verily allah shy t...
130,72,131,Sahih Bukhari,3,131,Knowledge,Narrated `Abdullah bin `Umar: ...,"['narrated', '`', 'abdullah', 'bin', '`', 'uma...",narrated abdullah bin umar once allah s apo...,allah apostle amongst trees tree leaves fall l...
131,73,132,Sahih Bukhari,3,132,Knowledge,Narrated `Ali: I used to ...,"['narrated', '`', 'ali', ':', 'i', 'used', 'to...",narrated ali i used to get the emotional ure...,used get emotional urethral discharge frequent...
132,74,133,Sahih Bukhari,3,133,Knowledge,Narrated Nafi`: `Abdullah...,"['narrated', 'nafi', '`', ':', '`', 'abdullah'...",narrated nafi abdullah bin umar said a m...,man got mosque allah apostle place order assum...


In [10]:
#6-Stemming (valid stems)
porter = PorterStemmer()
def stemming(text):
    token_words=word_tokenize(text)
    token_words
    stem_sentence=[]
    for word in token_words:
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

df['text_cleaned'] = df['text_removestopword'].apply(lambda text: stemming(text))  
#df = df.drop(columns=['unstemmed']) # Get rid of the unstemmed column.
df 

Unnamed: 0,id,hadith_id,source,chapter_no,hadith_no,chapter,text_en,text_token,text_punc,text_removestopword,text_cleaned
0,0,1,Sahih Bukhari,1,1,Revelation,Narrated 'Umar bin Al-Khattab: ...,"['narrated', ""'umar"", 'bin', 'al-khattab', ':'...",narrated umar bin alkhattab i heard allah s a...,heard allah apostle saying reward deeds depend...,heard allah apostl say reward deed depend upon...
1,1,2,Sahih Bukhari,1,2,Revelation,Narrated 'Aisha: ...,"['narrated', ""'aisha"", ':', '(', 'the', 'mothe...",narrated aisha the mother of the faithful be...,mother faithful believers asked allah apostle ...,mother faith believ ask allah apostl allah apo...
2,2,3,Sahih Bukhari,1,3,Revelation,Narrated 'Aisha: (the m...,"['narrated', ""'aisha"", ':', '(', 'the', 'mothe...",narrated aisha the mother of the faithful be...,mother faithful believers commencement divine ...,mother faith believ commenc divin inspir allah...
3,3,4,Sahih Bukhari,1,4,Revelation,Narrated Jabir bin 'Abdullah Al-Ansari while ...,"['narrated', 'jabir', 'bin', ""'abdullah"", 'al-...",narrated jabir bin abdullah alansari while tal...,talking period pause revelation reporting spee...,talk period paus revel report speech prophet w...
4,4,5,Sahih Bukhari,1,5,Revelation,Narrated Said bin Jubair: ...,"['narrated', 'said', 'bin', 'jubair', ':', 'ib...",narrated said bin jubair ibn abbas in the exp...,explanation statement allah move tongue concer...,explan statement allah move tongu concern qura...
...,...,...,...,...,...,...,...,...,...,...,...
129,71,130,Sahih Bukhari,3,130,Knowledge,Narrated Um Salama: Um-Su...,"['narrated', 'um', 'salama', ':', 'um-sulaim',...",narrated um salama umsulaim came to allah s a...,umsulaim came allah apostle verily allah shy t...,umsulaim came allah apostl verili allah shi te...
130,72,131,Sahih Bukhari,3,131,Knowledge,Narrated `Abdullah bin `Umar: ...,"['narrated', '`', 'abdullah', 'bin', '`', 'uma...",narrated abdullah bin umar once allah s apo...,allah apostle amongst trees tree leaves fall l...,allah apostl amongst tree tree leav fall like ...
131,73,132,Sahih Bukhari,3,132,Knowledge,Narrated `Ali: I used to ...,"['narrated', '`', 'ali', ':', 'i', 'used', 'to...",narrated ali i used to get the emotional ure...,used get emotional urethral discharge frequent...,use get emot urethr discharg frequent request ...
132,74,133,Sahih Bukhari,3,133,Knowledge,Narrated Nafi`: `Abdullah...,"['narrated', 'nafi', '`', ':', '`', 'abdullah'...",narrated nafi abdullah bin umar said a m...,man got mosque allah apostle place order assum...,man got mosqu allah apostl place order assum i...


In [11]:
df.to_csv('sahihbukhariaftercleaned.csv')