In [2]:
#PREPROCESS STEP 

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re

# DATA
# Both CSV exports sit in one folder; edit DATA_DIR alone to run this on another machine.
DATA_DIR = "C:\\Users\\nh013\\Desktop\\Religious Books (Islam and Christanity)"
# NOTE(review): the "train" frame is the Islam-books file and the "test" frame is the
# Christianity-books file -- two different sources, not a split of one dataset.
train_data = pd.read_csv(DATA_DIR + "\\books_Islam.csv")
test_data = pd.read_csv(DATA_DIR + "\\books_Christanity.csv")

# FUNCTION TO PREPROCESS TEXT
def preprocess_text(text):
    """Normalise one free-text cell into a space-separated string of stemmed tokens.

    Steps, in order: strip URLs, drop every character that is not an ASCII
    letter or whitespace, lower-case, tokenize, remove English stop words,
    then lemmatize and Porter-stem each remaining token.

    Args:
        text: Cell value. Only ``str`` values are processed.

    Returns:
        The cleaned string for ``str`` input; any other value (for example
        the NaN of a missing cell) is returned unchanged.

    Notes:
        * Text consisting only of non-ASCII letters comes back as ``""``
          because those characters are removed before tokenizing.
        * Apostrophes are removed before stop-word filtering, so a
          contraction such as "don't" reaches the filter as "dont".
    """
    if not isinstance(text, str):
        return text  # RETURN THE INPUT UNCHANGED IF IT'S NOT A STRING

    # Build the NLTK helpers on the first string seen and reuse them afterwards.
    # Calling stopwords.words() inside the per-token filter repeats the lookup
    # for every token and scans a list; a cached frozenset gives the same
    # membership answers in constant time.
    tools = getattr(preprocess_text, "_tools", None)
    if tools is None:
        tools = (
            frozenset(stopwords.words('english')),
            PorterStemmer(),
            WordNetLemmatizer(),
        )
        preprocess_text._tools = tools
    stop_words, stemmer, lemmatizer = tools

    # REMOVE URLS
    text = re.sub(r'http\S+', '', text)

    # REMOVE SPECIAL CHARACTERS AND CONVERT TO LOWER CASE
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower()

    # TOKENIZE TEXT INTO WORDS
    words = nltk.word_tokenize(text)

    # DROP STOP WORDS, THEN LEMMATIZE AND STEM EACH REMAINING WORD
    words = [
        stemmer.stem(lemmatizer.lemmatize(word))
        for word in words
        if word not in stop_words
    ]

    # JOIN THE WORDS BACK INTO A SINGLE STRING
    return ' '.join(words)
    
# PERFORM PREPROCESSING TO TRAIN AND TEST DATAFRAME
# The same four free-text columns are cleaned in both frames, train first.
for frame in (train_data, test_data):
    for column in ('title', 'author', 'description', 'genres'):
        frame[column] = frame[column].apply(preprocess_text)


print("Preprocessed Train Data:")
print(train_data)

print("\nPreprocessed Test Data:")
print(test_data)


Preprocessed Train Data:
                                                 title  \
0                                                        
1                    muhammad life base earliest sourc   
2                                                        
3              destini disrupt histori world islam eye   
4    reclaim heart person insight break free life s...   
..                                                 ...   
735              prophet pulpit commentari state islam   
736                                                      
737                                                      
738  conflict fit islam america evolutionari psycholog   
739                        scientif decept new atheist   

                        author  \
0                       anonym   
1                  martin ling   
2    safiur rahman mubarakpuri   
3                 tamim ansari   
4                 yasmin mogah   
..                         ...   
735         khale abou el fadl   
736   

In [7]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import TfidfVectorizer

# DATA
# Both CSV exports sit in one folder; edit DATA_DIR alone to run this on another machine.
DATA_DIR = "C:\\Users\\nh013\\Desktop\\Religious Books (Islam and Christanity)"
# NOTE(review): the "train" frame is the Islam-books file and the "test" frame is the
# Christianity-books file -- two different sources, not a split of one dataset.
train_data = pd.read_csv(DATA_DIR + "\\books_Islam.csv")
test_data = pd.read_csv(DATA_DIR + "\\books_Christanity.csv")

# FUNCTION TO PREPROCESS TEXT
def preprocess_text(text):
    """Normalise one free-text cell into a space-separated string of stemmed tokens.

    Steps, in order: strip URLs, drop every character that is not an ASCII
    letter or whitespace, lower-case, tokenize, remove English stop words,
    then lemmatize and Porter-stem each remaining token.

    Args:
        text: Cell value. Only ``str`` values are processed.

    Returns:
        The cleaned string for ``str`` input, and ``""`` for anything else
        (for example the NaN of a missing cell), so the result is always a
        string.

    Notes:
        * Text consisting only of non-ASCII letters comes back as ``""``
          because those characters are removed before tokenizing.
        * Apostrophes are removed before stop-word filtering, so a
          contraction such as "don't" reaches the filter as "dont".
    """
    if not isinstance(text, str):  # CHECK IF THE INPUT IS STRING
        return ""

    # Build the NLTK helpers on the first string seen and reuse them afterwards.
    # Calling stopwords.words() inside the per-token filter repeats the lookup
    # for every token and scans a list; a cached frozenset gives the same
    # membership answers in constant time.
    tools = getattr(preprocess_text, "_tools", None)
    if tools is None:
        tools = (
            frozenset(stopwords.words('english')),
            PorterStemmer(),
            WordNetLemmatizer(),
        )
        preprocess_text._tools = tools
    stop_words, stemmer, lemmatizer = tools

    # REMOVE URLS
    text = re.sub(r'http\S+', '', text)

    # REMOVE SPECIAL CHARACTERS AND CONVERT TO LOWER CASE
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower()

    # TOKENIZE TEXT INTO WORDS
    words = nltk.word_tokenize(text)

    # DROP STOP WORDS, THEN LEMMATIZE AND STEM EACH REMAINING WORD
    words = [
        stemmer.stem(lemmatizer.lemmatize(word))
        for word in words
        if word not in stop_words
    ]

    # JOIN THE WORDS BACK INTO A SINGLE STRING
    return ' '.join(words)

# PERFORM PREPROCESSING TO TRAIN AND TEST DATAFRAME
# The same four free-text columns are cleaned in both frames, train first.
for frame in (train_data, test_data):
    for column in ('title', 'author', 'description', 'genres'):
        frame[column] = frame[column].apply(preprocess_text)

# REPORT MISSING VALUES, THEN DROP INCOMPLETE ROWS BEFORE VECTORIZING
# Dropping first keeps every TF-IDF matrix at exactly one row per row that
# remains in its DataFrame; dropping after vectorizing would leave the
# matrices and the frames with different row counts.
print("Missing values in train_data:")
print(train_data.isnull().sum())


print("\nMissing values in test_data:")
print(test_data.isnull().sum())


# preprocess_text() has already turned every non-string text cell into "",
# so of the columns listed below only 'pages' can still be null at this point.
# NOTE(review): 'pages' is required for the train frame only -- confirm that
# this asymmetry with the test frame is intended.
train_data = train_data.dropna(subset=['title', 'author', 'description', 'pages', 'genres'])

test_data = test_data.dropna(subset=['title', 'author', 'description', 'genres'])


# TEXT VECTORIZATION USING TF-IDF
# One vectorizer per column. Re-fitting a single shared vectorizer on each
# column in turn leaves it holding only the last column's vocabulary, so the
# test title/author/description matrices would be encoded against the genres
# vocabulary instead of their own.
tfidf_vectorizers = {}
tfidf_train = {}
tfidf_test = {}
for column in ('title', 'author', 'description', 'genres'):
    column_vectorizer = TfidfVectorizer(max_features=1000, use_idf=True)
    # Learn vocabulary and IDF weights on train only, then reuse them for test.
    tfidf_train[column] = column_vectorizer.fit_transform(train_data[column])
    tfidf_test[column] = column_vectorizer.transform(test_data[column])
    tfidf_vectorizers[column] = column_vectorizer

# Backward-compatible alias: the single shared name points at the genres vectorizer.
tfidf_vectorizer = tfidf_vectorizers['genres']

tfidf_train_title = tfidf_train['title']
tfidf_train_author = tfidf_train['author']
tfidf_train_description = tfidf_train['description']
tfidf_train_genres = tfidf_train['genres']

tfidf_test_title = tfidf_test['title']
tfidf_test_author = tfidf_test['author']
tfidf_test_description = tfidf_test['description']
tfidf_test_genres = tfidf_test['genres']





# Print every TF-IDF matrix under its own heading, densified for display.
tfidf_report = [
    ("TF-IDF Vectorized Features for Train Data (Title):", tfidf_train_title),
    ("\nTF-IDF Vectorized Features for Train Data (Author):", tfidf_train_author),
    ("\nTF-IDF Vectorized Features for Train Data (Description):", tfidf_train_description),
    ("\nTF-IDF Vectorized Features for Train Data (Genres):", tfidf_train_genres),
    ("\nTF-IDF Vectorized Features for Test Data (Title):", tfidf_test_title),
    ("\nTF-IDF Vectorized Features for Test Data (Author):", tfidf_test_author),
    ("\nTF-IDF Vectorized Features for Test Data (Description):", tfidf_test_description),
    ("\nTF-IDF Vectorized Features for Test Data (Genres):", tfidf_test_genres),
]
for heading, matrix in tfidf_report:
    print(heading)
    print(matrix.toarray())


Missing values in train_data:
title          0
author         0
description    0
genres         0
pages          2
rating         0
dtype: int64

Missing values in test_data:
title          0
author         0
description    0
genres         0
pages          0
rating         0
dtype: int64
TF-IDF Vectorized Features for Train Data (Title):
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

TF-IDF Vectorized Features for Train Data (Author):
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

TF-IDF Vectorized Features for Train Data (Description):
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.        

In [2]:
#Build a Recurrent Neural Network (RNN) model to predict book ratings and suggest books based on their ratings

import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# DATA
# Both CSV exports sit in one folder; edit DATA_DIR alone to run this on another machine.
DATA_DIR = "C:\\Users\\nh013\\Desktop\\Religious Books (Islam and Christanity)"
# NOTE(review): the "train" frame is the Islam-books file and the "test" frame is the
# Christianity-books file -- two different sources, not a split of one dataset.
train_data = pd.read_csv(DATA_DIR + "\\books_Islam.csv")
test_data = pd.read_csv(DATA_DIR + "\\books_Christanity.csv")

# FUNCTION TO PREPROCESS TEXT

def preprocess_text(text):
    """Normalise one free-text cell into a space-separated string of stemmed tokens.

    Steps, in order: strip URLs, drop every character that is not an ASCII
    letter or whitespace, lower-case, tokenize, remove English stop words,
    then lemmatize and Porter-stem each remaining token.

    Args:
        text: Cell value. Only ``str`` values are processed.

    Returns:
        The cleaned string for ``str`` input, and ``""`` for anything else
        (for example the NaN of a missing cell), so the result is always a
        string.

    Notes:
        * Text consisting only of non-ASCII letters comes back as ``""``
          because those characters are removed before tokenizing.
        * Apostrophes are removed before stop-word filtering, so a
          contraction such as "don't" reaches the filter as "dont".
    """
    if not isinstance(text, str):
        return ""

    # Build the NLTK helpers on the first string seen and reuse them afterwards.
    # Calling stopwords.words() inside the per-token filter repeats the lookup
    # for every token and scans a list; a cached frozenset gives the same
    # membership answers in constant time.
    tools = getattr(preprocess_text, "_tools", None)
    if tools is None:
        tools = (
            frozenset(stopwords.words('english')),
            PorterStemmer(),
            WordNetLemmatizer(),
        )
        preprocess_text._tools = tools
    stop_words, stemmer, lemmatizer = tools

    text = re.sub(r'http\S+', '', text)              # strip URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower()  # ASCII letters/whitespace only, lower-cased
    words = nltk.word_tokenize(text)
    words = [
        stemmer.stem(lemmatizer.lemmatize(word))
        for word in words
        if word not in stop_words
    ]
    return ' '.join(words)

# PERFORM PREPROCESS TO TRAIN AND TEST SET
# The same four free-text columns are cleaned in both frames, train first.
for frame in (train_data, test_data):
    for column in ('title', 'author', 'description', 'genres'):
        frame[column] = frame[column].apply(preprocess_text)

# TOKENIZATION AND PADDING
max_words = 1000           # passed to Tokenizer as num_words
max_sequence_length = 500  # Define the maximum sequence length

# The word index is learned from the training descriptions only.
tokenizer = Tokenizer(num_words=max_words, split=' ')
tokenizer.fit_on_texts(train_data['description'])


def _encode_descriptions(descriptions):
    """Map descriptions to integer sequences and pad them to max_sequence_length."""
    sequences = tokenizer.texts_to_sequences(descriptions)
    return pad_sequences(sequences, maxlen=max_sequence_length)


X_train = _encode_descriptions(train_data['description'])
X_test = _encode_descriptions(test_data['description'])

# BUILD RNN MODEL
# Embedding -> three stacked LSTMs -> one linear unit (regression on the rating).
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_sequence_length),
    # The first two LSTMs emit the full sequence so the following LSTM has a sequence to read.
    LSTM(128, return_sequences=True),
    LSTM(128, return_sequences=True),
    LSTM(128),
    Dense(1, activation='linear'),
])

model.compile(optimizer='adam', loss='mean_squared_error')

# TRAIN MODEL TO PREDICT BOOK RATING
y_train = train_data['rating']
y_test = test_data['rating']

# The test set is also handed to fit() as validation data; no callbacks are
# passed, so it only supplies the per-epoch validation loss.
model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_test, y_test),
)

# EVALUATE MODEL ON THE TEST DATA
test_loss = model.evaluate(X_test, y_test)
print(f'Test loss: {test_loss}')

# MAKE PREDICTION ON TEST DATA
predictions = model.predict(X_test)

# SUGGEST BOOKS: attach each prediction to a copy of the test rows and
# sort so the highest predicted rating comes first.
suggested_books = (
    test_data
    .assign(predicted_rating=predictions)
    .sort_values(by='predicted_rating', ascending=False)
)


print(suggested_books[['title', 'predicted_rating']])


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss: 0.054866790771484375
                                                 title  predicted_rating
197            book mormon anoth testament jesu christ          4.258873
118                                    dark night soul          4.258873
393                                     case real jesu          4.258873
139                          dont enough faith atheist          4.258873
760                                                             4.258873
..                                                 ...               ...
570     next evangel free church western cultur captiv          4.011279
95             sever merci stori faith tragedi triumph          4.010810
528  next right thing simpl soul practic make life ...          4.006959
547                  abraham journey heart three faith          3.991318
442  jesu realli love gay christian pilgrimag searc...