# Story Finder & Generator - Part 1
## Vectorizer and Retrieval
---
**IRS Innovative Assignment (Even 2023-24)** <br>
**Roll No. and Names:**<br>
&emsp;21BCE183 Parv Thacker <br>
&emsp;21BCE201 Kaju Patel <br>
&emsp;21BCE250 Tanvi Rathod <br>

---
---

# Required Installations 

In [1]:
!pip install gensim nltk

import nltk
nltk.download('wordnet')

!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/verb.exc  
  inflating: /usr/share/nltk_data/co

# Importing Libraries

In [2]:
# Basic
import pandas as pd

# Extract Data - JSON
from pathlib import Path
import json

#Doc2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

#selection of response
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

#Scraping
import requests
from bs4 import BeautifulSoup
from pprint import pprint

# Hide LLM usage warnings
import warnings
warnings.filterwarnings('ignore')

# Modules

## Extract Data (from Dataset)

In [3]:
def extract_data_json(path=r'/kaggle/input/goodreads-reviews-and-details-2017/goodreads_reviews_spoiler_raw.json'):
    """Load reviews from a JSON-lines file into a DataFrame.

    Each non-empty line of the file is parsed as one JSON object.

    Args:
        path: Location of the JSON-lines file. Defaults to the Kaggle
            Goodreads spoiler-review dataset used by this notebook.

    Returns:
        pandas.DataFrame with exactly the columns 'book_id', 'review_text'
        and 'n_votes'.

    Raises:
        KeyError: if any of the three expected columns is absent from the data.
        json.JSONDecodeError: if a non-empty line is not valid JSON.
    """
    records = []

    # Read JSON (one object per line); blank lines are skipped instead of
    # being handed to json.loads, which would raise on them.
    with Path(path).open('r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                records.append(json.loads(line))

    # Create dataframe (nested objects are flattened into dotted column names)
    df = pd.json_normalize(records)

    # Extract specific columns: 'book_id', 'review_text', 'n_votes'
    desired_columns = ['book_id', 'review_text', 'n_votes']
    return df[desired_columns]

def extract_data_csv(path='/kaggle/input/goodreads-review-manual-csv/reviews.csv'):
    """Load reviews from a CSV file and align its column names with the JSON data.

    Args:
        path: Location of the CSV file. Defaults to the manually collected
            review file used by this notebook.

    Returns:
        pandas.DataFrame in which the 'review' column is renamed to
        'review_text' and 'vote_count' to 'n_votes'. Other columns are
        returned unchanged.
    """
    reviews_df = pd.read_csv(path)
    # Return a renamed frame rather than mutating in place.
    return reviews_df.rename(columns={'review': 'review_text', 'vote_count': 'n_votes'})


## Convert Reviews to Vectors - Doc2Vec

In [4]:
def preprocess_text(text):
    """Turn a raw review string into a list of word tokens.

    The text is lower-cased, every character in ``string.punctuation`` is
    removed, and the remainder is split with nltk's ``word_tokenize``.

    Args:
        text: The string to normalise.

    Returns:
        The list of tokens produced by ``word_tokenize``.
    """
    # Translation table that deletes ASCII punctuation characters.
    punctuation_stripper = str.maketrans('', '', string.punctuation)

    # Case folding followed by punctuation removal.
    normalised = text.lower().translate(punctuation_stripper)

    # Stop-word removal and lemmatization are currently not applied; the
    # tokens are returned exactly as produced by the tokenizer.
    return word_tokenize(normalised)

def train_doc2vec_model(df, vector_size=500, window=10, min_count=1, epochs=5):
    """Train a gensim Doc2Vec model on the tokenised reviews in ``df``.

    Args:
        df: DataFrame with a 'preprocessed_text' column holding one token
            list per row.
        vector_size: Passed through to ``Doc2Vec``.
        window: Passed through to ``Doc2Vec``.
        min_count: Passed through to ``Doc2Vec``.
        epochs: Passed through to ``Doc2Vec``.

    Returns:
        The trained ``Doc2Vec`` model.
    """
    # Each document is tagged with its position in the column (0, 1, 2, ...),
    # not with the DataFrame's index label.
    corpus = []
    for position, tokens in enumerate(df['preprocessed_text']):
        corpus.append(TaggedDocument(words=tokens, tags=[position]))

    # dm=0 selects gensim's distributed bag-of-words training mode.
    hyperparameters = dict(
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        epochs=epochs,
        dm=0,
    )
    doc2vec = Doc2Vec(**hyperparameters)

    doc2vec.build_vocab(corpus)
    doc2vec.train(corpus, total_examples=doc2vec.corpus_count, epochs=doc2vec.epochs)

    return doc2vec

def infer_doc2vec_vectors(model, texts):
    """Infer one vector per entry of ``texts`` using ``model.infer_vector``.

    Args:
        model: Object exposing ``infer_vector(tokens)``.
        texts: pandas Series whose elements are passed to ``infer_vector``
            one at a time.

    Returns:
        pandas Series with the same index as ``texts`` holding the inferred
        values.
    """
    def _embed(tokens):
        return model.infer_vector(tokens)

    return texts.apply(_embed)


## Driver Code - Doc2Vec

In [5]:
def process_reviews(df, vector_size=500, window=10, min_count=1, epochs=5):
    """Tokenise reviews, train Doc2Vec on them and build one row per book.

    Note that ``df`` is modified in place: the columns 'preprocessed_text'
    and 'doc2vec' are added to it.

    Args:
        df: DataFrame with 'book_id', 'review_text' and 'n_votes' columns.
        vector_size: Forwarded to ``train_doc2vec_model``.
        window: Forwarded to ``train_doc2vec_model``.
        min_count: Forwarded to ``train_doc2vec_model``.
        epochs: Forwarded to ``train_doc2vec_model``.

    Returns:
        Tuple ``(df_merged, model)``. ``df_merged`` has one row per
        'book_id' with the first 'review_text', the maximum 'n_votes' and
        the element-wise mean of that book's review vectors in 'doc2vec'.
        ``model`` is the trained Doc2Vec model.
    """
    # Preprocess text
    df['preprocessed_text'] = df['review_text'].apply(preprocess_text)

    # Train Doc2Vec model
    model = train_doc2vec_model(
        df=df,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        epochs=epochs,
    )

    # Infer Doc2Vec vectors
    df['doc2vec'] = infer_doc2vec_vectors(
        model=model,
        texts=df['preprocessed_text'])

    # Group by 'book_id' for the scalar aggregations.
    grouped = df.groupby('book_id')
    df_merged = grouped.agg({
        'review_text': 'first',
        'n_votes': 'max',
    }).reset_index()

    # Average the vectors explicitly: the 'doc2vec' column has object dtype
    # (one array per cell), so the built-in 'mean' aggregation would depend
    # on pandas' object-dtype fallback behaviour.
    mean_vectors = {
        book_id: np.mean(np.stack(vectors.tolist()), axis=0)
        for book_id, vectors in grouped['doc2vec']
    }
    df_merged['doc2vec'] = [mean_vectors[book_id] for book_id in df_merged['book_id']]

    return df_merged, model

# Filters to remove small reviews
def filter_reviews_by_length(df, min_length_chars=1000, min_length_words=100):
    """Keep only the rows whose review is long enough.

    A row is kept when its 'review_text' has more than ``min_length_chars``
    characters OR more than ``min_length_words`` whitespace-separated words.
    Rows whose 'review_text' is not a string (e.g. missing values) are
    dropped instead of raising ``TypeError``.

    Args:
        df: DataFrame with a 'review_text' column.
        min_length_chars: Character-count threshold (exclusive).
        min_length_words: Word-count threshold (exclusive).

    Returns:
        A new DataFrame (an independent copy, original index labels kept)
        containing the rows that pass the filter.
    """
    def _is_long_enough(text):
        if not isinstance(text, str):
            return False
        return len(text) > min_length_chars or len(text.split()) > min_length_words

    # astype(bool) keeps the mask boolean even when the frame is empty.
    keep = df['review_text'].apply(_is_long_enough).astype(bool)

    # Copy so that callers can add columns without writing into a slice of df.
    return df[keep].copy()

## Retrieval Model

In [6]:
class ReviewSimilarityFinder:
    """Rank the reviews in a DataFrame by similarity to a free-text query.

    Similarity is the cosine similarity between the Doc2Vec vector inferred
    for the query and the vector inferred for each row's 'review_text'.
    """

    def __init__(self, model, df, vector_size=500, window=10, min_count=1, epochs=5):
        """Store the model and the review frame.

        Args:
            model: Trained model exposing ``infer_vector(tokens)``.
            df: DataFrame with 'book_id' and 'review_text' columns.
            vector_size, window, min_count, epochs: Accepted for backward
                compatibility; they are not used by this class.
        """
        self.df = df
        self.model = model
        # (frame, matrix) pair filled lazily by _review_vectors().
        self._vector_cache = None

    def _review_vectors(self):
        """Return a 2-D array with one inferred vector per row of ``self.df``.

        The vectors are inferred once and reused by later queries; they are
        recomputed when ``self.df`` is replaced or changes length.
        """
        cache = getattr(self, '_vector_cache', None)
        if cache is None or cache[0] is not self.df or len(cache[1]) != len(self.df):
            vectors = np.array([
                self.model.infer_vector(preprocess_text(text))
                for text in self.df['review_text']
            ])
            cache = (self.df, vectors)
            self._vector_cache = cache
        return cache[1]

    def find_similar_reviews(self, user_query_text, top_k=5):
        """Return the ``top_k`` rows most similar to ``user_query_text``.

        As a side effect the scores are written to a 'cosine_similarity'
        column of ``self.df``.

        Args:
            user_query_text: Free-text query.
            top_k: Maximum number of results.

        Returns:
            Tuple ``(book_ids, review_texts)`` of two equally long lists,
            ordered from most to least similar.
        """
        # Nothing to rank.
        if len(self.df) == 0:
            return [], []

        user_query_tokens = preprocess_text(user_query_text)
        user_query_vector = self.model.infer_vector(user_query_tokens)

        # One vectorised call scores every review against the query.
        # NOTE(review): a precomputed 'doc2vec' column may exist on self.df,
        # but it is not used here; vectors are inferred from 'review_text'.
        scores = cosine_similarity(self._review_vectors(), [user_query_vector])[:, 0]
        self.df['cosine_similarity'] = scores

        top_k_similar_reviews = self.df.sort_values(by='cosine_similarity', ascending=False).head(top_k)

        top_k_book_ids = top_k_similar_reviews['book_id'].tolist()
        top_k_review_texts = top_k_similar_reviews['review_text'].tolist()

        return top_k_book_ids, top_k_review_texts

    def display_similar_reviews(self, top_k_book_ids, top_k_review_texts):
        """Print each book id and review text, separated by blank lines."""
        for book_id, review_text in zip(top_k_book_ids, top_k_review_texts):
            print(f"Book ID: {book_id}")
            print(f"Review Text: {review_text}")
            print()

    def display_top_similar_reviews(self, top_k_book_ids, top_k_review_texts):
        """Print a numbered menu with a Goodreads link and the review text."""
        for rank, (book_id, review_text) in enumerate(zip(top_k_book_ids, top_k_review_texts), start=1):
            goodreads_url = f"https://www.goodreads.com/book/show/{book_id}"
            # Replace line breaks with spaces so the words on either side of
            # a newline are not glued together.
            formatted_review_text = review_text.replace("\n", " ")

            print(f"\nSelect {rank} for:\n")
            print(f"Goodreads:       {goodreads_url}")
            print(f"Review Text:     {formatted_review_text}")
            print("-" * 50)
    


## Selection

In [7]:
def get_valid_number_input(prompt, low, high):
    """Prompt until the user enters an integer within ``[low, high]``.

    Args:
        prompt: Text shown by ``input``.
        low: Smallest accepted value (inclusive).
        high: Largest accepted value (inclusive).

    Returns:
        The accepted integer.
    """
    while True:
        try:
            candidate = int(input(prompt))
        except ValueError:
            # Not parseable as an integer: explain and ask again.
            print("Invalid input. Please enter an integer.")
            continue

        if low <= candidate <= high:
            return candidate

        print(f"Please enter a number between {low} and {high}.")


## Web Scraper

In [8]:
def extract_book_info(selected_book_id = None, url = None):
    """Scrape title, rating, counts and description from a Goodreads book page.

    Args:
        selected_book_id: Goodreads book id; used to build the page URL when
            ``url`` is not given.
        url: Full page URL. When provided it takes precedence over
            ``selected_book_id``.

    Returns:
        dict with the keys 'title', 'rating', 'ratings_count',
        'reviews_count' and 'description'. A field that cannot be found or
        parsed holds a placeholder string. When the HTTP status is not 200
        the dict additionally contains 'status_code' and 'message', and all
        five fields hold their placeholders.

    Raises:
        requests.exceptions.RequestException: if the request itself fails
            (connection error, timeout, ...).
    """
    not_found = "Information not found."

    def _parse_count(element):
        # Pull the digits out of a label such as "63,494 ratings" so that
        # neither thousands separators nor singular/plural wording matter.
        if element is None:
            return not_found
        digits = "".join(ch for ch in element.text if '0' <= ch <= '9')
        return int(digits) if digits else not_found

    # URL of the webpage containing the book information
    if url is None:
        url = 'https://www.goodreads.com/book/show/' + str(selected_book_id)

    # Start from placeholders so every expected key is always present.
    book_info = {
        "title": not_found,
        "rating": not_found,
        "ratings_count": not_found,
        "reviews_count": not_found,
        "description": "Description not found.",
    }

    # A timeout keeps the notebook from hanging on an unresponsive server.
    response = requests.get(url, timeout=30)

    if response.status_code != 200:
        book_info["status_code"] = response.status_code
        book_info["message"] = "Failed to retrieve data."
        return book_info

    soup = BeautifulSoup(response.content, 'html.parser')

    # Title
    title_element = soup.find('h1', class_='Text Text__title1')
    if title_element is not None:
        book_info["title"] = title_element.text.strip()

    # Rating
    rating_div = soup.find('div', class_='RatingStatistics__rating')
    if rating_div is not None:
        try:
            book_info["rating"] = float(rating_div.text.strip())
        except ValueError:
            # Unparseable rating text: keep the placeholder.
            pass

    # Rating count and review count
    meta_div = soup.find('div', class_='RatingStatistics__meta')
    if meta_div is not None:
        book_info["ratings_count"] = _parse_count(
            meta_div.find('span', {'data-testid': 'ratingsCount'}))
        book_info["reviews_count"] = _parse_count(
            meta_div.find('span', {'data-testid': 'reviewsCount'}))

    # Description
    description_div = soup.find('div', class_='DetailsLayoutRightParagraph__widthConstrained')
    if description_div is not None:
        book_info["description"] = description_div.text.strip()

    return book_info

---
---

# Driver Code

## Get Data

In [9]:
if __name__ == "__main__":
    # Get Data: load both review sources, then stack them into a single
    # frame with a fresh 0..n-1 index.
    df1 = extract_data_json()
    df2 = extract_data_csv()
    df = pd.concat([df1, df2], ignore_index=True)

## Process Data & Doc2Vec

In [10]:
if __name__ == "__main__":
    # Work on the first 100,000 rows of the combined frame only.
    df_ = df.head(100000)
        
    # Process Reviews: keep rows whose review has more than 1000 characters
    # or more than 100 words, then vectorise them.
    df_filtered = filter_reviews_by_length(df_, min_length_chars=1000, min_length_words=100)
    # process_reviews also adds 'preprocessed_text' and 'doc2vec' columns to
    # df_filtered itself, in addition to returning the per-book frame.
    df_processed, model = process_reviews(df_filtered)
    
    # Show the first filtered rows, including the columns added above.
    print(df_filtered.head())

    book_id                                        review_text  n_votes  \
0  18245960  This is a special book. It started slow for ab...       28   
2  28684704  A fun, fast paced science fiction thriller. I ...       22   
4  25884323  I really enjoyed this book, and there is a lot...        9   
5  19398490  A beautiful story. It is rare to encounter a b...       35   
8  22551730  Another hard to put down nonfiction book from ...       20   

                                   preprocessed_text  \
0  [this, is, a, special, book, it, started, slow...   
2  [a, fun, fast, paced, science, fiction, thrill...   
4  [i, really, enjoyed, this, book, and, there, i...   
5  [a, beautiful, story, it, is, rare, to, encoun...   
8  [another, hard, to, put, down, nonfiction, boo...   

                                             doc2vec  
0  [-0.09230338, 0.0180572, 0.020138972, 0.062459...  
2  [0.073632814, 0.03159117, 0.038643252, 0.05672...  
4  [0.070863, -0.17642756, 0.18815583, 0.036307

## Initialize Similarity Finder

In [11]:
# Initialize ReviewSimilarityFinder instance with the trained model and the
# per-book frame returned by process_reviews
review_finder = ReviewSimilarityFinder(model, df_processed)

## Find Similar

In [12]:
# User Query: free-text description that is passed to find_similar_reviews
user_query_text = "Chinese Mythology with aliens and science twist"

In [13]:
# Find top 5 similar reviews based on user query (computed once; the search
# scores every review, so repeating the identical call only wastes time)
top_5_book_ids, top_5_review_texts = review_finder.find_similar_reviews(user_query_text)

# Display top 5 similar reviews
review_finder.display_top_similar_reviews(top_5_book_ids, top_5_review_texts)


Select 1 for:

Goodreads:       https://www.goodreads.com/book/show/41821
Review Text:     This novel was born of someone mentioning an isotope that couldn't exist in our universe. Out of that morsel Asimov managed to write a novel exploring parallel universes, alien societies, the Big Bang, space exploration and lunar colonies.  This is classic Asimov proving why he was so prolific, turning a basic question into a fully fledge hypothesis in the form of a novel. Typically for him, the prose isn't the strongest and the final third of the book is rather weak, but for some solid science fiction (and extra points for the excellent use of a Schiller quote) you could do worse than this.
--------------------------------------------------

Select 2 for:

Goodreads:       https://www.goodreads.com/book/show/17167572
Review Text:     To start off with, the title is a fallacy: there's no war here, long or otherwise.  Instead, the book has the exact same problem as the original: It's a dozen or s

## Select & Get Data from Web

In [14]:
if __name__ == "__main__":
    # Offer only as many choices as were actually retrieved, so a shorter
    # result list cannot lead to an out-of-range index below.
    top_k = len(top_5_book_ids)
    if top_k == 0:
        raise RuntimeError("No similar reviews were found to select from.")

    # Prompt selection (1-based for the user, 0-based for the list)
    number = get_valid_number_input(f"Please enter a number between 1 and {top_k}: ", 1, top_k)
    selected_book_id = top_5_book_ids[number-1]

    print("You selected:", number)

Please enter a number between 1 and 5:  1


You selected: 1


In [15]:
# Scrape book data for the selected book id from its Goodreads page
book_data = extract_book_info(selected_book_id)

In [16]:
# Summarise the scraped fields of the chosen book.
print("You have Selected:")
print(f"Title:           {book_data['title']}")
print(f"Rating:          {book_data['rating']} • ({book_data['ratings_count']} Ratings & {book_data['reviews_count']} Reviews)")
print(f"Description:     {book_data['description']}")

You have Selected:
Title:           The Gods Themselves
Rating:          4.09 • (63494 Ratings & 2673 Reviews)
Description:     In the twenty-second century Earth obtains limitless, free energy from a source science little understands: an exchange between Earth and a parallel universe, using a process devised by the aliens. But even free energy has a price. The transference process itself will eventually lead to the destruction of the Earth's Sun--and of Earth itself.Only a few know the terrifying truth--an outcast Earth scientist, a rebellious alien inhabitant of a dying planet, a lunar-born human intuitionist who senses the imminent annihilation of the Sun.  They know the truth--but who will listen?  They have foreseen the cost of abundant energy--but who will believe?  These few beings, human and alien, hold the key to the Earth's survival.


## Copy this string for Part 2

In [17]:
# Echo the scraped dictionary so that its repr can be copied.
book_data

{'title': 'The Gods Themselves',
 'rating': 4.09,
 'ratings_count': 63494,
 'reviews_count': 2673,
 'description': "In the twenty-second century Earth obtains limitless, free energy from a source science little understands: an exchange between Earth and a parallel universe, using a process devised by the aliens. But even free energy has a price. The transference process itself will eventually lead to the destruction of the Earth's Sun--and of Earth itself.Only a few know the terrifying truth--an outcast Earth scientist, a rebellious alien inhabitant of a dying planet, a lunar-born human intuitionist who senses the imminent annihilation of the Sun.\xa0\xa0They know the truth--but who will listen?\xa0\xa0They have foreseen the cost of abundant energy--but who will believe?\xa0\xa0These few beings, human and alien, hold the key to the Earth's survival."}

# Display Datasets

In [21]:
# Reviews returned by extract_data_json.
df1

Unnamed: 0,book_id,review_text,n_votes
0,18245960,This is a special book. It started slow for ab...,28
1,16981,Recommended by Don Katz. Avail for free in Dec...,1
2,28684704,"A fun, fast paced science fiction thriller. I ...",22
3,27161156,Recommended reading to understand what is goin...,5
4,25884323,"I really enjoyed this book, and there is a lot...",9
...,...,...,...
1378028,15745950,Can't wait for Travis' POV \n Travis Before Ab...,0
1378029,10861195,Had this on my to-read shelf forever. Will upd...,0
1378030,6131164,The last book left me wanting for more. I need...,0
1378031,10025305,Things are heating up in the second novel of I...,0


In [22]:
# Reviews returned by extract_data_csv (columns already renamed).
df2

Unnamed: 0,book_id,review_text,n_votes
0,14935,"Money. It's all about the money. I mean, why e...",0
1,14935,I love Jane Austen. I LOVE Jane Austen. I LOVE...,1800
2,14935,*life goals: to be an Eleanor*reality: being a...,535
3,14935,While I enjoyed the relationship between the s...,419
4,14935,I'm not a fan of Jane Austen. I've given her m...,390
...,...,...,...
445,16130398,"I wouldn't classify this as a romance, but got...",0
446,16130398,Fascinating historical fiction account of Poe ...,9
447,16130398,Mrs. Poe by Lynn Cullen is a 2013 Gallery Book...,0
448,16130398,Read This Review & More Like It At Ageless Pag...,0


In [23]:
# df1 and df2 concatenated with a fresh index.
df

Unnamed: 0,book_id,review_text,n_votes
0,18245960,This is a special book. It started slow for ab...,28
1,16981,Recommended by Don Katz. Avail for free in Dec...,1
2,28684704,"A fun, fast paced science fiction thriller. I ...",22
3,27161156,Recommended reading to understand what is goin...,5
4,25884323,"I really enjoyed this book, and there is a lot...",9
...,...,...,...
1378478,16130398,"I wouldn't classify this as a romance, but got...",0
1378479,16130398,Fascinating historical fiction account of Poe ...,9
1378480,16130398,Mrs. Poe by Lynn Cullen is a 2013 Gallery Book...,0
1378481,16130398,Read This Review & More Like It At Ageless Pag...,0
