## 1. Import Libraries and NLTK Setup
- Import pandas and numpy for data manipulation.
- Import scikit-learn libraries for text processing and similarity computation.
- Import nltk for natural language processing.
- Download required NLTK datasets for stopwords, tokenization, and lemmatization.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string
import os
import pickle
import gc

# Download required NLTK datasets.
# Recent NLTK releases (3.9+) load the tokenizer models from 'punkt_tab'
# instead of the pickled 'punkt' package, so both are fetched to keep
# nltk.word_tokenize working across NLTK versions.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)


## 2. Define BookRecommender Class
- The main class for building the book recommendation system.
- Initializes text preprocessing tools, TF-IDF vectorizer, and placeholders for the dataset and similarity matrix.

In [None]:
class BookRecommender:
    """Content-based book recommender built on TF-IDF text features."""

    def __init__(self):
        """Create the text-processing tools and the empty model state."""
        # Model state; stays None until a dataset has been fitted or loaded
        self.df = None
        self.tfidf_matrix = None

        # Helpers used when cleaning text
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

        # Vectorizer whose vocabulary is capped at 5000 features
        self.vectorizer = TfidfVectorizer(max_features=5000)

## 3. Create Text Cleaning Method
- `_clean_text` processes the text data by:
   1. Converting text to lowercase.
   2. Removing parenthesized text, numbers, punctuation, stop words, and tokens of one or two characters.
   3. Lemmatizing the remaining words for standardization.

In [None]:
    def _clean_text(self, text):
        """Normalize raw text into a space-separated string of lemmatized tokens.

        The input is converted with ``str()`` and lowercased; parenthesized
        spans, digit runs and ASCII punctuation are deleted; the remainder is
        tokenized, and stop words and tokens of two characters or fewer are
        dropped before lemmatization.
        """
        lowered = str(text).lower()

        # Strip "(...)" spans and any run of digits
        stripped = re.sub(r'\([^)]*\)|\d+', '', lowered)

        # Delete every character listed in string.punctuation
        punctuation_table = str.maketrans('', '', string.punctuation)
        stripped = stripped.translate(punctuation_table)

        kept = []
        for word in nltk.word_tokenize(stripped):
            # Skip stop words and very short tokens
            if word in self.stop_words or len(word) <= 2:
                continue
            kept.append(self.lemmatizer.lemmatize(word))

        return ' '.join(kept)


## 4. Fit the Recommender Model
- The `fit` method:
   1. Prepares the dataset by handling missing values.
   2. Cleans and combines text columns (Title, Description, Category).
   3. Creates a TF-IDF matrix for similarity computations.

In [None]:
    def fit(self, df):
        """Build the TF-IDF model from a book dataset.

        Args:
            df: DataFrame with 'Title', 'Description' and 'Category' columns.
                The frame is copied; the caller's object is not modified.

        Raises:
            KeyError: If one of the required columns is missing.
        """
        print("Processing dataset...")
        # Work on a copy with a fresh 0..n-1 index so that index labels
        # coincide with row positions in the TF-IDF matrix, even when the
        # caller passes a filtered or re-indexed frame.
        self.df = df.copy().reset_index(drop=True)

        # Fill missing values in important columns
        self.df['Description'] = self.df['Description'].fillna('')
        self.df['Category'] = self.df['Category'].fillna('')

        # Combine and clean text data. astype(str) guards the string
        # concatenation against non-string cells (e.g. purely numeric titles).
        print("Cleaning text...")
        combined = (
            self.df['Title'].fillna('').astype(str) + ' ' +
            self.df['Description'].astype(str) + ' ' +
            self.df['Category'].astype(str)
        )
        self.df['processed_content'] = combined.apply(self._clean_text)

        # Create the TF-IDF matrix (one row per book, in self.df row order)
        print("Creating TF-IDF matrix...")
        self.tfidf_matrix = self.vectorizer.fit_transform(self.df['processed_content'])

        # Clean up unused memory
        gc.collect()
        print("Model ready!")


## 5. Generate Book Recommendations
- The `get_recommendations` method:
   1. Finds the first book whose title contains the given text (case-insensitive).
   2. Computes similarity scores with other books using cosine similarity.
   3. Returns the top N similar books with their details.

In [None]:
    def get_recommendations(self, book_title, n=5):
        # Find the index of the book based on title
        idx = self.df[self.df['Title'].str.contains(book_title, case=False, na=False)].index
        if len(idx) == 0:
            return f"Book '{book_title}' not found."
        
        idx = idx[0]
        
        # Compute cosine similarity scores
        sim_scores = cosine_similarity(
            self.tfidf_matrix[idx:idx+1], 
            self.tfidf_matrix
        ).flatten()
        
        # Exclude the input book and sort scores
        sim_scores[idx] = -1
        top_indices = sim_scores.argsort()[::-1][:n]
        
        # Retrieve details of the top recommendations
        recommendations = []
        for i in top_indices:
            recommendations.append({
                'Title': self.df.iloc[i]['Title'],
                'Author': self.df.iloc[i]['Authors'],
                'Category': self.df.iloc[i]['Category'],
                'Similarity Score': round(float(sim_scores[i]), 3)
            })
        return recommendations


## 6. Save and Load the Model
- `save_model`: Saves the trained model components to a file.
- `load_model`: Loads a previously saved model.

In [None]:
    def save_model(self, filepath):
        # Ensure the directory exists
        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        # Save the model components
        with open(filepath, 'wb') as f:
            pickle.dump({
                'vectorizer': self.vectorizer,
                'df': self.df,
                'tfidf_matrix': self.tfidf_matrix
            }, f)
        print(f"Model saved to {filepath}")

    @staticmethod
    def load_model(filepath):
        # Load model components from a file
        with open(filepath, 'rb') as f:
            data = pickle.load(f)
        
        recommender = BookRecommender()
        recommender.vectorizer = data['vectorizer']
        recommender.df = data['df']
        recommender.tfidf_matrix = data['tfidf_matrix']
        
        return recommender


## 7. Main Execution for Testing
- This section demonstrates:
   1. Loading the dataset and training the recommender system.
   2. Saving the trained model for future use.
   3. Testing recommendations for sample books.

In [None]:
# Read the book catalogue from disk
print("Loading dataset...")
df = pd.read_csv('./dataset/book.csv')

# Build the recommender and fit it on the catalogue
print("Creating recommender system...")
recommender = BookRecommender()
recommender.fit(df)

# Persist the fitted model for later use
os.makedirs('./ml_model', exist_ok=True)
recommender.save_model('./ml_model/book_recommender.pkl')

# Sample titles used to exercise the recommender
test_books = ['The Hobbit', 'Harry Potter and the Sorcerer\'s Stone', '1984']

print("\nTesting recommendations:")
for book in test_books:
    print(f"\nRecommendations for '{book}':")
    recommendations = recommender.get_recommendations(book)

    # A string result is the "not found" message rather than a result list
    if isinstance(recommendations, str):
        print(recommendations)
        continue

    for i, rec in enumerate(recommendations, 1):
        print(f"{i}. {rec['Title']} by {rec['Author']}")
        print(f"   Category: {rec['Category']}")
        print(f"   Similarity Score: {rec['Similarity Score']}")
