In [1]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the books CSV; lines the parser cannot handle are skipped instead of
# aborting the whole read.
raw_books_path = '../data/books.csv'
df = pd.read_csv(raw_books_path, on_bad_lines='skip')

In [3]:
# Give the column at positional index 7 the label 'num_pages'.
# NOTE(review): presumably the raw header differs from 'num_pages' (e.g. stray
# whitespace) — confirm against the CSV, since a positional rename silently
# targets the wrong column if the file layout ever changes.
eighth_column = df.columns[7]
df = df.rename(columns={eighth_column: 'num_pages'})

In [11]:
# Keep only the rows whose language_code contains the substring 'eng'
# (rows with a missing language_code are excluded via na=False), and trim
# surrounding whitespace from the titles of the resulting copy.
# NOTE(review): codes such as 'en-US' do not contain 'eng' and would be
# excluded by this match — confirm that is intended.
is_english = df['language_code'].str.contains('eng', na=False)
english_books = df.loc[is_english].assign(
    title=lambda books: books['title'].str.strip()
)

In [5]:
# Combine title and author into a single text field for the TF-IDF step.
# Missing values are replaced with '' first: concatenating a string with NaN
# yields NaN, and TfidfVectorizer raises a ValueError on NaN documents.
title_text = english_books['title'].fillna('')
author_text = english_books['authors'].fillna('')
english_books['title_author'] = title_text + ' ' + author_text

In [6]:
# Learn a TF-IDF vocabulary (English stop words removed) from the combined
# title/author text and encode every book with it in one pass.
corpus = english_books['title_author']
tfidf = TfidfVectorizer(stop_words='english')
text_matrix = tfidf.fit_transform(corpus)

In [7]:
# Standardise the numeric columns with a StandardScaler fitted on this data;
# the fitted scaler is kept so the same transform can be reapplied later.
numeric_columns = ['average_rating', 'num_pages', 'ratings_count', 'text_reviews_count']
num_features = english_books[numeric_columns]
scaler = StandardScaler()
num_matrix = scaler.fit_transform(num_features)

In [8]:
# Densify the TF-IDF matrix and append the scaled numeric columns to its
# right, producing one combined feature row per book.
# NOTE(review): .toarray() materialises the full rows x vocabulary array in
# memory — confirm this stays affordable for the dataset size.
dense_text = text_matrix.toarray()
combined_matrix = np.concatenate([dense_text, num_matrix], axis=1)

In [9]:
# Save artifacts
# Create the output directory up front so a checkout that does not yet have
# ../backend/models does not fail with FileNotFoundError at the first write.
Path('../backend/models').mkdir(parents=True, exist_ok=True)

# The CSV is written without its index, so the only link between a feature
# row and a book is the row position; both must have the same number of rows.
assert combined_matrix.shape[0] == len(english_books), \
    'combined_matrix and english_books have different row counts'

# Fitted transformers.
joblib.dump(tfidf, '../backend/models/tfidf_vectorizer_eng.pkl')
joblib.dump(scaler, '../backend/models/scaler.pkl')

# Feature matrix and the book rows it was built from.
np.save('../backend/models/combined_features.npy', combined_matrix)
english_books.to_csv('../backend/models/books_data.csv', index=False)

In [10]:
# Display the column labels of english_books as a quick schema check.
english_books.columns

Index(['bookID', 'title', 'authors', 'average_rating', 'isbn', 'isbn13',
       'language_code', 'num_pages', 'ratings_count', 'text_reviews_count',
       'publication_date', 'publisher', 'title_author'],
      dtype='object')