# Downloading data and initial exlporation

## Importing libraries and datasets

In [1]:
# Libraries
! pip install openai
import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from scipy import sparse
import matplotlib.pyplot as plt
import seaborn as sns
import openai
#import tiktoken
import time
from tqdm import tqdm



In [2]:
# Load the datasets
interactions = pd.read_csv('https://raw.githubusercontent.com/olivialaven/MGT502_project/refs/heads/main/interactions_train.csv')
items = pd.read_csv("https://raw.githubusercontent.com/olivialaven/MGT502_project/refs/heads/main/items.csv")

# Display the first rows of each dataset
display(interactions.head())
display(items.head())

Unnamed: 0,u,i,t
0,4456,8581,1687541000.0
1,142,1964,1679585000.0
2,362,3705,1706872000.0
3,1809,11317,1673533000.0
4,4384,1323,1681402000.0


Unnamed: 0,Title,Author,ISBN Valid,Publisher,Subjects,i
0,Classification décimale universelle : édition ...,,9782871303336; 2871303339,Ed du CEFAL,Classification décimale universelle; Indexatio...,0
1,Les interactions dans l'enseignement des langu...,"Cicurel, Francine, 1947-",9782278058327; 2278058320,Didier,didactique--langue étrangère - enseignement; d...,1
2,Histoire de vie et recherche biographique : pe...,,2343190194; 9782343190198,L'Harmattan,Histoires de vie en sociologie; Sciences socia...,2
3,Ce livre devrait me permettre de résoudre le c...,"Mazas, Sylvain, 1980-",9782365350020; 236535002X; 9782365350488; 2365...,Vraoum!,Moyen-Orient; Bandes dessinées autobiographiqu...,3
4,Les années glorieuses : roman /,"Lemaitre, Pierre, 1951-",9782702180815; 2702180817; 9782702183618; 2702...,Calmann-Lévy,France--1945-1975; Roman historique; Roman fra...,4


In [3]:
items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15291 entries, 0 to 15290
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Title       15291 non-null  object
 1   Author      12638 non-null  object
 2   ISBN Valid  14568 non-null  object
 3   Publisher   15266 non-null  object
 4   Subjects    13068 non-null  object
 5   i           15291 non-null  int64 
dtypes: int64(1), object(5)
memory usage: 716.9+ KB


In [4]:
interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87047 entries, 0 to 87046
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   u       87047 non-null  int64  
 1   i       87047 non-null  int64  
 2   t       87047 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 2.0 MB


In [5]:
n_users = interactions.u.nunique()
n_items = items.i.nunique() # CHECK
print(f'Number of users = {n_users}, \nNumber of books (in total, incl. unread books) = {n_items} \nNumber of interactions = {len(interactions)}')


Number of users = 7838, 
Number of books (in total, incl. unread books) = 15291 
Number of interactions = 87047


## Checking for duplicates

In [6]:
# Checking duplicates in the interactions and items datasets
interaction_duplicates = interactions.duplicated().sum()
items_duplicates = items.duplicated().sum()
print(f'Interaction data duplicates = {interaction_duplicates}, Item data duplicates = {items_duplicates}')

Interaction data duplicates = 2, Item data duplicates = 0


In [7]:
# Showing the duplicated rows
duplicate_rows = interactions[interactions.duplicated(keep=False)]  # keep=False to show all duplicates
print(f"Duplicate rows in interaction data:\n{duplicate_rows}")

Duplicate rows in interaction data:
          u      i             t
4156   7210  12290  1.718023e+09
6103   7210  12290  1.718023e+09
34656  1323  10037  1.700055e+09
44025  1323  10037  1.700055e+09


In [8]:
# Drop the duplicate
interactions.drop_duplicates(keep='first', inplace=True)

In [9]:
# Sanity check, making sure duplicates are dropped
interaction_duplicates = interactions.duplicated().sum()
items_duplicates = items.duplicated().sum()
print(f'Interaction data duplicates = {interaction_duplicates}, Item data duplicates = {items_duplicates}')

Interaction data duplicates = 0, Item data duplicates = 0


## Exploring unique and missing values

In [10]:
# Getting the unique values for each column
# -----------------------------------------

# For interactions DataFrame
unique_counts_interactions = {column: interactions[column].nunique() for column in interactions.columns}
unique_counts_interactions_df = pd.DataFrame(unique_counts_interactions, index=['Unique Values']).T
print("Unique Value Counts for interactions DataFrame:")
display(unique_counts_interactions_df)

# For items DataFrame
unique_counts_items = {column: items[column].nunique() for column in items.columns}
unique_counts_items_df = pd.DataFrame(unique_counts_items, index=['Unique Values']).T
print("\nUnique Value Counts for items DataFrame:")
display(unique_counts_items_df)

Unique Value Counts for interactions DataFrame:


Unnamed: 0,Unique Values
u,7838
i,15109
t,86768



Unique Value Counts for items DataFrame:


Unnamed: 0,Unique Values
Title,14576
Author,9357
ISBN Valid,14490
Publisher,4337
Subjects,11521
i,15291


In [11]:
# Getting the missing values for each column
# -----------------------------------------

# For interactions DataFrame
missing_interactions = interactions.isnull().sum()
missing_interactions_df = pd.DataFrame(missing_interactions, columns=['Missing Values']).T

# For items DataFrame
missing_items = items.isnull().sum()
missing_items_df = pd.DataFrame(missing_items, columns=['Missing Values']).T

# Display the DataFrames
print("Missing Values in interactions DataFrame:")
display(missing_interactions_df)

print("\nMissing Values in items DataFrame:")
display(missing_items_df)

Missing Values in interactions DataFrame:


Unnamed: 0,u,i,t
Missing Values,0,0,0



Missing Values in items DataFrame:


Unnamed: 0,Title,Author,ISBN Valid,Publisher,Subjects,i
Missing Values,0,2653,723,25,2223,0


### Investigating entries where all columns (except Title) is NaN

In [12]:
# Getting the rows with all NaN
items[items['Author'].isnull() & items['Subjects'].isnull() & items['ISBN Valid'].isnull() & items['Publisher'].isnull()]

Unnamed: 0,Title,Author,ISBN Valid,Publisher,Subjects,i
1345,YYYY Prêt interbibliothèques BPUL,,,,,1345
9460,"4bis-113: Périodique (prêt 28 jours), numéro n...",,,,,9460
11744,Lectures Courantes Extraites des Écrivains Fra...,,,,,11744
14378,Clé USB 8Go,,,,,14378


In [13]:
# Create a list of item IDs to check (from results above)
item_ids_to_check = [1345, 9460, 11744, 14378]

# Filter the interactions DataFrame to get rows with the specified item IDs
filtered_interactions = interactions[interactions['i'].isin(item_ids_to_check)]

# Display the filtered DataFrame
display(filtered_interactions)

Unnamed: 0,u,i,t
775,1993,11744,1706205000.0
8523,1187,9460,1707217000.0
19214,76,1345,1685960000.0
20149,1187,9460,1707217000.0
23493,1623,1345,1693214000.0
24474,1565,1345,1693581000.0
25329,1187,9460,1688753000.0
29207,1187,9460,1688753000.0
29641,1993,11744,1706205000.0
29670,1993,11744,1694622000.0


In [14]:
filtered_interactions.shape

(32, 3)

This means that out of the 87047 interactions, there are 32 instances where a user has interacted with an item we have no data on. This is something we keep in mind going forward.

# OpenAI Embeddings

## Preparing the text for the embeddings

In [2]:
import pandas as pd

In [5]:
merged_items = pd.read_csv('https://raw.githubusercontent.com/olivialaven/MGT502_project/refs/heads/main/data/merged_items.csv')

In [66]:
# Creating a column for the combined data we want to use for the embeddings
merged_items["combined_text_openai"] = merged_items.drop(columns=['i','image_original','image','ISBN Valid','title_long','isbn13','isbn','isbn10','msrp','dewey_decimal','dimensions_structured','publisher']).fillna('').astype(str).apply(' '.join, axis=1)

In [67]:
# Creating a new DataFrame with the book IDs and the text for the embeddings
embeddings_text = merged_items[['i','combined_text_openai']]

In [75]:
#openai.api_key = "sk-

In [77]:
texts = embeddings_text["combined_text_openai"].fillna("").tolist()

In [78]:
book_ids = embeddings_text["i"].tolist()

## Getting the embeddings using OpenAI

In [79]:
def get_openai_embeddings(texts, model="text-embedding-3-small", batch_size=100, sleep=1.0):
    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i+batch_size]
        try:
            response = openai.embeddings.create(input=batch, model=model)
            batch_embeddings = [r.embedding for r in response.data]
            embeddings.extend(batch_embeddings)
        except Exception as e:
            print(f"❌ Error at batch {i}: {e}")
            embeddings.extend([[0]*1536 for _ in batch])  # fallback
        time.sleep(sleep)
    return np.array(embeddings)

In [80]:
book_embeddings_openai = get_openai_embeddings(texts)

100%|██████████| 153/153 [04:58<00:00,  1.95s/it]


In [81]:
np.save("book_embeddings_openai.npy", book_embeddings_openai)

In [95]:
# Adding the embeddings to the embeddings_text DataFrame to see it
embeddings_text["embedding"] = list(book_embeddings_openai)
embeddings_text

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  embeddings_text["embedding"] = list(book_embeddings_openai)


Unnamed: 0,i,combined_text_openai,embedding
0,0,Classification décimale universelle : édition ...,"[0.004905377048999071, 0.021680716425180435, 0..."
1,1,Les interactions dans l'enseignement des langu...,"[-0.015924476087093353, 0.019420675933361053, ..."
2,2,Histoire de vie et recherche biographique : pe...,"[0.017170127481222153, 0.04921164736151695, 0...."
3,3,Ce livre devrait me permettre de résoudre le c...,"[0.014691997319459915, 0.05934261158108711, 0...."
4,4,"Les années glorieuses : roman / Lemaitre, Pier...","[-0.0011622064048424363, 0.031185323372483253,..."
...,...,...,...
15286,15286,"Le vagabond de Tokyo / Fukutani, Takashi, 1952...","[0.013858351856470108, 0.022979987785220146, -..."
15287,15287,God of high school : le match contre les dieux...,"[0.00035318153095431626, 0.01434679701924324, ..."
15288,15288,"Blue Lock / Kaneshiro, Muneyuki Pika Compétiti...","[0.02148553542792797, 0.056205108761787415, -0..."
15289,15289,Red eyes sword : akame ga kill ! Zero / Takahi...,"[0.03709163889288902, 0.08017517626285553, 0.0..."
