In [2]:
import pandas as pd
import sqlite3
# Open the project database; the connection is reused by the cells below.
conn = sqlite3.connect('intex2.db')
cursor = conn.cursor()

# One row per rated cart line: the product, its descriptive text, the rating,
# and the customer who placed the order.
read_sql = (
    "SELECT P.ProductId, P.Name, P.Description, CL.Rating, O.CustomerId FROM Products P"
    "    join CartLine CL on CL.ProductId = P.ProductId"
    " join Orders O on O.OrderId = CL.OrderId"
)
df = pd.read_sql_query(read_sql, conn)

In [2]:
def min_ratings(df, count_column, min=2, messages=True):
  """Keep only the rows whose `count_column` value occurs at least `min` times.

  Parameters
    df: DataFrame to filter (not modified).
    count_column: name of the column whose value frequencies are counted.
    min: minimum number of occurrences a value needs for its rows to be kept.
    messages: when True, print the value counts of the filtered frame.

  Returns
    A filtered DataFrame holding the surviving rows with their original index.
  """
  frequencies = df[count_column].value_counts()
  frequent_values = frequencies.loc[frequencies >= min].index
  survivors = df[count_column].isin(frequent_values)
  df = df.loc[survivors]

  if messages:
    print(df[count_column].value_counts())

  return df
# Keep only each customer's first rating of a product, because ratings should be
# based on the initial difficulty or enjoyment.
df = df.drop_duplicates(subset=['CustomerId', 'ProductId'], keep='first')

# Products with at least 100 ratings feed the collaborative filter. The cutoff is
# kept low so that only products with few ratings (new ones) fall back to
# content-based filtering, which isn't very accurate.
df_collab = min_ratings(df, 'ProductId', min=100, messages=False)

In [3]:
#this does the collaborative filtering
# One row per product: its name and description, plus how many (non-null)
# ratings it has received.
df_products = df.groupby('ProductId').agg(
    Name=('Name', 'max'),
    Description=('Description', 'max'),
    Rating=('Rating', 'count'),
)
def create_matrix(df, user, item, rating):
  """Build a sparse item-by-user rating matrix plus ID <-> index lookup tables.

  Parameters
    df: DataFrame with one row per (user, item) rating.
    user: name of the column holding user IDs.
    item: name of the column holding item IDs.
    rating: name of the column holding the rating values.

  Returns
    X: scipy.sparse.csr_matrix of shape (n_items, n_users); X[i, u] is the
       rating of item index i by user index u (duplicate pairs are summed).
    user_mapper: dict, user ID -> column index of X.
    item_mapper: dict, item ID -> row index of X.
    user_inv_mapper: dict, column index of X -> user ID.
    item_inv_mapper: dict, row index of X -> item ID.
  """
  import numpy as np
  from scipy.sparse import csr_matrix

  # A single np.unique call per axis yields both the sorted distinct IDs and,
  # via return_inverse, the matrix index of every row in df. This replaces four
  # separate np.unique calls and a Python-level dict lookup per row, and keeps
  # the matrix shape consistent with the mappers by construction.
  user_ids, user_index = np.unique(df[user].to_numpy(), return_inverse=True)
  item_ids, item_index = np.unique(df[item].to_numpy(), return_inverse=True)
  user_index = np.ravel(user_index)
  item_index = np.ravel(item_index)

  U = len(user_ids)  # Number of users for the matrix
  I = len(item_ids)  # Number of items for the matrix

  # Map user and item IDs to matrix indices
  user_mapper = dict(zip(user_ids, range(U)))
  item_mapper = dict(zip(item_ids, range(I)))

  # Map matrix indices back to IDs
  user_inv_mapper = dict(zip(range(U), user_ids))
  item_inv_mapper = dict(zip(range(I), item_ids))

  # Build the final matrix: rows are items, columns are users, values are ratings
  X = csr_matrix((df[rating].to_numpy(), (item_index, user_index)), shape=(I, U))

  return X, user_mapper, item_mapper, user_inv_mapper, item_inv_mapper

def collab_recommend(itemId, X, item_mapper, item_inv_mapper, k, metric='cosine', messages=True):
  """Recommend up to `k` items whose rating vectors are closest to that of `itemId`.

  Parameters
    itemId: original item ID to find neighbours for; must be a key of `item_mapper`.
    X: item-by-user rating matrix with one row per item.
    item_mapper: dict mapping original item ID -> row index of X.
    item_inv_mapper: dict mapping row index of X -> original item ID.
    k: number of recommendations wanted.
    metric: distance metric handed to NearestNeighbors.
    messages: when True, print the neighbour indices, IDs and distances.

  Returns
    (rec_ids, rec_distances): a list of original item IDs ordered nearest first,
    and a numpy array with the matching distances. Both hold fewer than `k`
    entries when X has fewer than k + 1 rows.

  Raises
    KeyError: if `itemId` is not a key of `item_mapper`.
  """
  from sklearn.neighbors import NearestNeighbors
  import numpy as np

  item = item_mapper[itemId]  # Row index of the item ID passed into the function
  item_vector = X[item]       # Vector of user ratings for that item

  # Ask for one extra neighbour because the item itself is normally returned too,
  # but never ask for more neighbours than there are rows (sklearn raises otherwise).
  n_neighbors = min(k + 1, X.shape[0])
  knn = NearestNeighbors(n_neighbors=n_neighbors, algorithm="brute", metric=metric).fit(X)

  # Query the fitted model for the nearest neighbours of the item vector
  distances, indices = knn.kneighbors(item_vector.reshape(1, -1), return_distance=True)
  rec_indeces = indices[0]    # Row indices of the neighbours, nearest first

  # Remove the query item by index rather than by position: when another item has
  # an identical rating vector (distance 0) the query item is not guaranteed to
  # be listed first, and dropping position 0 would then recommend the item to itself.
  not_self = rec_indeces != item
  neighbour_indeces = rec_indeces[not_self][:k]
  rec_distances = np.asarray(distances[0])[not_self][:k]

  # Replace the neighbour row indices with their original item IDs
  rec_ids = [item_inv_mapper[i] for i in neighbour_indeces]

  if messages:
    print(f'List of recommended item indeces:\n{rec_indeces}\n')
    print(f'List of recommended item IDs:\n{rec_ids}\n')
    print(f'List of recommended item similarity to selected item:\n{rec_distances}\n')

  # Return two lists: the original item IDs of the recommendations and their distances
  return rec_ids, rec_distances

# Build the item-by-user rating matrix and the ID <-> index lookup tables from
# the ratings kept in df_collab.
X, user_mapper, item_mapper, user_inv_mapper, item_inv_mapper = create_matrix(df_collab, 'CustomerId', 'ProductId', 'Rating')

In [4]:
#This starts Content filtering
def tfidf_matrix(df, similarity_col):
  """Return the pairwise text-similarity matrix of the rows of `df`.

  Parameters
    df: DataFrame with one row per item.
    similarity_col: name of the text column to compare items on.

  Returns
    A square array of shape (len(df), len(df)); entry [i, j] is the linear
    kernel (dot product) of the TF-IDF vectors of rows i and j. With
    TfidfVectorizer's default L2 normalisation this equals cosine similarity.
  """
  from sklearn.feature_extraction.text import TfidfVectorizer
  from sklearn.metrics.pairwise import linear_kernel

  # Create a TfidfVectorizer and remove English stopwords
  tfidf = TfidfVectorizer(stop_words='english')

  # Missing text would make fit_transform raise, so treat it as an empty document
  documents = df[similarity_col].fillna('')

  # Fit and transform the text into a (n_items, n_terms) TF-IDF matrix
  tfidf_vectors = tfidf.fit_transform(documents)

  # Pairwise item-to-item similarity, positionally aligned with the rows of df
  cosine_sim = linear_kernel(tfidf_vectors, tfidf_vectors)

  return cosine_sim
def content_recommend(item_id, sim_matrix, n=10, messages=True):
  """Return the `n` items most similar to `item_id` according to `sim_matrix`.

  Parameters
    item_id: row position of the item in `sim_matrix` (0 <= item_id < n_items).
    sim_matrix: square pairwise similarity matrix; row i holds the similarity
      of item i to every item.
    n: maximum number of recommendations to return.
    messages: when True, print the recommended positions and their scores.

  Returns
    A dict {row position: similarity score} ordered from most to least similar,
    never containing `item_id` itself. Returns None (after printing a message)
    when `item_id` is outside the matrix.
  """
  n_items = sim_matrix.shape[0]
  # Valid positions are 0 .. n_items - 1, so item_id == n_items is out of range too
  if not 0 <= item_id < n_items:
    print(f"Item {item_id} is not in the similarity matrix you provided with shape: {sim_matrix.shape}")
    return

  # Pair every other item's position with its similarity to item_id. The item
  # itself is excluded by position, not by assuming it sorts first: ties with an
  # identical item (or an all-zero row) can put a different item at the top.
  sim_scores = [(idx, score) for idx, score in enumerate(sim_matrix[item_id]) if idx != item_id]

  # Sort the items based on the similarity scores, most similar first
  sim_scores.sort(key=lambda pair: pair[1], reverse=True)

  # Keep the n most similar items
  top_similar = sim_scores[:n]

  # Put the recommended item positions and similarity scores together in a dictionary
  rec_dict = {idx: score for idx, score in top_similar}

  if messages:
    print(f"The top recommended item IDs are: {list(rec_dict.keys())}")
    print(f"Their similarity scores are:\t  {list(rec_dict.values())}")

  # Return the top n most similar items
  return rec_dict

# Make ProductId a regular column so row positions line up with the similarity
# matrix. The guard keeps the cell idempotent: resetting an already-reset index
# would add a stray 'index' column on re-run.
if 'ProductId' not in df_products.columns:
  df_products = df_products.reset_index()

# Text each product is compared on. A missing name or description is treated as
# empty text; otherwise the concatenation is NaN and the product loses all text.
similarity_text = df_products['Name'].fillna('') + " " + df_products['Description'].fillna('')

# The helper column exists only on a temporary copy, so df_products is unchanged.
sim_matrix = tfidf_matrix(df_products.assign(similarity=similarity_text), 'similarity')

In [5]:
#This brings them together
threshold = 100  # minimum number of ratings for a product to use collaborative filtering
n_recs = 10      # recommendations stored per product

rec_columns = ['RecId'] + [f'Rec{i}' for i in range(1, n_recs + 1)]

# content_recommend works with row positions of sim_matrix (which follow the row
# order of df_products), so keep a position -> ProductId lookup to translate them.
product_ids = df_products['ProductId'].tolist()

records = []
for position, row in enumerate(df_products.itertuples(index=False)):
  if row.Rating >= threshold:
    rec_ids, _ = collab_recommend(row.ProductId, X, item_mapper, item_inv_mapper, k=n_recs, messages=False)
  else:
    recommend_dict = content_recommend(position, sim_matrix, n=n_recs, messages=False)
    # Translate matrix positions back to ProductIds so both branches store the
    # same kind of identifier in the Rec columns.
    rec_ids = [product_ids[rec_position] for rec_position in recommend_dict]

  # Pad with None so products with fewer than n_recs neighbours still produce a full row
  rec_ids = list(rec_ids)[:n_recs]
  rec_ids += [None] * (n_recs - len(rec_ids))
  records.append([row.ProductId, *rec_ids])

# Build the frame once instead of growing it row by row
df_recommendations = pd.DataFrame(records, columns=rec_columns)

df_recommendations.to_sql(name="Recommendations", con=conn, if_exists='replace', index=False)

37

In [3]:
#This determines the cluster to aid recommendations for new users
# Load every customer, then discard the previously stored cluster assignments
# so they are not used as inputs below.
read_sql = "SELECT * from Customers"
df = pd.read_sql_query(read_sql, conn).drop(columns=['Cluster', 'ClusterRecId'])

In [16]:
import gower
from sklearn.cluster import AgglomerativeClustering
# TODO(review): placeholder value. The original cell left n_clusters blank (a
# syntax error); choose the real number from the silhouette analysis.
N_CLUSTERS = 5

# Pairwise Gower distances between customers
distance_matrix = gower.gower_matrix(df)

# distance_matrix already holds pairwise distances, so the estimator must be told
# the input is precomputed; otherwise each row is treated as a feature vector.
agg = AgglomerativeClustering(n_clusters=N_CLUSTERS, metric="precomputed", linkage="average").fit(distance_matrix)
df['Cluster'] = agg.labels_
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,29125,29126,29127,29128,29129,29130,29131,29132,29133,29134
0,0.0,0.906168,0.8913,0.766124,0.597147,0.7609,0.600161,0.596745,0.750653,0.804099,...,0.76733,0.805706,0.799679,0.781394,0.754672,0.915813,0.724332,0.812337,0.754069,0.714688
1,0.906168,0.0,0.729154,0.717099,0.88045,0.859554,0.877436,0.880852,0.726944,0.897931,...,0.718304,0.756681,0.750653,0.732369,0.865783,0.72393,0.753265,0.763311,0.866385,0.763713
2,0.8913,0.729154,0.0,0.731967,0.865582,0.583886,0.862568,0.865984,0.716496,0.769942,...,0.87603,0.771549,0.765521,0.890094,0.863371,0.595941,0.738397,0.77818,0.862769,0.891702
3,0.766124,0.717099,0.731967,0.0,0.740406,0.862367,0.737392,0.740808,0.5869,0.895118,...,0.858348,0.611011,0.604983,0.872413,0.868596,0.721117,0.613221,0.617641,0.869198,0.909383
4,0.597147,0.88045,0.865582,0.740406,0.0,0.735182,0.574442,0.57183,0.724935,0.778381,...,0.741611,0.779988,0.77396,0.755676,0.728953,0.890094,0.729958,0.786618,0.72835,0.740406


In [4]:
import gower
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage

# Calculate Gower distance matrix on the customer attributes only: a 'Cluster'
# label column left over from an earlier run must not influence the distances.
customer_features = df.drop(columns=['Cluster'], errors='ignore')
distance_matrix = gower.gower_matrix(customer_features)

# Silhouette Score
cluster_range = range(2, 10)  # Try different numbers of clusters
silhouette_scores = []
for n_clusters in cluster_range:
    # The input is a pairwise distance matrix, so both the clustering and the
    # score are told it is precomputed instead of re-deriving distances from it.
    agg = AgglomerativeClustering(n_clusters=n_clusters, metric="precomputed", linkage="average").fit(distance_matrix)
    silhouette_scores.append(silhouette_score(distance_matrix, agg.labels_, metric="precomputed"))

# Plot Silhouette Score
plt.plot(cluster_range, silhouette_scores, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score vs. Number of Clusters')
plt.show()

MemoryError: Unable to allocate 6.32 GiB for an array with shape (29135, 29135) and data type float64

In [None]:
# Load the Products CSV and overwrite the Products table with it.
# The path previously had literal double quotes inside the string, so the file
# could never be found. NOTE(review): this is still a machine-specific absolute path.
df_products = pd.read_csv(r'C:\Users\gooch\Downloads\INTEX W24 Dataset.xlsx - Products.csv')
df_products.to_sql(name="Products", con=conn, if_exists='replace', index=False)

In [5]:
# Release the database handles: close the cursor first, then the connection.
cursor.close()
conn.close()