In [63]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
import seaborn as sns
import os
#import time
import re
import string

from PIL import Image

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')




[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Parisa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/Parisa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/Parisa/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

We only need to load articles data in this part

In [64]:
#Loading Data
articles_df = pd.read_pickle('../data/processed/cleaned_articles.pkl')


## Preprocessing Data

For our Content Base Recommnedation system, we will not be using the majority of the columns in the articles_df. So, we can proceed by dropping the columns that are unnecessary for our recommendation systems and export the csv for later use.



In [65]:
articles_subset_df = articles_df[[
    'product_type_name',
    'index_group_name',
    'section_name',
    'product_group_name',
    'garment_group_name',
    'colour_group_name',
    'preprocessed_desc'
    ]]


In [66]:
articles_subset_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 105107 entries, 0 to 105106
Data columns (total 7 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   product_type_name   105107 non-null  object
 1   index_group_name    105107 non-null  object
 2   section_name        105107 non-null  object
 3   product_group_name  105107 non-null  object
 4   garment_group_name  105107 non-null  object
 5   colour_group_name   105107 non-null  object
 6   preprocessed_desc   105107 non-null  object
dtypes: object(7)
memory usage: 6.4+ MB


When suggesting similar products, Color is an important factor. To improve the quality of our text features, we’re adding the colour_group_name to the detail_desc column. This helps capture more complete information, as the original descriptions don’t include color.

In [67]:
# Add colour_group_name to detail_desc
articles_subset_df.loc[:, 'preprocessed_desc'] = articles_subset_df.apply(
    lambda row: f"{row['preprocessed_desc']} Color: {row['colour_group_name']}", axis=1
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  articles_subset_df.loc[:, 'preprocessed_desc'] = articles_subset_df.apply(


In [68]:
articles_subset_df['preprocessed_desc'].head(2)

0    jersey top narrow shoulder strap Color: Black
1    jersey top narrow shoulder strap Color: White
Name: preprocessed_desc, dtype: object

## Modelling

### Encode Categorical Features¶

Well here we will encode usefull categorical columns using One_Hot Encoding

In [69]:
# categorical_cols = [
#     'product_type_name',
#     'index_group_name',
#     'section_name',
#     'product_group_name',
#     'garment_group_name',
#     'colour_group_name',
#     'perceived_colour_value_name'
# ]

# articles_encoded = pd.get_dummies(articles_df[categorical_cols], prefix=categorical_cols)
# articles_df = pd.concat([articles_df, articles_encoded], axis=1)

    

In [70]:
# articles_df.shape

### Vectorize preprocessed_desc

#### TF-IDF:

Here I turn the cleaned description column into numerical vectors, using ***TF-IDF***

In [71]:


from sklearn.feature_extraction.text import TfidfVectorizer

#Define a Tfid Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1,2))

#Fit and Transform our description with Tfid Vectorizer
desc_tfidf = tfidf_vectorizer.fit_transform(articles_df['preprocessed_desc'])



### KNN to Find Similar Articles

In [None]:
from sklearn.neighbors import NearestNeighbors

# Initialize model — use cosine distance
knn = NearestNeighbors(n_neighbors=6, metric='cosine', algorithm='brute')
knn.fit(desc_tfidf)

# Get top-5 similar items for each article
distances, indices = knn.kneighbors(desc_tfidf)

In [None]:
article_ids = articles_df['article_id'].values

# Map indices to article IDs
similar_articles = {
    article_ids[i]: article_ids[indices[i][1:]]  
    for i in range(len(article_ids))
}


In [None]:
# article_id = 108775015
# similar_ids = similar_articles[article_id]
# articles_df[articles_df['article_id'].isin(similar_ids)][['article_id', 'product_type_name', 'preprocessed_desc']]


In [None]:
import os
from PIL import Image
import matplotlib.pyplot as plt

def get_image_path(article_id, image_base_path):
    article_id_str = str(article_id).zfill(10)
    folder = article_id_str[:3]
    filename = f"{article_id_str}.jpg"
    return os.path.join(image_base_path, folder, filename)


In [None]:
def show_similar_articles(selected_article_id, similar_articles, image_base_path):
    similar_ids = similar_articles[selected_article_id]
    all_ids = [selected_article_id] + list(similar_ids)
    
    plt.figure(figsize=(15, 4))
    
    for i, article_id in enumerate(all_ids):
        img_path = get_image_path(article_id, image_base_path)
        try:
            img = Image.open(img_path)
            plt.subplot(1, len(all_ids), i + 1)
            plt.imshow(img)
            plt.axis('off')
            title = "Query" if i == 0 else f"Similar {i}"
            plt.title(title, fontsize=10)
        except FileNotFoundError:
            print(f"Image not found for article {article_id}")
    
    plt.tight_layout()
    plt.show()


In [None]:
# Choose a sample article ID from your dataset
sample_article_id = 108775015

# Show similar articles based on your earlier similarity dict
show_similar_articles(sample_article_id, similar_articles, image_base_path='../data/images')
