In [2]:
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Fetch the NLTK stop-word corpus that preprocess_text reads via stopwords.words('english').
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\giris\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Raw string literal: backslashes are taken literally, so every path separator
# is written exactly once (doubling them inside r'...' yields "\\" in the path).
# NOTE(review): machine-specific absolute path — consider a configurable data directory.
processed_path = r'C:\Users\giris\OneDrive\Documents\GitHub\Recommander-system\Data\Processed\cleaned_amazon.csv'
df = pd.read_csv(processed_path)

In [6]:
# Lazily-built NLTK resources shared by every preprocess_text call.
_TEXT_TOOLS = {}


def _get_text_tools():
    """Return ``(stop_words, stemmer)``, constructing both on first use only."""
    if not _TEXT_TOOLS:
        _TEXT_TOOLS['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_TOOLS['stemmer'] = PorterStemmer()
    return _TEXT_TOOLS['stop_words'], _TEXT_TOOLS['stemmer']


def preprocess_text(text):
    """Normalise free text for TF-IDF: lowercase, drop stop words, stem.

    Parameters
    ----------
    text : str or scalar
        Raw text. Missing markers (``None``, NaN, ``pd.NA``) produce an empty
        string; any other non-string value is converted with ``str()`` first.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces.
    """
    if not isinstance(text, str):
        # Missing values carry no text; the is_scalar guard keeps pd.isna from
        # returning an array for list-like input.
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ""
        text = str(text)

    # Built once and reused: reloading the stop-word list and re-creating the
    # stemmer for every row is loop-invariant work when used with .apply().
    stop_words, stemmer = _get_text_tools()

    kept_words = (word for word in text.lower().split() if word not in stop_words)
    return " ".join(stemmer.stem(word) for word in kept_words)

In [7]:
# One text field per product: name followed by description (missing parts become '').
df['combined_text'] = df['product_name'].fillna('') + ' ' + df['about_product'].fillna('')

df['preprocessed_text'] = df['combined_text'].apply(preprocess_text)

# TF-IDF over the cleaned text, with the vocabulary capped at 5000 terms.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['preprocessed_text'])

product_ids = pd.Index(df['product_id'], name='product_id')

tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
category_df = pd.get_dummies(df['category'], prefix='category').reset_index(drop=True)

# Both frames have exactly one row per row of df, in the same order, so they are
# glued together by position. Joining on the product_id index instead is a
# many-to-many merge: any product_id that appears k times would yield k*k rows.
product_features = pd.concat([tfidf_df, category_df], axis=1)
product_features.index = product_ids
category_df.index = product_ids

assert len(product_features) == len(df), "feature rows must stay 1:1 with df rows"



In [8]:
# Pairwise cosine similarity between all rows of product_features:
# entry [i, j] compares feature row i with feature row j.
cosine_sim_matrix = cosine_similarity(product_features)

In [9]:
# Directory that receives the derived artifacts (rebinds `processed_path`).
processed_path = r'C:\Users\giris\OneDrive\Documents\GitHub\Recommander-system\Data\Processed'

features_csv = f'{processed_path}/product_features_v2.csv'
similarity_npy = f'{processed_path}/cosine_sim_matrix_v2.npy'

# NOTE(review): index=False leaves the product_id index out of the CSV —
# confirm downstream loaders identify rows by position.
product_features.to_csv(features_csv, index=False)
np.save(similarity_npy, cosine_sim_matrix)

print("Updated product features and similarity matrix saved.")

Updated product features and similarity matrix saved.
