**Walmart Recommendation System**

---
Scope of the project:

> Provide an effective product recommendation system for end users.


*   Data: Kaggle
*   Techniques: Content-Based, Collaborative Filtering and Hybrid recommendations
*   Final data: exported to .csv and .json formats
*   Dump: results of the techniques are dumped to .pickle files

# Packages and Libraries

In [None]:
import pandas as pd
import spacy
from surprise import Dataset, Reader, SVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from spacy.lang.en.stop_words import STOP_WORDS
#pip install scikit-surprise

# Data Loading and Preprocessing

In [None]:
# Mount Google Drive at /content/drive so the dataset path used below resolves.
# google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read the tab-separated Walmart product/review export from Drive into `data`
# and preview its first two rows.
data = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/Ml-Projects/recommendation-system/walmart_com_product_review_data.tsv",
    sep='\t',
)
data.head(2)

In [None]:
# Column labels of the raw dataset.
data.columns

In [None]:
# Per-column dtype and non-null count summary of the raw dataset.
data.info()

In [None]:
# Descriptive statistics of the raw dataset (pandas defaults).
data.describe()

In [None]:
# (rows, columns) of the raw dataset.
data.shape

In [None]:
# Keep only the columns the recommender uses.
# .copy() makes walmart_df an independent frame: the cells below add and
# overwrite columns on it, and doing that on a bare subset of `data` is a
# chained assignment that pandas reports with SettingWithCopyWarning.
walmart_df = data[[
    'Uniq Id',
    'Product Id',
    'Product Rating',
    'Product Reviews Count',
    'Product Category',
    'Product Brand',
    'Product Name',
    'Product Image Url',
    'Product Description',
    'Product Price',
    'Product Tags',
]].copy()
walmart_df.head(2)

## Data Cleaning and Tag Creation

In [None]:
# (rows, columns) of the working frame after column selection.
walmart_df.shape

In [None]:
# Missing-value count per column, before any imputation.
walmart_df.isnull().sum()

In [None]:
# Summary statistics of 'Product Price'; its mean is used below to fill gaps.
walmart_df['Product Price'].describe()

In [None]:
# Preview of the fill value: mean price rounded to 2 decimals, then cast to int.
walmart_df['Product Price'].mean().round(2).astype(int)

In [None]:
# Fill missing values in a single pass and rebind the result instead of
# mutating in place: walmart_df is derived from `data`, and in-place edits on
# a derived frame are what pandas flags as chained assignment.
#   - rating / review count -> 0
#   - category / brand / description -> '' (so tag extraction gets strings)
#   - price -> mean price, rounded to 2 decimals and then cast to int
# NOTE(review): a missing rating becomes 0, which is outside the 1-5 scale
# given to the Surprise Reader in the collaborative-filtering section.
productPrice = walmart_df['Product Price'].mean().round(2).astype(int)
walmart_df = walmart_df.fillna({
    'Product Rating': 0,
    'Product Reviews Count': 0,
    'Product Category': '',
    'Product Brand': '',
    'Product Description': '',
    'Product Price': productPrice,
})

# Remaining missing values per column after imputation.
walmart_df.isnull().sum()

In [None]:
# Discard the dataset's own 'Product Tags' values; the column is rebuilt further
# down from the category, brand and description text.
walmart_df['Product Tags'] = ''
walmart_df.head(1)

In [None]:
# Number of rows that are exact duplicates of an earlier row.
walmart_df.duplicated().sum()

In [None]:
# Turn the two string identifiers into integers by keeping the first run of
# digits found in each value (regex r'(\d+)'), going through float on the way.
# NOTE(review): two different ids can share the same first digit run, and the
# float round-trip cannot represent integers above 2**53 exactly - confirm the
# resulting ids are still unique enough for the collaborative-filtering step.
for id_column in ('Uniq Id', 'Product Id'):
    walmart_df[id_column] = (
        walmart_df[id_column]
        .str.extract(r'(\d+)', expand=False)
        .astype(float)
        .astype(int)
    )

# Review counts were filled with 0 where missing, so an int cast is now possible.
walmart_df['Product Reviews Count'] = walmart_df['Product Reviews Count'].astype(int)
walmart_df.head(2)

In [None]:
# Cardinality of the three columns later used for collaborative filtering.
num_users = walmart_df['Uniq Id'].nunique()
num_items = walmart_df['Product Id'].nunique()
# nunique() counts distinct rating VALUES, not how many ratings exist.
num_ratings = walmart_df['Product Rating'].nunique()
print(f"Number of unique users: {num_users}")
print(f"Number of unique items: {num_items}")
print(f"Number of unique ratings: {num_ratings}")

In [None]:
# Column labels of the working frame.
walmart_df.columns

In [None]:
# Scratch frame that receives one cleaned-tag column per source column.
tages_df = pd.DataFrame()
# Text columns whose words are turned into tags.
columns_to_extract_tags_from = ['Product Category', 'Product Brand', 'Product Description']
# English spaCy pipeline used by clean_and_extract_tags to split text into tokens.
spacyNLP = spacy.load("en_core_web_sm")
def clean_and_extract_tags(text):
    """Turn free text into a comma-separated string of tag words.

    The text is lower-cased and tokenised; a token is kept when it is purely
    alphanumeric and is not in spaCy's English ``STOP_WORDS``.

    Args:
        text: Text to convert. Anything that is not a non-empty ``str``
            (for example a NaN from an unfilled column) yields ``''``.

    Returns:
        The kept tokens joined with ``', '``, in their original order.
    """
    if not isinstance(text, str) or not text:
        return ''

    # Only token.text is used, so run the tokenizer alone instead of the whole
    # pipeline; the remaining components add annotations that are never read
    # here and dominate the runtime when applied to every row.
    tokens = spacyNLP.make_doc(text.lower())
    return ', '.join(
        token.text
        for token in tokens
        if token.text.isalnum() and token.text not in STOP_WORDS
    )



# Fill tages_df with one column of cleaned tags per configured text column.
for source_column in columns_to_extract_tags_from:
    cleaned_tags = walmart_df[source_column].map(clean_and_extract_tags)
    tages_df[source_column] = cleaned_tags


In [None]:
# 'Product Tags' was blanked earlier, so this is expected to still be empty.
walmart_df['Product Tags'].head(2)

In [None]:
# Preview of the per-column tags produced by clean_and_extract_tags.
tages_df.head(2)

In [None]:
# Merge the category, brand and description tags of each row into one
# ', '-separated string and store it as the product's tags.
per_column_tags = tages_df[columns_to_extract_tags_from]
walmart_df['Product Tags'] = per_column_tags.apply(', '.join, axis=1)

In [None]:
# 'Product Tags' after combining the category, brand and description tags.
walmart_df['Product Tags'].head(2)

In [None]:
# 'Product Image Url' can hold several URLs separated by '|'; keep the first.
# Vectorised .str access replaces the row-by-row loop over the private
# _get_value/_set_value helpers and leaves a missing URL as NaN instead of
# raising AttributeError on a non-string value.
walmart_df['Product Image Url'] = walmart_df['Product Image Url'].str.split('|').str[0]



In [None]:
# Image URL column after trimming each value to its first '|'-separated entry.
walmart_df['Product Image Url'].head(2)

# Content-Based Recommendation System (User Preferences or Item Similarities)

In [None]:
# Vectorise every product's tag string with TF-IDF (English stop words removed).
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_vectorizer_matrix = tfidf_vectorizer.fit_transform(walmart_df['Product Tags'])

# Item-item similarity, computed two ways. With the second argument omitted
# both helpers compare the matrix against itself.
# NOTE(review): TfidfVectorizer L2-normalises rows by default, so the two
# matrices are presumably numerically equivalent - confirm before keeping both.
cosine_similarities = cosine_similarity(tfidf_vectorizer_matrix)
linear_kernel_similarities = linear_kernel(tfidf_vectorizer_matrix)

In [None]:
# Inspect the item-item linear-kernel similarity matrix.
linear_kernel_similarities

In [None]:
# Sample inputs for the example calls below: two product names and the number
# of recommendations to request.
item_name = 'OPI Infinite Shine, Nail Lacquer Nail Polish, Bubble Bath'
item_name2='Kokie Professional Matte Lipstick, Hot Berry, 0.14 fl oz'
top_n=10

# Function to Recommend Products (Content-Based) Using cosine_similarity

In [None]:
def get_content_based_cosine_similarity_recommendationsDF(content, top_n=10):
    """Return the products most similar to ``content`` by cosine similarity.

    This is the wide variant: the result also carries the description,
    category and tags columns.

    Args:
        content: Exact 'Product Name' to look up. When several rows share the
            name, the first one is used.
        top_n: Maximum number of recommendations to return.

    Returns:
        Rows of ``walmart_df`` ordered by descending similarity (ties keep
        catalogue order), never including the looked-up row itself. An empty
        DataFrame when ``content`` is not a known product name.
    """
    # Row POSITIONS (not index labels) of the matching products, so the lookup
    # into the similarity matrix stays correct even if walmart_df's index is
    # not a plain 0..n-1 range.
    matches = (walmart_df['Product Name'] == content).to_numpy().nonzero()[0]
    if matches.size == 0:
        print(f"Item '{content}' not found in the training data.")
        return pd.DataFrame()  # Return an empty DataFrame if no match is found

    position = matches[0]
    scores = cosine_similarities[position]

    # Stable descending order. The query row is removed explicitly rather than
    # by dropping rank 0: an exact duplicate also scores 1.0 and may sort ahead
    # of the query row, in which case "skip the first" would return the product
    # itself as a recommendation.
    ranked = (-scores).argsort(kind='stable')
    similar_indices = ranked[ranked != position][:top_n]

    return walmart_df.iloc[similar_indices][[
        'Product Id', 'Product Name', 'Product Brand', 'Product Description',
        'Product Category', 'Product Image Url', 'Product Price',
        'Product Rating', 'Product Tags', 'Product Reviews Count',
    ]]


In [None]:
# Build one long table holding the top-10 content-based recommendations of
# every product, tagged with the product they were generated for.
# NOTE(review): the helper looks products up by name, so rows that share a
# 'Product Name' all receive the recommendations of the first such row.
all_recommendations = []

for product_name in walmart_df['Product Name']:
    recommendations = get_content_based_cosine_similarity_recommendationsDF(product_name, top_n=10)
    if recommendations.empty:
        continue
    # assign() returns a new frame, so the slice handed back by the helper is
    # not mutated (mutating it triggers SettingWithCopyWarning).
    all_recommendations.append(
        recommendations.assign(**{'Source Product': product_name})
    )

# Combine all results into a single DataFrame; pd.concat raises on an empty
# list, so fall back to an empty frame when nothing was recommended.
if all_recommendations:
    similarity_recommendationsDF = pd.concat(all_recommendations, ignore_index=True)
else:
    similarity_recommendationsDF = pd.DataFrame()

In [None]:
# First ten rows of the combined recommendation table.
similarity_recommendationsDF.head(10)

In [None]:
def get_content_based_cosine_similarity_recommendations(content, top_n=10):
    """Return the products most similar to ``content`` by cosine similarity.

    Compact variant: the result carries only id, name, brand, image URL,
    price, rating and review count.

    Args:
        content: Exact 'Product Name' to look up. When several rows share the
            name, the first one is used.
        top_n: Maximum number of recommendations to return.

    Returns:
        Rows of ``walmart_df`` ordered by descending similarity (ties keep
        catalogue order), never including the looked-up row itself. An empty
        DataFrame when ``content`` is not a known product name.
    """
    # Row positions (not index labels) of the matching products.
    matches = (walmart_df['Product Name'] == content).to_numpy().nonzero()[0]
    if matches.size == 0:
        print(f"Item '{content}' not found in the training data.")
        return pd.DataFrame()  # Return an empty DataFrame if no match is found

    # Everything below used to be indented under the `if` above, i.e. after
    # its `return`: it never ran and the function returned None for every
    # product that WAS found.
    position = matches[0]
    scores = cosine_similarities[position]

    # Stable descending order; drop the query row explicitly because a
    # duplicate product can tie with it at the top of the ranking.
    ranked = (-scores).argsort(kind='stable')
    similar_indices = ranked[ranked != position][:top_n]

    return walmart_df.iloc[similar_indices][[
        'Product Id', 'Product Name', 'Product Brand', 'Product Image Url',
        'Product Price', 'Product Rating', 'Product Reviews Count',
    ]]

In [None]:
# Example call: cosine-similarity recommendations for item_name.
get_content_based_cosine_similarity_recommendations(item_name, top_n)

# Function to Recommend Products (Content-Based) Using linear_kernel

In [None]:
def get_content_based_linear_kernel_recommendations(content, top_n=10):
    """Return the products most similar to ``content`` by linear-kernel score.

    Args:
        content: Exact 'Product Name' to look up. When several rows share the
            name, the first one is used.
        top_n: Maximum number of recommendations to return.

    Returns:
        Rows of ``walmart_df`` ordered by descending score (ties keep
        catalogue order), never including the looked-up row itself. An empty
        DataFrame when ``content`` is not a known product name.
    """
    # Row positions (not index labels) of the matching products, so the lookup
    # into the similarity matrix does not depend on walmart_df's index labels.
    matches = (walmart_df['Product Name'] == content).to_numpy().nonzero()[0]
    if matches.size == 0:
        print(f"Item '{content}' not found in the training data.")
        return pd.DataFrame()

    position = matches[0]
    similarity_scores = linear_kernel_similarities[position]

    # Stable descending order gives a deterministic result on ties; the query
    # row is removed explicitly because a duplicate product can outrank it, in
    # which case "skip the first entry" would recommend the product itself.
    ranked = (-similarity_scores).argsort(kind='stable')
    similar_indices = ranked[ranked != position][:top_n]

    return walmart_df.iloc[similar_indices][[
        'Product Id', 'Product Name', 'Product Brand', 'Product Image Url',
        'Product Price', 'Product Rating', 'Product Reviews Count',
    ]]

In [None]:
# Example call: linear-kernel recommendations for item_name2.
get_content_based_linear_kernel_recommendations(item_name2, top_n)

# Exporting the Final Data

In [None]:
# Persist the cleaned catalogue as CSV and the per-product recommendation table
# as a JSON array of row objects (orient="records").
# NOTE(review): to_csv is called without index=False, so the frame's index is
# presumably written as an extra unnamed first column - confirm that is wanted.
walmart_df.to_csv("/content/drive/MyDrive/Colab Notebooks/Ml-Projects/recommendation-system/walmart_final.csv")
similarity_recommendationsDF.to_json("/content/drive/MyDrive/Colab Notebooks/Ml-Projects/recommendation-system/recommendations.json", orient="records")
# Disabled alternative exports, kept for reference (the first path ends in
# ".cvs", presumably a typo for ".csv").
#similarity_recommendationsDF.to_csv("/content/drive/MyDrive/Colab Notebooks/Ml-Projects/recommendation-system/walmart_final_similarity_recommendations.cvs")
#walmart_df.to_excel("/content/drive/MyDrive/Colab Notebooks/Ml-Projects/recommendation-system/walmart_final.xlsx")

# Collaborative Filtering

In [None]:
# Surprise dataset of (user, item, rating) triples on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))

# Only rows with a rating inside the declared scale are used: missing ratings
# were filled with 0 during cleaning, and feeding those placeholders to the
# model as if they were real ratings would drag its estimates down.
collaborativeFiltering_df = Dataset.load_from_df(
    walmart_df.loc[
        walmart_df['Product Rating'].between(1, 5),
        ['Uniq Id', 'Product Id', 'Product Rating'],
    ],
    reader,
)

In [None]:
# Train an SVD model on every available rating (no hold-out split).
# NOTE(review): SVD() is created without a seed, so fitted factors presumably
# differ from run to run - confirm whether reproducibility matters here.
trainset = collaborativeFiltering_df.build_full_trainset()
algo = SVD()
algo.fit(trainset)

def get_collaborative_filtering_recommendations(user_id, top_n):
    """Recommend the products the fitted SVD model rates highest for a user.

    Args:
        user_id: Raw user id ('Uniq Id' value) as it was fed to the trainset.
        top_n: Maximum number of products to return.

    Returns:
        Rows of ``walmart_df`` (one per product id) ordered by descending
        predicted rating. Empty when the user is not part of the trainset or
        has no unrated products left.
    """
    columns = ['Product Id', 'Product Name', 'Product Brand',
               'Product Image Url', 'Product Rating', 'Product Reviews Count']

    try:
        inner_uid = trainset.to_inner_uid(user_id)
    except ValueError:
        # Unknown user: there is nothing to predict for.
        return walmart_df.iloc[0:0][columns]

    # Candidates are the items this user has not rated. Building them for this
    # one user avoids materialising the anti-testset of EVERY user (users x
    # items tuples) on each call only to filter almost all of it away.
    already_rated = {inner_iid for inner_iid, _ in trainset.ur[inner_uid]}
    candidates = [
        (user_id, trainset.to_raw_iid(inner_iid), trainset.global_mean)
        for inner_iid in trainset.all_items()
        if inner_iid not in already_rated
    ]

    predictions = algo.test(candidates)
    predictions.sort(key=lambda prediction: prediction.est, reverse=True)
    top_item_ids = [prediction.iid for prediction in predictions[:top_n]]
    if not top_item_ids:
        return walmart_df.iloc[0:0][columns]

    # prediction.iid is a 'Product Id' VALUE, not a row position, so rows are
    # selected by matching that column (the previous .iloc lookup returned
    # unrelated rows or raised IndexError). One row per product is kept and
    # the rows are put back into predicted-rating order.
    rank_of = {item_id: rank for rank, item_id in enumerate(top_item_ids)}
    catalogue = walmart_df.drop_duplicates(subset='Product Id')
    picked = catalogue[catalogue['Product Id'].isin(top_item_ids)]
    order = picked['Product Id'].map(rank_of).to_numpy().argsort(kind='stable')
    return picked.iloc[order][columns]

In [None]:
# Example call: top-2 collaborative-filtering recommendations for user 70.
user_id =70
get_collaborative_filtering_recommendations(user_id, 2)

# Hybrid Approach

In [None]:
def get_hybrid_recommendations(user_id, content, top_n):
    """Blend content-based and collaborative-filtering recommendations.

    The two ranked lists are interleaved (best content-based, best
    collaborative, second-best content-based, ...) and de-duplicated by
    'Product Id' before the first ``top_n`` rows are returned.

    Args:
        user_id: User id passed to the collaborative-filtering recommender.
        content: Product name passed to the content-based recommender.
        top_n: Number of rows requested from each recommender and maximum
            number of rows returned.

    Returns:
        DataFrame with at most ``top_n`` rows. Columns that only one of the
        recommenders provides are NaN for rows coming from the other.
    """
    content_based_recommendations = get_content_based_linear_kernel_recommendations(content, top_n)
    collaborative_filtering_recommendations = get_collaborative_filtering_recommendations(user_id, top_n)

    # Tag each row with its rank inside its own list. A plain concat followed
    # by [:top_n] would always return only the content-based rows, because
    # that list alone already holds top_n entries.
    ranked_frames = [
        frame.assign(_hybrid_rank=range(len(frame)))
        for frame in (content_based_recommendations, collaborative_filtering_recommendations)
        if not frame.empty
    ]
    if not ranked_frames:
        return pd.DataFrame()

    hybrid_recommendations = (
        pd.concat(ranked_frames)
        # A stable sort by rank interleaves the lists, content-based first.
        .sort_values('_hybrid_rank', kind='stable')
        # The two frames have different columns, so whole-row comparison never
        # matches across them; de-duplicate on the product id instead.
        .drop_duplicates(subset='Product Id')
        .drop(columns='_hybrid_rank')
    )

    return hybrid_recommendations[:top_n]



In [None]:
# Example: hybrid recommendations for one user and one seed product.
user_id = 70
item_name = 'OPI Infinite Shine, Nail Lacquer Nail Polish, Bubble Bath'
item_name2='Kokie Professional Matte Lipstick, Hot Berry, 0.14 fl oz'
top_n=2

recommendations = get_hybrid_recommendations(user_id, item_name, top_n)

# Report the product that was actually passed above (item_name); the message
# previously named item_name2, which is not used in this call.
print(f"Hybrid Recommendations for User {user_id} based on Product {item_name}:")
recommendations

# Split DataFrame into Train and Test

In [None]:
#walmartTrainTest_df = walmart_df
#walmartTrainTest_df.head(2)

In [None]:
#x = walmartTrainTest_df.drop(['Product Tags'], axis=1)
#y = walmartTrainTest_df['Product Tags']

In [None]:
#from sklearn.model_selection import train_test_split

# Save Model

In [None]:
import pickle

# Serialise the results to Drive. `with` closes each file as soon as the dump
# finishes; the bare open(...) calls left both handles open.
# NOTE(review): the first file is named "cosine_similarities" but stores the
# recommendation table (similarity_recommendationsDF), not the cosine matrix.
with open('/content/drive/MyDrive/Colab Notebooks/Ml-Projects/recommendation-system/walmart_cosine_similarities.pickle', 'wb') as cosine_file:
    pickle.dump(similarity_recommendationsDF, cosine_file)

with open('/content/drive/MyDrive/Colab Notebooks/Ml-Projects/recommendation-system/walmart_linear_kernel_similarities.pickle', 'wb') as linear_kernel_file:
    pickle.dump(linear_kernel_similarities, linear_kernel_file)
