<a href="https://colab.research.google.com/github/mrbeigh/Recommendation-Engine-for-Customer-Purchases/blob/main/ResoluteAI_Assignment_Recommend.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install scikit-surprise


Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3156224 sha256=e489c8dac3b0e5808eb51321e1c362b3473aa26a1137c0a002c02836bbf6f374
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer
from surprise import Dataset, Reader, KNNBasic
from datetime import datetime

# Read the Online Retail workbook from the mounted Google Drive folder.
# The path is specific to the author's Drive layout; adjust it when running elsewhere.
file_path = '/content/drive/MyDrive/Code Playground/ResoluteAI_Recommender/Online Retail.xlsx'
data = pd.read_excel(io=file_path, engine='openpyxl')

# Define a custom function to check for non-text descriptions
def is_non_text(description):
    """Return True when ``description`` is not a purely alphabetic string.

    Non-string values (for example missing values) are always non-text.
    A string counts as text only when ``str.isalpha()`` is True, so the empty
    string and any string containing spaces, digits or punctuation are
    reported as non-text as well.
    """
    return not (isinstance(description, str) and description.isalpha())

# Flag every row whose Description fails the text check, then keep the rest.
non_text_mask = data['Description'].apply(is_non_text)
data = data[~non_text_mask]


In [None]:
# Check the format of the CustomerID column.
# CustomerID holds numeric values such as 12583.0, for which str(x).isdigit() is
# always False; a value is valid when it is present and is a whole number.
customer_id_numeric = pd.to_numeric(data['CustomerID'], errors='coerce')
customer_id_valid = customer_id_numeric.notna() & (customer_id_numeric % 1 == 0)
customer_id_format = customer_id_valid.map({True: 'Valid', False: 'Invalid'})
print(customer_id_format.value_counts())


# Check the format of the InvoiceDate column.
# Values may already be datetimes rather than strings, so a value is valid when
# it can be interpreted as a datetime (unparseable or missing values become NaT).
invoice_date_valid = pd.to_datetime(data['InvoiceDate'], errors='coerce').notna()
invoice_date_format = invoice_date_valid.map({True: 'Valid', False: 'Invalid'})
print(invoice_date_format.value_counts())


Invalid    2491
Name: CustomerID, dtype: int64
Invalid    2491
Name: InvoiceDate, dtype: int64


In [None]:
# Drop rows with missing values in the CustomerID and InvoiceDate columns.
# Reassigning (instead of inplace=True) keeps the cell safe to re-run.
data = data.dropna(subset=['CustomerID', 'InvoiceDate'])

# Convert InvoiceDate to pandas datetimes. The format string is month-first
# (MM/DD/YY HH:MM). assign() builds a new frame rather than writing into a
# filtered slice of the original.
data = data.assign(
    InvoiceDate=pd.to_datetime(data['InvoiceDate'], format='%m/%d/%y %H:%M')
)

# Display the first few rows of the cleaned and formatted data
print(data.head())

     InvoiceNo StockCode Description  Quantity         InvoiceDate  UnitPrice  \
45      536370      POST     POSTAGE         3 2010-12-01 08:45:00       18.0   
141    C536379         D    Discount        -1 2010-12-01 09:41:00       27.5   
386     536403      POST     POSTAGE         1 2010-12-01 11:27:00       15.0   
1123    536527      POST     POSTAGE         1 2010-12-01 13:04:00       18.0   
1423    536540        C2    CARRIAGE         1 2010-12-01 14:05:00       50.0   

      CustomerID         Country  
45       12583.0          France  
141      14527.0  United Kingdom  
386      12791.0     Netherlands  
1123     12662.0         Germany  
1423     14911.0            EIRE  


In [None]:
# Re-check the format of the CustomerID column after cleaning.
# CustomerID holds numeric values such as 12583.0, for which str(x).isdigit() is
# always False; a value is valid when it is present and is a whole number.
customer_id_numeric = pd.to_numeric(data['CustomerID'], errors='coerce')
customer_id_valid = customer_id_numeric.notna() & (customer_id_numeric % 1 == 0)
customer_id_format = customer_id_valid.map({True: 'Valid', False: 'Invalid'})
print(customer_id_format.value_counts())


# Re-check the format of the InvoiceDate column after cleaning.
# The column holds datetimes at this point, so a string-type test would reject
# every row; a value is valid when it can be interpreted as a datetime.
invoice_date_valid = pd.to_datetime(data['InvoiceDate'], errors='coerce').notna()
invoice_date_format = invoice_date_valid.map({True: 'Valid', False: 'Invalid'})
print(invoice_date_format.value_counts())


Invalid    2491
Name: CustomerID, dtype: int64
Invalid    2491
Name: InvoiceDate, dtype: int64


In [None]:
# Input customer ID and date
customer_id_str = input("Enter Customer ID: ")
input_date_str = input("Enter Invoice Date (DD/MM/YY HH:MM): ")

# input() always returns text, while the CustomerID column holds numbers, so the
# ID is converted here; otherwise equality checks against the column never match.
try:
    customer_id = int(customer_id_str.strip())
except ValueError:
    raise ValueError("Invalid Customer ID. Please enter a numeric ID such as 12662.") from None

# Parse the input date string to a datetime object.
# Raising (instead of calling exit()) stops the run without shutting down the kernel.
try:
    input_date = datetime.strptime(input_date_str.strip(), "%d/%m/%y %H:%M")
except ValueError:
    raise ValueError("Invalid date format. Please use the format DD/MM/YY HH:MM.") from None


Enter Customer ID: 12662
Enter Invoice Date (DD/MM/YY HH:MM): 6/5/11 13:45


In [None]:
# Filter data based on the customer ID and date.
# The customer ID may still be the raw text returned by input(), whereas the
# CustomerID column is numeric; comparing text with numbers never matches, so
# coerce first. An unparseable ID becomes NaN and simply matches no rows.
customer_key = pd.to_numeric(customer_id, errors='coerce')
filtered_data = data[(data['CustomerID'] == customer_key) & (data['InvoiceDate'] == input_date)]

# Debugging: Print the filtered data for inspection
print("Debug: Filtered Data:")
print(filtered_data)


Debug: Customer ID = 12662
Debug: Input Date = 2011-05-06 13:45:00
Debug: Filtered Data:
Empty DataFrame
Columns: [InvoiceNo, StockCode, Description, Quantity, InvoiceDate, UnitPrice, CustomerID, Country]
Index: []


In [None]:
# Collaborative Filtering
# User-based filtering needs the purchase history of many customers, so the
# model is trained on the full cleaned data set rather than on the slice for a
# single customer and date.
RATING_MIN, RATING_MAX = 1, 5

# Quantity is used as an implicit rating: sum it per customer/item pair, keep
# only net-positive purchases (returns have negative quantities) and clip the
# result into the rating scale declared to the Reader.
# sort=False avoids ordering StockCode values, which mix text and integers.
ratings = (
    data.dropna(subset=['CustomerID'])
    .groupby(['CustomerID', 'StockCode'], as_index=False, sort=False)['Quantity']
    .sum()
    .query('Quantity > 0')
    .assign(Quantity=lambda frame: frame['Quantity'].clip(lower=RATING_MIN, upper=RATING_MAX))
)

reader = Reader(rating_scale=(RATING_MIN, RATING_MAX))
dataset = Dataset.load_from_df(ratings[['CustomerID', 'StockCode', 'Quantity']], reader)
trainset = dataset.build_full_trainset()

# Build a recommendation model (user-based collaborative filtering)
sim_options = {'name': 'cosine', 'user_based': True}
collab_model = KNNBasic(sim_options=sim_options)
collab_model.fit(trainset)


Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7bcc1767ba30>

In [None]:
# Content-based filtering: compare items through the words in their descriptions.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
description_text = data['Description'].fillna('')
tfidf_matrix = tfidf_vectorizer.fit_transform(description_text)
# One similarity row/column per row of `data`, in the same positional order.
# With no second argument the kernel is computed between the matrix and itself.
cosine_sim = linear_kernel(tfidf_matrix)


In [None]:
# Function to get content-based recommendations
def content_based_recommendations(stock_code, cosine_sim=cosine_sim, top_n=5):
    """Recommend items whose descriptions are most similar to ``stock_code``.

    Parameters
    ----------
    stock_code
        StockCode of the reference item.
    cosine_sim
        Square similarity matrix with one row/column per row of ``data``,
        in ``data``'s positional order.
    top_n
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        StockCode values of up to ``top_n`` distinct items, most similar
        first. Empty when ``stock_code`` does not occur in ``data``.
    """
    stock_codes = data['StockCode']

    # cosine_sim is addressed by row position, but data keeps its original
    # index labels after filtering, so look up the position rather than the label.
    matches = (stock_codes == stock_code).to_numpy().nonzero()[0]
    if len(matches) == 0:
        return stock_codes.iloc[[]]
    idx = matches[0]

    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)

    # data has one row per invoice line, so the same item appears many times;
    # skip the reference item and any item that has already been recommended.
    seen_codes = {stock_code}
    item_indices = []
    for position, _score in ranked:
        if len(item_indices) >= top_n:
            break
        code = stock_codes.iat[position]
        if code in seen_codes:
            continue
        seen_codes.add(code)
        item_indices.append(position)

    return stock_codes.iloc[item_indices]


In [None]:
# Example: Recommend items based on a product previously purchased by the customer
if not filtered_data.empty:
    previous_purchase = filtered_data['StockCode'].iloc[0]  # Assuming the customer purchased the first item
    # An expression nested inside an if-block is not auto-displayed by the
    # notebook, so the result has to be printed explicitly.
    print(content_based_recommendations(previous_purchase))
else:
    print("No data available for recommendations for the given customer and date.")


No data available for recommendations for the given customer and date.


In [None]:
# Collaborative Filtering Recommendations
def collaborative_filtering_recommendations(customer_id, model=collab_model, top_n=5):
    """Return the StockCodes with the highest predicted rating for a customer.

    Parameters
    ----------
    customer_id
        Customer to recommend for; numeric text such as ``"12662"`` is accepted.
    model
        Fitted Surprise algorithm used to estimate ratings.
    top_n
        Maximum number of StockCodes to return.

    Returns
    -------
    list
        Up to ``top_n`` StockCodes ordered by descending estimated rating.
        Items present in ``filtered_data`` are excluded.
    """
    # The model is trained on the numeric CustomerID column, so an ID typed in
    # as text would be treated as an unknown user; convert it when possible.
    try:
        user_key = float(customer_id)
    except (TypeError, ValueError):
        user_key = customer_id

    # Build the exclusion set once instead of recomputing unique() per item.
    already_purchased = set(filtered_data['StockCode'].unique())

    # model.test() takes (user, item, true_rating) tuples; 4.0 is a placeholder
    # for the true rating and is not what the ranking below is based on.
    items_to_predict = [
        (user_key, item_id, 4.0)
        for item_id in data['StockCode'].unique()
        if item_id not in already_purchased
    ]

    predictions = model.test(items_to_predict)
    predictions.sort(key=lambda prediction: prediction.est, reverse=True)
    return [prediction.iid for prediction in predictions[:top_n]]


In [None]:
# Example: collaborative-filtering recommendations for the customer entered above
collaborative_filtering_recommendations(customer_id=customer_id)


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


['POST', 'D', 'C2', 'M', 22734]