## Import Libraries


In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
product_data = pd.read_csv('Products.csv')
product_data.head()

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


In [3]:
customer_data = pd.read_csv('Customers.csv')
customer_data.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [4]:
transaction_data = pd.read_csv('Transactions.csv')
transaction_data.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


In [5]:
# Data Overview
print("Customers Data Info:")
print(customer_data.info())
print("\nProducts Data Info:")
print(product_data.info())
print("\nTransactions Data Info:")
print(transaction_data.info())

Customers Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   CustomerID    200 non-null    object
 1   CustomerName  200 non-null    object
 2   Region        200 non-null    object
 3   SignupDate    200 non-null    object
dtypes: object(4)
memory usage: 6.4+ KB
None

Products Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   ProductID    100 non-null    object 
 1   ProductName  100 non-null    object 
 2   Category     100 non-null    object 
 3   Price        100 non-null    float64
dtypes: float64(1), object(3)
memory usage: 3.2+ KB
None

Transactions Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column    

In [6]:
customer_data.isnull().sum()

CustomerID      0
CustomerName    0
Region          0
SignupDate      0
dtype: int64

In [7]:
product_data.isnull().sum()

ProductID      0
ProductName    0
Category       0
Price          0
dtype: int64

In [8]:
transaction_data.isnull().sum()

TransactionID      0
CustomerID         0
ProductID          0
TransactionDate    0
Quantity           0
TotalValue         0
Price              0
dtype: int64

In [9]:
# Check for duplicates
print("\nDuplicate Records:")
print(f"Customers: {customer_data.duplicated().sum()}, Products: {product_data.duplicated().sum()}, Transactions: {transaction_data.duplicated().sum()}")



Duplicate Records:
Customers: 0, Products: 0, Transactions: 0


In [10]:
# Summary statistics
print("\nSummary Statistics - Transactions:")
print(transaction_data.describe())


Summary Statistics - Transactions:
          Quantity   TotalValue       Price
count  1000.000000  1000.000000  1000.00000
mean      2.537000   689.995560   272.55407
std       1.117981   493.144478   140.73639
min       1.000000    16.080000    16.08000
25%       2.000000   295.295000   147.95000
50%       3.000000   588.880000   299.93000
75%       4.000000  1011.660000   404.40000
max       4.000000  1991.040000   497.76000


In [11]:
# Merge datasets for deeper analysis
df = transaction_data.merge(customer_data, on='CustomerID').merge(product_data, on='ProductID')
df.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [13]:
# Aggregate purchase history per customer
df['ProductCategory'] = df['Category'].astype(str)
cust_purchases = df.groupby('CustomerID')['ProductCategory'].apply(lambda x: ' '.join(x)).reset_index()
cust_purchases

Unnamed: 0,CustomerID,ProductCategory
0,C0001,Books Home Decor Electronics Electronics Elect...
1,C0002,Home Decor Home Decor Clothing Clothing
2,C0003,Home Decor Home Decor Clothing Electronics
3,C0004,Books Home Decor Home Decor Home Decor Books B...
4,C0005,Home Decor Electronics Electronics
...,...,...
194,C0196,Books Clothing Home Decor Home Decor
195,C0197,Home Decor Electronics Electronics
196,C0198,Electronics Clothing
197,C0199,Electronics Home Decor Home Decor Electronics


In [16]:
# Convert purchase history into TF-IDF features
vectorizer = TfidfVectorizer()
purchase_matrix = vectorizer.fit_transform(cust_purchases['ProductCategory'])

In [19]:
purchase_matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 705 stored elements and shape (199, 5)>

In [20]:
# Compute similarity scores
similarity_matrix = cosine_similarity(purchase_matrix)
cust_ids = cust_purchases['CustomerID'].tolist()

In [21]:
# Generate top 3 similar customers for first 20 customers
lookalike_dict = {}
for i in range(min(20, len(cust_ids))):
    customer_id = cust_ids[i]
    similar_indices = similarity_matrix[i].argsort()[-4:-1][::-1]  # Exclude self
    similar_customers = [(cust_ids[idx], round(similarity_matrix[i][idx], 4)) for idx in similar_indices]
    lookalike_dict[customer_id] = similar_customers

In [22]:
# Save results to CSV
lookalike_df = pd.DataFrame.from_dict(lookalike_dict, orient='index', columns=['SimilarCustomer1', 'SimilarCustomer2', 'SimilarCustomer3'])
lookalike_df.to_csv("Lookalike.csv", index_label='CustomerID')

In [24]:
lookalike_df.head(10)

Unnamed: 0,SimilarCustomer1,SimilarCustomer2,SimilarCustomer3
C0001,"(C0035, 0.9822)","(C0146, 0.9822)","(C0045, 0.9669)"
C0002,"(C0133, 1.0)","(C0002, 1.0)","(C0134, 0.9801)"
C0003,"(C0158, 1.0)","(C0031, 1.0)","(C0003, 1.0)"
C0004,"(C0085, 0.9876)","(C0047, 0.9839)","(C0172, 0.9742)"
C0005,"(C0005, 1.0)","(C0007, 1.0)","(C0127, 0.9661)"
C0006,"(C0006, 1.0)","(C0147, 0.9815)","(C0139, 0.9696)"
C0007,"(C0005, 1.0)","(C0007, 1.0)","(C0127, 0.9661)"
C0008,"(C0154, 0.9862)","(C0181, 0.9862)","(C0039, 0.9862)"
C0009,"(C0040, 1.0)","(C0092, 0.9903)","(C0049, 0.9682)"
C0010,"(C0077, 0.9913)","(C0176, 0.9913)","(C0083, 0.9913)"


In [30]:
# Product Recommendation Function
def recommend_products(customer_id):
    if customer_id not in cust_ids:
        print("Customer ID not found.")
        return
    idx = cust_ids.index(customer_id)
    similar_idx = similarity_matrix[idx].argsort()[-2]  # Get most similar customer
    similar_customer = cust_ids[similar_idx]
    recommended_products = df[df['CustomerID'] == similar_customer]['ProductName'].unique()
    print(f"Recommended products for Customer {customer_id}: {', '.join(recommended_products)}")

# Ask user for customer ID input
user_input = input("Enter Customer ID for product recommendations: ")
recommend_products(user_input)

Recommended products for Customer C0009: ActiveWear Headphones, ActiveWear Jacket
