In [23]:
# Import necessary libraries
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import numpy as np

# Load the Dataset

In [24]:
# Read the three input tables (customers, transactions, products) from CSV
# files located in the notebook's working directory.
customers, transactions, products = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Transactions.csv', 'Products.csv')
)

# Merge Datasets

In [25]:
# Build one wide table: each transaction row is enriched with the attributes
# of its product and of its customer. Both joins are left joins, so every
# transaction row is kept even if a lookup finds no match.
# Note: `Price` exists in both transactions and products, so pandas emits
# the suffixed columns Price_x / Price_y.
merged_df = (
    transactions
    .merge(products, how='left', on='ProductID')
    .merge(customers, how='left', on='CustomerID')
)
merged_df

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y,CustomerName,Region,SignupDate
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Andrea Jenkins,Europe,2022-12-03
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Brittany Harvey,Asia,2024-09-04
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Kathryn Stevens,Europe,2024-04-04
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Travis Campbell,South America,2024-04-11
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Timothy Perez,Europe,2022-03-15
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,T00496,C0118,P037,2024-10-24 08:30:27,1,459.86,459.86,SoundWave Smartwatch,Electronics,459.86,Jacob Holt,South America,2022-01-22
996,T00759,C0059,P037,2024-06-04 02:15:24,3,1379.58,459.86,SoundWave Smartwatch,Electronics,459.86,Mrs. Kimberly Wright,North America,2024-04-07
997,T00922,C0018,P037,2024-04-05 13:05:32,4,1839.44,459.86,SoundWave Smartwatch,Electronics,459.86,Tyler Haynes,North America,2024-09-21
998,T00959,C0115,P037,2024-09-29 10:16:02,2,919.72,459.86,SoundWave Smartwatch,Electronics,459.86,Joshua Hamilton,Asia,2024-11-11


# Create Transaction History

In [26]:
# Collapse the merged table to one row per customer with three features:
#   Quantity   - total units bought
#   TotalValue - total amount spent
#   Category   - number of distinct product categories bought from
per_customer = merged_df.groupby('CustomerID')
transaction_history = per_customer.agg(
    Quantity=('Quantity', 'sum'),
    TotalValue=('TotalValue', 'sum'),
    Category=('Category', 'nunique'),
).reset_index()

# Preview the per-customer features
print(transaction_history.head())


  CustomerID  Quantity  TotalValue  Category
0      C0001        12     3354.52         3
1      C0002        10     1862.74         2
2      C0003        14     2725.38         3
3      C0004        23     5354.88         3
4      C0005         7     2034.24         2


# Normalize Features

In [27]:
# Standardize the three aggregate features (zero mean, unit variance) before
# similarity scoring. The scaled values overwrite the raw aggregates in
# transaction_history.
feature_columns = ['Quantity', 'TotalValue', 'Category']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(transaction_history[feature_columns])
transaction_history[feature_columns] = scaled_values

# Preview the standardized features
print(transaction_history.head())


  CustomerID  Quantity  TotalValue  Category
0      C0001 -0.122033   -0.061701  0.160540
1      C0002 -0.448000   -0.877744 -0.904377
2      C0003  0.203934   -0.405857  0.160540
3      C0004  1.670787    1.032547  0.160540
4      C0005 -0.936951   -0.783929 -0.904377


# Compute Cosine Similarity

In [28]:
# Pairwise cosine similarity between the customers' feature vectors.
# The identifier column is set aside so only numeric features are compared,
# then reused to label both axes of the resulting square matrix.
customer_ids = transaction_history['CustomerID']
feature_matrix = transaction_history.drop(columns='CustomerID')
similarity_matrix = cosine_similarity(feature_matrix)
similarity_df = pd.DataFrame(similarity_matrix, index=customer_ids, columns=customer_ids)

# Print the first rows as a sanity check
print(similarity_df.head())


CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000 -0.128906  0.255218 -0.581904  0.054656  0.153945   
C0002      -0.128906  1.000000  0.185745 -0.682900  0.947223 -0.736899   
C0003       0.255218  0.185745  1.000000 -0.055367 -0.024722 -0.754068   
C0004      -0.581904 -0.682900 -0.055367  1.000000 -0.841335  0.275903   
C0005       0.054656  0.947223 -0.024722 -0.841335  1.000000 -0.507986   

CustomerID     C0007     C0008     C0009     C0010  ...     C0191     C0192  \
CustomerID                                          ...                       
C0001      -0.076667  0.068403  0.278524 -0.255435  ... -0.456474  0.722137   
C0002       0.925146 -0.860518  0.895945  0.968247  ...  0.891149  0.592890   
C0003      -0.171144  0.305963  0.091381  0.342796  ... -0.209427  0.335503   
C0004      -0.765629  0.757703 -0.935443 -0.513171  ... -0.457921 -0.949299   
C0005  

# Find Top 3 Lookalikes for the First 20 Customers

In [29]:
# Find the top 3 lookalike customers for each of the first 20 customers.
# Result: `lookalikes` maps CustomerID -> [(other CustomerID, cosine score), ...],
# ordered from most to least similar.
lookalikes = {}
for customer_id in customers['CustomerID'][:20]:  # first 20 rows; assumes the file lists customer IDs in sorted order
    if customer_id not in similarity_df.index:
        # Customers with no transactions have no feature vector, hence no
        # row/column in the similarity matrix.
        continue
    # Remove the customer's own entry by label instead of assuming it sorts
    # first: another customer can tie with a score of 1.0, and an all-zero
    # feature vector has self-similarity 0, so a positional `iloc[1:4]` could
    # keep "self" in the results or discard a genuine match.
    similarity_scores = similarity_df[customer_id].drop(labels=customer_id).nlargest(3)
    lookalikes[customer_id] = list(zip(similarity_scores.index.tolist(), similarity_scores.values.tolist()))


# Create Lookalike DataFrame

In [30]:
# Turn the lookalike dictionary into a table: one row per source customer,
# one column per rank. Each cell holds a (CustomerID, score) tuple.
rank_columns = ['Top_1', 'Top_2', 'Top_3']
lookalike_df = pd.DataFrame.from_dict(
    lookalikes,
    orient='index',
    columns=rank_columns,
)

# Persist the table; the row index is written under the header 'CustomerID'
lookalike_df.to_csv('Lookalike.csv', index_label='CustomerID')

# Preview the first rows
print(lookalike_df.head())


                             Top_1                        Top_2  \
C0001  (C0164, 0.9483103972840492)   (C0056, 0.935848588429012)   
C0002   (C0073, 0.997928748038141)  (C0197, 0.9928926281197697)   
C0003  (C0047, 0.9447404341219031)  (C0106, 0.9435035461271949)   
C0004  (C0165, 0.9994054329613362)  (C0084, 0.9907744001327368)   
C0005   (C0131, 0.999624303410505)  (C0058, 0.9996081115000927)   

                             Top_3  
C0001  (C0127, 0.9240311145070983)  
C0002  (C0199, 0.9917489606462391)  
C0003   (C0030, 0.934017142617637)  
C0004  (C0018, 0.9877511817104291)  
C0005   (C0097, 0.998687970720145)  


# Generate Lookalike Map

In [31]:
# Re-shape the lookalikes into a map of
#   source CustomerID -> list of {'cust_id': ..., 'score': ...} records,
# preserving the ranking order of each list.
lookalike_map = {
    source_id: [
        {'cust_id': neighbour_id, 'score': similarity}
        for neighbour_id, similarity in neighbours
    ]
    for source_id, neighbours in lookalikes.items()
}

# Show the resulting structure
print("Lookalike Map:")
print(lookalike_map)


Lookalike Map:
{'C0001': [{'cust_id': 'C0164', 'score': 0.9483103972840492}, {'cust_id': 'C0056', 'score': 0.935848588429012}, {'cust_id': 'C0127', 'score': 0.9240311145070983}], 'C0002': [{'cust_id': 'C0073', 'score': 0.997928748038141}, {'cust_id': 'C0197', 'score': 0.9928926281197697}, {'cust_id': 'C0199', 'score': 0.9917489606462391}], 'C0003': [{'cust_id': 'C0047', 'score': 0.9447404341219031}, {'cust_id': 'C0106', 'score': 0.9435035461271949}, {'cust_id': 'C0030', 'score': 0.934017142617637}], 'C0004': [{'cust_id': 'C0165', 'score': 0.9994054329613362}, {'cust_id': 'C0084', 'score': 0.9907744001327368}, {'cust_id': 'C0018', 'score': 0.9877511817104291}], 'C0005': [{'cust_id': 'C0131', 'score': 0.999624303410505}, {'cust_id': 'C0058', 'score': 0.9996081115000927}, {'cust_id': 'C0097', 'score': 0.998687970720145}], 'C0006': [{'cust_id': 'C0079', 'score': 0.9999270505443565}, {'cust_id': 'C0187', 'score': 0.9829896907918099}, {'cust_id': 'C0196', 'score': 0.9774255533167279}], 'C000