In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity


In [4]:
# Read the three source tables from CSV files in the working directory
customers, transactions, products = (
    pd.read_csv(csv_path)
    for csv_path in ('Customers.csv', 'Transactions.csv', 'Products.csv')
)

# Preview the first rows of each table, in load order
for table in (customers, transactions, products):
    print(table.head())


  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue   Price  
0      300.68  300.68  
1      300.68  300.68  
2      300.68  300.68  
3      601.36  300.68  
4      902.04  300.68  
  ProductID              ProductName     Category   Price
0      P001

In [7]:
# Attach the customer profile columns to every transaction (join key: CustomerID)
transactions_customers = pd.merge(transactions, customers, on='CustomerID')

# Attach the product columns as well (join key: ProductID).
# 'Price' exists in both inputs, so pandas suffixes it as Price_x / Price_y.
merged_data = pd.merge(transactions_customers, products, on='ProductID')

# Preview the combined table
print(merged_data.head())


  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x     CustomerName         Region  SignupDate  \
0      300.68   300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68   300.68  Brittany Harvey           Asia  2024-09-04   
2      300.68   300.68  Kathryn Stevens         Europe  2024-04-04   
3      601.36   300.68  Travis Campbell  South America  2024-04-11   
4      902.04   300.68    Timothy Perez         Europe  2022-03-15   

                       ProductName     Category  Price_y  
0  ComfortLiving Bluetooth Speaker  Electronics   300.68  
1  ComfortLiving Bluetooth Speaker

In [9]:
# Build one feature row per customer, then append the customer profile columns.
# The output column names deliberately match the source column names
# (so 'ProductID' here holds a distinct-product count, not an ID).
customer_features = (
    merged_data
    .groupby('CustomerID')
    .agg(
        TotalValue=('TotalValue', 'sum'),                     # total spending
        Quantity=('Quantity', 'sum'),                         # total units bought
        ProductID=('ProductID', 'nunique'),                   # number of distinct products
        Category=('Category', lambda cats: cats.mode()[0]),   # most frequent category across rows
    )
    .reset_index()
    .merge(customers, on='CustomerID')
)

# Preview the per-customer feature table
print(customer_features.head())


  CustomerID  TotalValue  Quantity  ProductID     Category  \
0      C0001     3354.52        12          5  Electronics   
1      C0002     1862.74        10          4     Clothing   
2      C0003     2725.38        14          4   Home Decor   
3      C0004     5354.88        23          8        Books   
4      C0005     2034.24         7          3  Electronics   

         CustomerName         Region  SignupDate  
0    Lawrence Carroll  South America  2022-07-10  
1      Elizabeth Lutz           Asia  2022-02-13  
2      Michael Rivera  South America  2024-03-07  
3  Kathleen Rodriguez  South America  2022-10-09  
4         Laura Weber           Asia  2022-08-15  


In [11]:
from sklearn.preprocessing import OneHotEncoder

# Dense one-hot representation of the categorical columns.
# (sparse_output is the current keyword; it replaces the older `sparse` argument.)
categorical_columns = ['Region', 'Category']
encoder = OneHotEncoder(sparse_output=False)
encoded_region_category = encoder.fit_transform(customer_features[categorical_columns])

# Place the numeric columns and the one-hot columns side by side
numerical_features = customer_features[['TotalValue', 'Quantity', 'ProductID']].to_numpy()
features = np.concatenate([numerical_features, encoded_region_category], axis=1)

# Standardize every column of the combined matrix.
# NOTE(review): the scaler is fit on the full matrix, so the one-hot columns
# are rescaled together with the numeric ones, not only the numeric features.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Preview the first five scaled feature rows
print(features_scaled[:5])


[[-0.06170143 -0.12203296  0.05004655 -0.53279543 -0.57928445 -0.54831888
   1.54041597 -0.71244351 -0.54056248  1.84992492 -0.51721942]
 [-0.87774353 -0.44800021 -0.42420409  1.87689298 -0.57928445 -0.54831888
  -0.6491753  -0.71244351  1.84992492 -0.54056248 -0.51721942]
 [-0.40585722  0.20393428 -0.42420409 -0.53279543 -0.57928445 -0.54831888
   1.54041597 -0.71244351 -0.54056248 -0.54056248  1.93341543]
 [ 1.03254704  1.67078689  1.47279848 -0.53279543 -0.57928445 -0.54831888
   1.54041597  1.40362005 -0.54056248 -0.54056248 -0.51721942]
 [-0.78392861 -0.93695108 -0.89845473  1.87689298 -0.57928445 -0.54831888
  -0.6491753  -0.71244351 -0.54056248  1.84992492 -0.51721942]]


In [13]:
# Cosine similarity between the scaled feature rows (one row per customer)
similarity_matrix = cosine_similarity(features_scaled)

# Label both axes with CustomerID so scores can be looked up by customer
customer_ids = customer_features['CustomerID']
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

# Preview the top-left corner of the similarity table
print(similarity_df.head())


CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000 -0.280048  0.254230  0.170413  0.344343  0.255980   
C0002      -0.280048  1.000000 -0.227233 -0.456337  0.449392 -0.360495   
C0003       0.254230 -0.227233  1.000000  0.115527 -0.209005  0.250921   
C0004       0.170413 -0.456337  0.115527  1.000000 -0.553407  0.644247   
C0005       0.344343  0.449392 -0.209005 -0.553407  1.000000 -0.306875   

CustomerID     C0007     C0008     C0009     C0010  ...     C0191     C0192  \
CustomerID                                          ...                       
C0001       0.350567 -0.230811 -0.231849 -0.299177  ...  0.272573  0.935567   
C0002       0.430970 -0.336212  0.466634  0.433026  ... -0.308954 -0.144032   
C0003      -0.224532  0.208759 -0.183540 -0.229427  ...  0.260536  0.274864   
C0004      -0.519245  0.224978 -0.640452 -0.433110  ...  0.624598 -0.080948   
C0005  

In [15]:
# Function to get top 3 similar customers for each customer
def get_top_3_similar(similarity_df, customer_id):
    """Return the three customers most similar to ``customer_id``.

    Parameters
    ----------
    similarity_df : pandas.DataFrame
        Pairwise similarity scores whose rows and columns are labelled by
        customer ID.
    customer_id : hashable
        Row label of the customer to find lookalikes for.

    Returns
    -------
    tuple[list, list]
        ``(ids, scores)`` ordered from most to least similar. Fewer than
        three entries are returned when fewer other customers exist.

    Raises
    ------
    KeyError
        If ``customer_id`` is not a row label of ``similarity_df``.
    """
    # Exclude the customer by label instead of assuming their own score sorts
    # to position 0: another customer can tie at 1.0 (identical features) or
    # floating-point rounding can leave the self-score slightly below a
    # neighbour's, in which case a positional skip would drop a genuine
    # lookalike and return the customer as their own match.
    others = similarity_df.loc[customer_id].drop(labels=customer_id, errors="ignore")

    # Stable sort keeps tied scores in their original column order, so the
    # result is deterministic from run to run.
    similar_customers = others.sort_values(ascending=False, kind="stable").iloc[:3]
    return similar_customers.index.tolist(), similar_customers.values.tolist()

# Map each customer ID to a list of (lookalike_id, similarity_score) pairs
lookalike_dict = {
    cust: list(zip(*get_top_3_similar(similarity_df, cust)))
    for cust in customer_features['CustomerID']
}

# Show the lookalikes of the first five customers in the mapping
preview_items = list(lookalike_dict.items())[:5]
for customer_id, lookalikes in preview_items:
    print(f"Customer {customer_id}: {lookalikes}")


Customer C0001: [('C0048', 0.995290637262074), ('C0190', 0.9906194089322833), ('C0184', 0.9844187291016829)]
Customer C0002: [('C0088', 0.98787179707641), ('C0092', 0.9694148762001229), ('C0106', 0.9596061001930802)]
Customer C0003: [('C0076', 0.9701860830785703), ('C0031', 0.9677597585275075), ('C0052', 0.9640448202774735)]
Customer C0004: [('C0087', 0.9706667188314319), ('C0155', 0.9565486538771887), ('C0169', 0.9527341948287605)]
Customer C0005: [('C0186', 0.9990949206275932), ('C0007', 0.9952048431090786), ('C0140', 0.9893348231608935)]


In [17]:
# Export the lookalikes of customers C0001 through C0020.
# Select by CustomerID rather than by row position: customer_features only
# holds customers that survived the inner merges with the transaction data,
# so its first 20 rows are not guaranteed to be exactly C0001..C0020.
target_ids = [f"C{number:04d}" for number in range(1, 21)]

# Report requested customers that have no lookalike entry instead of
# silently replacing or dropping them.
missing_ids = [customer_id for customer_id in target_ids if customer_id not in lookalike_dict]
if missing_ids:
    print(f"No lookalikes available for: {missing_ids}")

# One output row per (customer, lookalike) pair
lookalike_results = [
    {'cust_id': customer_id, 'lookalike_cust_id': similar_id, 'score': score}
    for customer_id in target_ids
    if customer_id in lookalike_dict
    for similar_id, score in lookalike_dict[customer_id]
]

# Convert results to a DataFrame; naming the columns explicitly keeps the CSV
# header stable even if no rows were produced.
lookalike_df = pd.DataFrame(lookalike_results, columns=['cust_id', 'lookalike_cust_id', 'score'])

# Save to CSV
lookalike_df.to_csv('FirstName_LastName_Lookalike.csv', index=False)

# Display the saved results
print(lookalike_df.head())


  cust_id lookalike_cust_id     score
0   C0001             C0048  0.995291
1   C0001             C0190  0.990619
2   C0001             C0184  0.984419
3   C0002             C0088  0.987872
4   C0002             C0092  0.969415
