In [1]:
# Step 1: Import Required Libraries

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [20]:
import os

# Sanity check before loading: print, one line per file, whether each expected
# CSV is present at its location (True means the file was found).
expected_csv_paths = (
    r"C:\Users\USER\Downloads\Customers.csv",
    r"C:\Users\USER\Downloads\Products.csv",
    r"C:\Users\USER\Downloads\Transactions.csv",
)
for csv_path in expected_csv_paths:
    print(os.path.exists(csv_path))


True
True
True


In [26]:
from pathlib import Path

import pandas as pd

# Step 2: Load the datasets.
# The data folder is named once here, so the notebook can be pointed at a
# different location by editing a single line instead of three path literals.
DATA_DIR = Path(r"C:\Users\USER\Downloads")

customers = pd.read_csv(DATA_DIR / "Customers.csv")
products = pd.read_csv(DATA_DIR / "Products.csv")
transactions = pd.read_csv(DATA_DIR / "Transactions.csv")

# Step 3: Data Preprocessing

# Print the first few rows of each dataset to verify that it loaded as expected.
print(customers.head())
print(products.head())
print(transactions.head())


  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3       

In [28]:
# Drop rows with missing values (if any).
# Each name is rebound to the filtered frame instead of mutating it in place,
# so the objects displayed by earlier cells are not altered behind the reader.
customers = customers.dropna()
products = products.dropna()
transactions = transactions.dropna()


In [30]:
# Attach each transaction's customer attributes with a left join on CustomerID,
# keeping the transactions table as the left-hand side.
merged_data = transactions.merge(customers, how="left", on="CustomerID")

# Preview the joined table.
print(merged_data.head())


  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue   Price     CustomerName         Region  SignupDate  
0      300.68  300.68   Andrea Jenkins         Europe  2022-12-03  
1      300.68  300.68  Brittany Harvey           Asia  2024-09-04  
2      300.68  300.68  Kathryn Stevens         Europe  2024-04-04  
3      601.36  300.68  Travis Campbell  South America  2024-04-11  
4      902.04  300.68    Timothy Perez         Europe  2022-03-15  


In [32]:
# Step 4: Feature Engineering

# Build the customer x product matrix: one row per CustomerID, one column per
# ProductID, each cell holding the summed Quantity (0 where there are no rows).
transaction_matrix = merged_data.pivot_table(
    index="CustomerID",
    columns="ProductID",
    values="Quantity",
    aggfunc="sum",
    fill_value=0,
)

# Show the first few rows of the matrix.
print(transaction_matrix.head())


ProductID   P001  P002  P003  P004  P005  P006  P007  P008  P009  P010  ...  \
CustomerID                                                              ...   
C0001          0     0     0     0     0     0     0     0     0     0  ...   
C0002          0     0     0     4     0     0     0     0     0     0  ...   
C0003          0     4     0     0     0     3     0     0     0     0  ...   
C0004          0     0     0     0     0     0     0     2     0     0  ...   
C0005          0     0     0     0     0     0     0     0     0     0  ...   

ProductID   P091  P092  P093  P094  P095  P096  P097  P098  P099  P100  
CustomerID                                                              
C0001          0     0     0     0     0     2     0     0     0     0  
C0002          0     0     0     0     2     0     0     0     0     0  
C0003          0     0     0     0     0     0     0     0     0     0  
C0004          0     0     0     0     0     0     3     0     0     0  
C0005   

In [34]:
# Step 5: Model Development: Compute Similarity

from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the customer rows of the matrix.
cosine_sim = cosine_similarity(transaction_matrix)

# Wrap the raw array in a DataFrame whose rows and columns are both labelled
# with the customer IDs taken from the transaction matrix index.
cosine_sim_df = pd.DataFrame(
    data=cosine_sim,
    index=transaction_matrix.index,
    columns=transaction_matrix.index,
)

# Preview the similarity matrix.
print(cosine_sim_df.head())


CustomerID  C0001  C0002     C0003     C0004     C0005  C0006     C0007  \
CustomerID                                                                
C0001         1.0    0.0  0.000000  0.000000  0.000000    0.0  0.214834   
C0002         0.0    1.0  0.000000  0.000000  0.000000    0.0  0.000000   
C0003         0.0    0.0  1.000000  0.097980  0.308697    0.0  0.000000   
C0004         0.0    0.0  0.097980  1.000000  0.168034    0.0  0.000000   
C0005         0.0    0.0  0.308697  0.168034  1.000000    0.0  0.000000   

CustomerID     C0008  C0009     C0010  ...     C0191     C0192  C0193  \
CustomerID                             ...                              
C0001       0.000000    0.0  0.000000  ...  0.061721  0.000000    0.0   
C0002       0.262071    0.0  0.000000  ...  0.000000  0.000000    0.0   
C0003       0.313786    0.0  0.000000  ...  0.000000  0.000000    0.0   
C0004       0.048038    0.0  0.149854  ...  0.000000  0.000000    0.0   
C0005       0.000000    0.0  0.00000

In [43]:
# Inspect the row labels (customer IDs) of the similarity matrix.
similarity_index = cosine_sim_df.index
print(similarity_index)


Index(['C0001', 'C0002', 'C0003', 'C0004', 'C0005', 'C0006', 'C0007', 'C0008',
       'C0009', 'C0010',
       ...
       'C0191', 'C0192', 'C0193', 'C0194', 'C0195', 'C0196', 'C0197', 'C0198',
       'C0199', 'C0200'],
      dtype='object', name='CustomerID', length=199)


In [67]:
# Print every customer ID in the similarity matrix index as a plain Python list.
print(list(cosine_sim_df.index))

['C0001', 'C0002', 'C0003', 'C0004', 'C0005', 'C0006', 'C0007', 'C0008', 'C0009', 'C0010', 'C0011', 'C0012', 'C0013', 'C0014', 'C0015', 'C0016', 'C0017', 'C0018', 'C0019', 'C0020', 'C0021', 'C0022', 'C0023', 'C0024', 'C0025', 'C0026', 'C0027', 'C0028', 'C0029', 'C0030', 'C0031', 'C0032', 'C0033', 'C0034', 'C0035', 'C0036', 'C0037', 'C0038', 'C0039', 'C0040', 'C0041', 'C0042', 'C0043', 'C0044', 'C0045', 'C0046', 'C0047', 'C0048', 'C0049', 'C0050', 'C0051', 'C0052', 'C0053', 'C0054', 'C0055', 'C0056', 'C0057', 'C0058', 'C0059', 'C0060', 'C0061', 'C0062', 'C0063', 'C0064', 'C0065', 'C0066', 'C0067', 'C0068', 'C0069', 'C0070', 'C0071', 'C0072', 'C0073', 'C0074', 'C0075', 'C0076', 'C0077', 'C0078', 'C0079', 'C0080', 'C0081', 'C0082', 'C0083', 'C0084', 'C0085', 'C0086', 'C0087', 'C0088', 'C0089', 'C0090', 'C0091', 'C0092', 'C0093', 'C0094', 'C0095', 'C0096', 'C0097', 'C0098', 'C0099', 'C0100', 'C0101', 'C0102', 'C0103', 'C0104', 'C0105', 'C0106', 'C0107', 'C0108', 'C0109', 'C0110', 'C0111', 

In [75]:
# Top-3 lookalikes for one example customer.
# The scores are read straight from cosine_sim_df because the
# get_top_lookalikes helper is only defined in a later cell; calling it here
# raises NameError when the notebook is run top to bottom on a fresh kernel.
customer_id = 'C0155'  # Replace with a valid ID from the list
top_lookalikes = (
    cosine_sim_df.loc[customer_id]
    .drop(customer_id)  # never report the customer as their own lookalike
    .sort_values(ascending=False)
    .head(3)
)
print(top_lookalikes)


CustomerID
C0042    0.436436
C0162    0.400892
C0130    0.377964
Name: C0155, dtype: float64


In [81]:
from sklearn.metrics.pairwise import cosine_similarity

# Recompute the customer-to-customer cosine similarities and wrap them in a
# DataFrame labelled on both axes by the transaction matrix index.
cosine_sim = cosine_similarity(transaction_matrix)
cosine_sim_df = pd.DataFrame(
    data=cosine_sim,
    index=transaction_matrix.index,
    columns=transaction_matrix.index,
)

def get_top_lookalikes(customer_id, top_n=3, similarity_df=None):
    """Return the customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id : label
        Row label of the customer to look up in the similarity matrix.
    top_n : int, default 3
        Number of lookalikes to return.
    similarity_df : pandas.DataFrame, optional
        Similarity matrix labelled by customer ID on both axes. When omitted,
        the notebook-level ``cosine_sim_df`` is used.

    Returns
    -------
    pandas.Series
        Similarity scores indexed by customer ID, highest first, with
        ``customer_id`` itself removed. Tied scores keep the order in which
        they appear in the matrix.

    Raises
    ------
    KeyError
        If ``customer_id`` is not a row label of the similarity matrix.
    """
    if similarity_df is None:
        # Fall back to the matrix computed at notebook level.
        similarity_df = cosine_sim_df
    if customer_id not in similarity_df.index:
        raise KeyError(
            f"Customer {customer_id!r} is not present in the similarity matrix"
        )
    sim_scores = similarity_df.loc[customer_id]
    # A stable sort makes the ranking of tied scores reproducible between runs.
    similar_customers = sim_scores.drop(customer_id).sort_values(
        ascending=False, kind="mergesort"
    )
    return similar_customers.head(top_n)
# Worked example: look up and display the closest matches for one customer.
customer_id = 'C0155'  # Example customer ID
top_lookalikes = get_top_lookalikes(customer_id=customer_id)
print(top_lookalikes)

CustomerID
C0042    0.436436
C0162    0.400892
C0130    0.377964
Name: C0155, dtype: float64


In [93]:
 # step 6: Generate Lookalike Recommendations

lookalikes = {}

# Target customers C0001 .. C0020, generated rather than typed out by hand.
target_customers = [f"C{number:04d}" for number in range(1, 21)]

# For each target customer, find the top 3 lookalikes
for cust_id in target_customers:
    if cust_id not in cosine_sim_df.columns:
        # A customer absent from the similarity matrix has no scores to rank;
        # record an empty list instead of failing with KeyError.
        lookalikes[cust_id] = []
        continue
    # Remove the customer's own entry by label before ranking. Slicing
    # positions 1..3 of the sorted scores is only correct when the customer
    # sorts first, which is not guaranteed if another customer also scores 1.0.
    similar_scores = (
        cosine_sim_df[cust_id]
        .drop(cust_id)
        .sort_values(ascending=False)
        .head(3)
    )
    # Cast to built-in float so the text written to the CSV does not depend on
    # how NumPy renders its scalar types.
    lookalikes[cust_id] = [
        (other_id, float(score)) for other_id, score in similar_scores.items()
    ]

# Convert lookalikes to a DataFrame
lookalike_df = pd.DataFrame(list(lookalikes.items()), columns=['cust_id', 'lookalikes'])

# Save the results to Lookalike.csv
lookalike_df.to_csv('Lookalike.csv', index=False)


In [97]:
# Step 7: Output Format of Lookalike.csv:

# Hand-written illustration of the mapping's shape:
#     customer ID -> list of (lookalike customer ID, similarity score) pairs.
# NOTE(review): this assignment rebinds `lookalikes`, replacing whatever the
# name held before with the literal values below.
lookalikes = {
    'C0001': [
        ('C0005', 0.95),
        ('C0002', 0.92),
        ('C0003', 0.91),
    ],
    'C0002': [
        ('C0001', 0.92),
        ('C0004', 0.89),
        ('C0003', 0.88),
    ],
    'C0003': [
        ('C0001', 0.91),
        ('C0005', 0.87),
        ('C0004', 0.85),
    ],
}

# Display the mapping to check its structure.
print(lookalikes)


{'C0001': [('C0005', 0.95), ('C0002', 0.92), ('C0003', 0.91)], 'C0002': [('C0001', 0.92), ('C0004', 0.89), ('C0003', 0.88)], 'C0003': [('C0001', 0.91), ('C0005', 0.87), ('C0004', 0.85)]}
