In [18]:
import pandas as pd

# Load the datasets into pandas dataframes
customers = pd.read_csv('Customers.csv')  # Load the Customers.csv file
products = pd.read_csv('Products.csv')    # Load the Products.csv file
transactions = pd.read_csv('Transactions.csv')  # Load the Transactions.csv file

# Display the first few rows of each dataframe to check the data
print(customers.head())
print(products.head())
print(transactions.head())


  CustomerID        CustomerName         Region SignupDate
0      C0001    Lawrence Carroll  South America  7-10-2022
1      C0002      Elizabeth Lutz           Asia  2-13-2022
2      C0003      Michael Rivera  South America   3-7-2024
3      C0004  Kathleen Rodriguez  South America  10-9-2022
4      C0005         Laura Weber           Asia  8-15-2022
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T0027

In [19]:
import pandas as pd

# Load the datasets
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

# 1. Remove duplicates from the datasets
customers = customers.drop_duplicates()
products = products.drop_duplicates()
transactions = transactions.drop_duplicates()

# 2. Handle missing values
# - Remove rows with missing CustomerID or ProductID in transactions
transactions = transactions.dropna(subset=['CustomerID', 'ProductID'])

# - Remove rows where CustomerID is missing in customers
customers = customers.dropna(subset=['CustomerID'])

# 3. Convert date columns to datetime format
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'], errors='coerce')
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'], errors='coerce')

# 4. Optionally, reset index after cleaning
customers = customers.reset_index(drop=True)
products = products.reset_index(drop=True)
transactions = transactions.reset_index(drop=True)

# Display the cleaned data (first 5 rows) to verify
print("Cleaned Customers Data:")
print(customers.head())
print("\nCleaned Products Data:")
print(products.head())
print("\nCleaned Transactions Data:")
print(transactions.head())


Cleaned Customers Data:
  CustomerID        CustomerName         Region SignupDate
0      C0001    Lawrence Carroll  South America 2022-07-10
1      C0002      Elizabeth Lutz           Asia 2022-02-13
2      C0003      Michael Rivera  South America 2024-03-07
3      C0004  Kathleen Rodriguez  South America 2022-10-09
4      C0005         Laura Weber           Asia 2022-08-15

Cleaned Products Data:
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31

Cleaned Transactions Data:
  TransactionID CustomerID ProductID     TransactionDate  Quantity  \
0        T00001      C0199      P067 2024-08-25 12:38:23         1   
1        T00112      C0146      P067 2024-05-27 22:23:54         1   
2        T00

In [20]:
# Merge customer and transaction data on 'CustomerID' to associate customer details with each transaction
customer_transactions = pd.merge(transactions, customers, on='CustomerID', how='left')

# Merge the product data with the customer-transaction data on 'ProductID' to get product details
customer_transactions = pd.merge(customer_transactions, products, on='ProductID', how='left')

# Display the first few rows to check the merged data
print("Merged Data (Customer-Transaction-Product Info):")
print(customer_transactions.head())


Merged Data (Customer-Transaction-Product Info):
  TransactionID CustomerID ProductID     TransactionDate  Quantity  \
0        T00001      C0199      P067 2024-08-25 12:38:23         1   
1        T00112      C0146      P067 2024-05-27 22:23:54         1   
2        T00166      C0127      P067 2024-04-25 07:38:55         1   
3        T00272      C0087      P067 2024-03-26 22:55:37         2   
4        T00363      C0070      P067 2024-03-21 15:10:10         3   

   TotalValue  Price_x     CustomerName         Region SignupDate  \
0      300.68   300.68   Andrea Jenkins         Europe 2022-12-03   
1      300.68   300.68  Brittany Harvey           Asia 2024-09-04   
2      300.68   300.68  Kathryn Stevens         Europe 2024-04-04   
3      601.36   300.68  Travis Campbell  South America 2024-04-11   
4      902.04   300.68    Timothy Perez         Europe 2022-03-15   

                       ProductName     Category  Price_y  
0  ComfortLiving Bluetooth Speaker  Electronics   300.68

In [21]:
# Rename Price_y to Price
customer_transactions = customer_transactions.rename(columns={'Price_y': 'Price'})

# Drop Price_x if it's redundant
customer_transactions = customer_transactions.drop(columns=['Price_x'])

# Display the cleaned-up data
print("Cleaned Merged Data with Price column updated:")
print(customer_transactions.head())


Cleaned Merged Data with Price column updated:
  TransactionID CustomerID ProductID     TransactionDate  Quantity  \
0        T00001      C0199      P067 2024-08-25 12:38:23         1   
1        T00112      C0146      P067 2024-05-27 22:23:54         1   
2        T00166      C0127      P067 2024-04-25 07:38:55         1   
3        T00272      C0087      P067 2024-03-26 22:55:37         2   
4        T00363      C0070      P067 2024-03-21 15:10:10         3   

   TotalValue     CustomerName         Region SignupDate  \
0      300.68   Andrea Jenkins         Europe 2022-12-03   
1      300.68  Brittany Harvey           Asia 2024-09-04   
2      300.68  Kathryn Stevens         Europe 2024-04-04   
3      601.36  Travis Campbell  South America 2024-04-11   
4      902.04    Timothy Perez         Europe 2022-03-15   

                       ProductName     Category   Price  
0  ComfortLiving Bluetooth Speaker  Electronics  300.68  
1  ComfortLiving Bluetooth Speaker  Electronics  300.68

In [22]:
# Drop unnecessary columns
customer_transactions_cleaned = customer_transactions.drop(columns=['TransactionID', 'CustomerName', 'SignupDate', 'Region'])

# Display the cleaned data
print("Final Cleaned Data for Lookalike Model:")
print(customer_transactions_cleaned.head())


Final Cleaned Data for Lookalike Model:
  CustomerID ProductID     TransactionDate  Quantity  TotalValue  \
0      C0199      P067 2024-08-25 12:38:23         1      300.68   
1      C0146      P067 2024-05-27 22:23:54         1      300.68   
2      C0127      P067 2024-04-25 07:38:55         1      300.68   
3      C0087      P067 2024-03-26 22:55:37         2      601.36   
4      C0070      P067 2024-03-21 15:10:10         3      902.04   

                       ProductName     Category   Price  
0  ComfortLiving Bluetooth Speaker  Electronics  300.68  
1  ComfortLiving Bluetooth Speaker  Electronics  300.68  
2  ComfortLiving Bluetooth Speaker  Electronics  300.68  
3  ComfortLiving Bluetooth Speaker  Electronics  300.68  
4  ComfortLiving Bluetooth Speaker  Electronics  300.68  


In [24]:
from datetime import datetime

# Ensure that SignupDate is in datetime format
customer_transactions['SignupDate'] = pd.to_datetime(customer_transactions['SignupDate'])

# Step 1: Calculate YearsWithBusiness
current_date = datetime.now()
customer_transactions['YearsWithBusiness'] = (current_date - customer_transactions['SignupDate']).dt.days / 365

# Step 2: Total Spend - Sum of TotalValue for each customer
total_spend = customer_transactions.groupby('CustomerID')['TotalValue'].sum().reset_index()
total_spend = total_spend.rename(columns={'TotalValue': 'TotalSpend'})

# Step 3: Average Quantity Purchased - Average of Quantity for each customer
avg_quantity = customer_transactions.groupby('CustomerID')['Quantity'].mean().reset_index()
avg_quantity = avg_quantity.rename(columns={'Quantity': 'AvgQuantityPurchased'})

# Step 4: Most Frequent Category - Category that the customer purchases the most
category_freq = customer_transactions.groupby(['CustomerID', 'Category'])['Quantity'].sum().reset_index()
most_frequent_category = category_freq.loc[category_freq.groupby('CustomerID')['Quantity'].idxmax()]
most_frequent_category = most_frequent_category[['CustomerID', 'Category']]

# Step 5: Product Diversity - Count of unique products bought by each customer
product_diversity = customer_transactions.groupby('CustomerID')['ProductID'].nunique().reset_index()
product_diversity = product_diversity.rename(columns={'ProductID': 'ProductDiversity'})

# Step 6: Combine all features into a customer profile matrix
customer_profile = pd.merge(customer_transactions[['CustomerID', 'Region', 'YearsWithBusiness']], total_spend, on='CustomerID', how='left')
customer_profile = pd.merge(customer_profile, avg_quantity, on='CustomerID', how='left')
customer_profile = pd.merge(customer_profile, product_diversity, on='CustomerID', how='left')

# Add Most Frequent Category
customer_profile = pd.merge(customer_profile, most_frequent_category, on='CustomerID', how='left')

# Fill missing values with 0 (if no transactions)
customer_profile = customer_profile.fillna(0)

# Display the customer profile matrix
print(customer_profile.head())


  CustomerID         Region  YearsWithBusiness  TotalSpend  \
0      C0199         Europe           2.150685     1979.28   
1      C0146           Asia           0.394521     2570.80   
2      C0127         Europe           0.813699     3232.88   
3      C0087  South America           0.794521     6604.23   
4      C0070         Europe           2.871233     3125.49   

   AvgQuantityPurchased  ProductDiversity     Category  
0              2.250000                 4   Home Decor  
1              2.000000                 4        Books  
2              1.833333                 6  Electronics  
3              3.142857                 7  Electronics  
4              3.000000                 4        Books  


In [27]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Assuming your customer profile data is already loaded in the 'customer_profile' DataFrame

# Step 1: Normalize numerical features
scaler = StandardScaler()
numerical_features = ['YearsWithBusiness', 'TotalSpend', 'AvgQuantityPurchased', 'ProductDiversity']
customer_profile[numerical_features] = scaler.fit_transform(customer_profile[numerical_features])

# Step 2: Convert categorical features to numeric
customer_profile['Region'] = customer_profile['Region'].astype('category').cat.codes
customer_profile['MostFrequentCategory'] = customer_profile['MostFrequentCategory'].astype('category').cat.codes

# Step 3: Prepare the feature matrix for similarity calculation
features = ['Region', 'YearsWithBusiness', 'TotalSpend', 'AvgQuantityPurchased', 'ProductDiversity', 'MostFrequentCategory']
X = customer_profile[features]

# Step 4: Calculate cosine similarity between customers
cosine_sim = cosine_similarity(X)

# Step 5: Create a dictionary to store the top 3 lookalikes for each customer
lookalikes = {}

# For each customer, find the top 3 most similar customers
for idx, row in customer_profile.iterrows():
    customer_id = row['CustomerID']
    
    # Get similarity scores for the current customer
    similarity_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort by similarity score in descending order, excluding the customer itself
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    
    # Get top 3 lookalikes (excluding the customer itself)
    top_lookalikes = [(customer_profile.iloc[i[0]]['CustomerID'], i[1]) for i in similarity_scores[1:4]]
    
    # Store the results in the dictionary
    lookalikes[customer_id] = top_lookalikes

# Step 6: Format the results into a DataFrame for CSV export
lookalike_df = pd.DataFrame(
    [(cust_id, 
      lookalikes[cust_id][0][0], lookalikes[cust_id][0][1], 
      lookalikes[cust_id][1][0], lookalikes[cust_id][1][1], 
      lookalikes[cust_id][2][0], lookalikes[cust_id][2][1]) 
     for cust_id in lookalikes], 
    columns=['CustomerID', 'Lookalike1', 'Similarity1', 'Lookalike2', 'Similarity2', 'Lookalike3', 'Similarity3']
)

# Step 7: Export the results to a CSV file
lookalike_df.to_csv('Lookalike.csv', index=False)

# Display the first 20 customers' top lookalikes
print(lookalike_df.head(20))


KeyError: 'MostFrequentCategory'

In [31]:
# Step 6: Create a dictionary to store the top 3 lookalikes for each customer
lookalikes = {}

# Step 7: Calculate cosine similarities
from sklearn.metrics.pairwise import cosine_similarity

# Calculate cosine similarity between all customers
cosine_sim = cosine_similarity(customer_profile[features])  # Use only the numeric features

# Step 8: Iterate over each customer and find their top 3 lookalikes
for idx, row in customer_profile.iterrows():
    customer_id = row['CustomerID']
    
    # Get similarity scores for the current customer
    similarity_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort by similarity score in descending order (exclude self-similarity)
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    
    # Exclude the self-similarity (the first customer will have a similarity of 1)
    top_lookalikes = []
    count = 0
    for i in similarity_scores:
        if count == 3:
            break
        if customer_profile.iloc[i[0]]['CustomerID'] != customer_id:
            top_lookalikes.append((customer_profile.iloc[i[0]]['CustomerID'], i[1]))
            count += 1

    # Store the results in the dictionary
    lookalikes[customer_id] = top_lookalikes

# Step 9: Format the results into a DataFrame for CSV export
lookalike_df = pd.DataFrame(
    [(cust_id, 
      lookalikes[cust_id][0][0], lookalikes[cust_id][0][1], 
      lookalikes[cust_id][1][0], lookalikes[cust_id][1][1], 
      lookalikes[cust_id][2][0], lookalikes[cust_id][2][1]) 
     for cust_id in lookalikes], 
    columns=['CustomerID', 'Lookalike1', 'Similarity1', 'Lookalike2', 'Similarity2', 'Lookalike3', 'Similarity3']
)

# Step 10: Export the results to a CSV file
lookalike_df.to_csv('Lookalike.csv', index=False)

# Display the first 20 customers' top lookalikes
print(lookalike_df.head(20))


   CustomerID Lookalike1  Similarity1 Lookalike2  Similarity2 Lookalike3  \
0       C0199      C0060     0.967580      C0025     0.943137      C0025   
1       C0146      C0056     0.973158      C0056     0.973158      C0056   
2       C0127      C0172     0.986666      C0172     0.986666      C0172   
3       C0087      C0046     0.928190      C0046     0.928190      C0046   
4       C0070      C0074     0.927349      C0074     0.927349      C0074   
5       C0188      C0102     0.989530      C0102     0.989530      C0102   
6       C0195      C0012     0.987574      C0012     0.987574      C0012   
7       C0008      C0051     0.908467      C0051     0.908467      C0051   
8       C0157      C0076     0.968601      C0076     0.968601      C0076   
9       C0130      C0042     0.980591      C0042     0.980591      C0042   
10      C0051      C0059     0.965693      C0059     0.965693      C0059   
11      C0075      C0156     0.858256      C0156     0.858256      C0156   
12      C015

In [32]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Assuming you have already created the `customer_profile` DataFrame and the similarity matrix

# Calculate cosine similarity between all customers based on the numeric features
cosine_sim = cosine_similarity(customer_profile[features])

# Dictionary to store the top 3 lookalikes for each customer
lookalikes = {}

# Loop through the first 20 customers to identify top 3 lookalikes
for idx, row in customer_profile.head(20).iterrows():
    customer_id = row['CustomerID']
    
    # Get similarity scores for the current customer
    similarity_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort by similarity score in descending order (excluding self-similarity)
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    
    # Collect the top 3 lookalikes
    top_lookalikes = []
    count = 0
    for i in similarity_scores:
        if count == 3:
            break
        if customer_profile.iloc[i[0]]['CustomerID'] != customer_id:
            top_lookalikes.append((customer_profile.iloc[i[0]]['CustomerID'], i[1]))
            count += 1
    
    # Store the result in the dictionary
    lookalikes[customer_id] = top_lookalikes

# Create a DataFrame for the output
lookalike_df = pd.DataFrame(
    [(cust_id, lookalikes[cust_id]) for cust_id in lookalikes], 
    columns=['CustomerID', 'Lookalikes']
)

# Save the result to a CSV file
lookalike_df.to_csv('Lookalike.csv', index=False)

# Display the result (top 3 lookalikes for first 20 customers)
print(lookalike_df.head(20))


   CustomerID                                         Lookalikes
0       C0199  [(C0060, 0.9675802645771927), (C0025, 0.943137...
1       C0146  [(C0056, 0.9731582692752938), (C0056, 0.973158...
2       C0127  [(C0172, 0.9866659499226321), (C0172, 0.986665...
3       C0087  [(C0046, 0.9281896971144319), (C0046, 0.928189...
4       C0070  [(C0074, 0.9273494996680743), (C0074, 0.927349...
5       C0188  [(C0102, 0.9895303482968874), (C0102, 0.989530...
6       C0195  [(C0012, 0.9875740124980804), (C0012, 0.987574...
7       C0008  [(C0051, 0.9084667235002782), (C0051, 0.908466...
8       C0157  [(C0076, 0.9686007823213314), (C0076, 0.968600...
9       C0130  [(C0042, 0.9805906575928739), (C0042, 0.980590...
10      C0051  [(C0059, 0.9656934197496126), (C0059, 0.965693...
11      C0075  [(C0156, 0.858256029555297), (C0156, 0.8582560...
12      C0155  [(C0004, 0.9453961792622294), (C0004, 0.945396...
13      C0092  [(C0115, 0.9467158660692199), (C0115, 0.946715...
14      C0088  [(C0128, 0

In [33]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Assuming customer_profile DataFrame already has the processed customer features

# Step 1: Calculate cosine similarity matrix between customers
features = ['Region', 'YearsWithBusiness', 'TotalSpend', 'AvgQuantityPurchased', 'ProductDiversity', 'MostFrequentCategory']
cosine_sim = cosine_similarity(customer_profile[features])

# Step 2: Prepare a dictionary to store the lookalikes for the first 20 customers
lookalikes = {}

# Step 3: Loop through the first 20 customers to get the top 3 lookalikes
for idx, row in customer_profile.head(20).iterrows():
    customer_id = row['CustomerID']
    
    # Get similarity scores for the current customer
    similarity_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort by similarity score in descending order (ignoring self-similarity)
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    
    # Collect the top 3 lookalikes with their similarity scores
    top_lookalikes = []
    count = 0
    for i in similarity_scores:
        if count == 3:
            break
        if customer_profile.iloc[i[0]]['CustomerID'] != customer_id:
            top_lookalikes.append((customer_profile.iloc[i[0]]['CustomerID'], i[1]))
            count += 1
    
    # Store the result in the dictionary
    lookalikes[customer_id] = top_lookalikes

# Step 4: Convert the lookalikes dictionary to a DataFrame
lookalike_df = pd.DataFrame(
    [(cust_id, lookalikes[cust_id]) for cust_id in lookalikes], 
    columns=['CustomerID', 'Lookalikes']
)

# Step 5: Save the result to a CSV file
lookalike_df.to_csv('Lookalike.csv', index=False)

# Display the first few entries (top 3 lookalikes for the first 20 customers)
print(lookalike_df.head(20))


   CustomerID                                         Lookalikes
0       C0199  [(C0060, 0.9675802645771927), (C0025, 0.943137...
1       C0146  [(C0056, 0.9731582692752938), (C0056, 0.973158...
2       C0127  [(C0172, 0.9866659499226321), (C0172, 0.986665...
3       C0087  [(C0046, 0.9281896971144319), (C0046, 0.928189...
4       C0070  [(C0074, 0.9273494996680743), (C0074, 0.927349...
5       C0188  [(C0102, 0.9895303482968874), (C0102, 0.989530...
6       C0195  [(C0012, 0.9875740124980804), (C0012, 0.987574...
7       C0008  [(C0051, 0.9084667235002782), (C0051, 0.908466...
8       C0157  [(C0076, 0.9686007823213314), (C0076, 0.968600...
9       C0130  [(C0042, 0.9805906575928739), (C0042, 0.980590...
10      C0051  [(C0059, 0.9656934197496126), (C0059, 0.965693...
11      C0075  [(C0156, 0.858256029555297), (C0156, 0.8582560...
12      C0155  [(C0004, 0.9453961792622294), (C0004, 0.945396...
13      C0092  [(C0115, 0.9467158660692199), (C0115, 0.946715...
14      C0088  [(C0128, 0

In [36]:
import pandas as pd

# Set pandas display options to show full precision
pd.set_option('display.float_format', '{:.16f}'.format)

# After setting this, the similarity scores will display fully without truncation.
print(lookalike_df)


   CustomerID                                         Lookalikes
0       C0199  [(C0060, 0.9675802645771927), (C0025, 0.943137...
1       C0146  [(C0056, 0.9731582692752938), (C0056, 0.973158...
2       C0127  [(C0172, 0.9866659499226321), (C0172, 0.986665...
3       C0087  [(C0046, 0.9281896971144319), (C0046, 0.928189...
4       C0070  [(C0074, 0.9273494996680743), (C0074, 0.927349...
5       C0188  [(C0102, 0.9895303482968874), (C0102, 0.989530...
6       C0195  [(C0012, 0.9875740124980804), (C0012, 0.987574...
7       C0008  [(C0051, 0.9084667235002782), (C0051, 0.908466...
8       C0157  [(C0076, 0.9686007823213314), (C0076, 0.968600...
9       C0130  [(C0042, 0.9805906575928739), (C0042, 0.980590...
10      C0051  [(C0059, 0.9656934197496126), (C0059, 0.965693...
11      C0075  [(C0156, 0.858256029555297), (C0156, 0.8582560...
12      C0155  [(C0004, 0.9453961792622294), (C0004, 0.945396...
13      C0092  [(C0115, 0.9467158660692199), (C0115, 0.946715...
14      C0088  [(C0128, 0

In [38]:
import pandas as pd

pd.set_option('display.max_colwidth', None)  # To ensure full content in each column is shown


print(lookalike_df)


   CustomerID  \
0       C0199   
1       C0146   
2       C0127   
3       C0087   
4       C0070   
5       C0188   
6       C0195   
7       C0008   
8       C0157   
9       C0130   
10      C0051   
11      C0075   
12      C0155   
13      C0092   
14      C0088   
15      C0109   
16      C0041   
17      C0101   
18      C0154   
19      C0200   

                                                                                 Lookalikes  
0     [(C0060, 0.9675802645771927), (C0025, 0.943137156872213), (C0025, 0.943137156872213)]  
1   [(C0056, 0.9731582692752938), (C0056, 0.9731582692752938), (C0056, 0.9731582692752938)]  
2   [(C0172, 0.9866659499226321), (C0172, 0.9866659499226321), (C0172, 0.9866659499226321)]  
3   [(C0046, 0.9281896971144319), (C0046, 0.9281896971144319), (C0046, 0.9281896971144319)]  
4   [(C0074, 0.9273494996680743), (C0074, 0.9273494996680743), (C0074, 0.9273494996680743)]  
5   [(C0102, 0.9895303482968874), (C0102, 0.9895303482968874), (C0102, 0.98953

In [43]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Assuming customer_profile DataFrame already has the processed customer features

# Step 1: Calculate cosine similarity matrix between customers
features = ['Region', 'YearsWithBusiness', 'TotalSpend', 'AvgQuantityPurchased', 'ProductDiversity', 'MostFrequentCategory']
cosine_sim = cosine_similarity(customer_profile[features])

# Step 2: Prepare a dictionary to store the lookalikes for the first 20 customers
lookalikes = {}

# Step 3: Loop through the first 20 customers to get the top 3 lookalikes
for idx, row in customer_profile.head(20).iterrows():
    customer_id = row['CustomerID']
    
    # Get similarity scores for the current customer
    similarity_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort by similarity score in descending order (ignoring self-similarity)
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    
    # Collect the top 3 lookalikes with their similarity scores
    top_lookalikes = []
    count = 0
    for i in similarity_scores:
        if count == 3:
            break
        if customer_profile.iloc[i[0]]['CustomerID'] != customer_id:
            # Convert the similarity score to a float and round it
            score = round(i[1], 4)  # Rounding to 4 decimal places
            top_lookalikes.append((customer_profile.iloc[i[0]]['CustomerID'], score))
            count += 1
    
    # Store the result in the dictionary
    lookalikes[customer_id] = top_lookalikes

# Step 4: Convert the lookalikes dictionary to a DataFrame with a Map structure
lookalike_df = pd.DataFrame(
    [(cust_id, str(lookalikes[cust_id])) for cust_id in lookalikes], 
    columns=['CustomerID', 'Lookalikes']
)

# Step 5: Save the result to a CSV file
lookalike_df.to_csv('Lookalike.csv', index=False)

# Display the first few entries (top 3 lookalikes for the first 20 customers)
print(lookalike_df.head(20))


   CustomerID  \
0       C0199   
1       C0146   
2       C0127   
3       C0087   
4       C0070   
5       C0188   
6       C0195   
7       C0008   
8       C0157   
9       C0130   
10      C0051   
11      C0075   
12      C0155   
13      C0092   
14      C0088   
15      C0109   
16      C0041   
17      C0101   
18      C0154   
19      C0200   

                                                                                       Lookalikes  
0   [('C0060', np.float64(0.9676)), ('C0025', np.float64(0.9431)), ('C0025', np.float64(0.9431))]  
1   [('C0056', np.float64(0.9732)), ('C0056', np.float64(0.9732)), ('C0056', np.float64(0.9732))]  
2   [('C0172', np.float64(0.9867)), ('C0172', np.float64(0.9867)), ('C0172', np.float64(0.9867))]  
3   [('C0046', np.float64(0.9282)), ('C0046', np.float64(0.9282)), ('C0046', np.float64(0.9282))]  
4   [('C0074', np.float64(0.9273)), ('C0074', np.float64(0.9273)), ('C0074', np.float64(0.9273))]  
5   [('C0102', np.float64(0.9895)), ('C010

In [44]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Assuming customer_profile DataFrame already has the processed customer features

# Step 1: Calculate cosine similarity matrix between customers
features = ['Region', 'YearsWithBusiness', 'TotalSpend', 'AvgQuantityPurchased', 'ProductDiversity', 'MostFrequentCategory']
cosine_sim = cosine_similarity(customer_profile[features])

# Step 2: Prepare a dictionary to store the lookalikes for the first 20 customers
lookalikes = {}

# Step 3: Loop through the first 20 customers to get the top 3 lookalikes
for idx, row in customer_profile.head(20).iterrows():
    customer_id = row['CustomerID']
    
    # Get similarity scores for the current customer
    similarity_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort by similarity score in descending order (ignoring self-similarity)
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    
    # Collect the top 3 lookalikes with their similarity scores
    top_lookalikes = []
    count = 0
    for i in similarity_scores:
        if count == 3:
            break
        if customer_profile.iloc[i[0]]['CustomerID'] != customer_id:
            # Convert np.float64 to regular float and round it
            score = round(float(i[1]), 4)  # Convert to float and round to 4 decimal places
            top_lookalikes.append((customer_profile.iloc[i[0]]['CustomerID'], score))
            count += 1
    
    # Store the result in the dictionary
    lookalikes[customer_id] = top_lookalikes

# Step 4: Convert the lookalikes dictionary to a DataFrame with a Map structure
lookalike_df = pd.DataFrame(
    [(cust_id, str(lookalikes[cust_id])) for cust_id in lookalikes], 
    columns=['CustomerID', 'Lookalikes']
)

# Step 5: Save the result to a CSV file
lookalike_df.to_csv('Lookalike.csv', index=False)

# Display the first few entries (top 3 lookalikes for the first 20 customers)
print(lookalike_df.head(20))


   CustomerID                                                 Lookalikes
0       C0199  [('C0060', 0.9676), ('C0025', 0.9431), ('C0025', 0.9431)]
1       C0146  [('C0056', 0.9732), ('C0056', 0.9732), ('C0056', 0.9732)]
2       C0127  [('C0172', 0.9867), ('C0172', 0.9867), ('C0172', 0.9867)]
3       C0087  [('C0046', 0.9282), ('C0046', 0.9282), ('C0046', 0.9282)]
4       C0070  [('C0074', 0.9273), ('C0074', 0.9273), ('C0074', 0.9273)]
5       C0188  [('C0102', 0.9895), ('C0102', 0.9895), ('C0102', 0.9895)]
6       C0195  [('C0012', 0.9876), ('C0012', 0.9876), ('C0012', 0.9876)]
7       C0008  [('C0051', 0.9085), ('C0051', 0.9085), ('C0051', 0.9085)]
8       C0157  [('C0076', 0.9686), ('C0076', 0.9686), ('C0076', 0.9686)]
9       C0130  [('C0042', 0.9806), ('C0042', 0.9806), ('C0042', 0.9806)]
10      C0051  [('C0059', 0.9657), ('C0059', 0.9657), ('C0059', 0.9657)]
11      C0075  [('C0156', 0.8583), ('C0156', 0.8583), ('C0156', 0.8583)]
12      C0155  [('C0004', 0.9454), ('C0004', 0.9454