In [15]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler


In [16]:
# Read the customer master file and the transaction log from the working directory
customers, transactions = (
    pd.read_csv(csv_path) for csv_path in ('Customers.csv', 'Transactions.csv')
)

# Preview the first rows of each frame; the trailing tuple is what the cell displays
(customers.head(), transactions.head())

(  CustomerID        CustomerName         Region  SignupDate
 0      C0001    Lawrence Carroll  South America  2022-07-10
 1      C0002      Elizabeth Lutz           Asia  2022-02-13
 2      C0003      Michael Rivera  South America  2024-03-07
 3      C0004  Kathleen Rodriguez  South America  2022-10-09
 4      C0005         Laura Weber           Asia  2022-08-15,
   TransactionID CustomerID ProductID      TransactionDate  Quantity  \
 0        T00001      C0199      P067  2024-08-25 12:38:23         1   
 1        T00112      C0146      P067  2024-05-27 22:23:54         1   
 2        T00166      C0127      P067  2024-04-25 07:38:55         1   
 3        T00272      C0087      P067  2024-03-26 22:55:37         2   
 4        T00363      C0070      P067  2024-03-21 15:10:10         3   
 
    TotalValue   Price  
 0      300.68  300.68  
 1      300.68  300.68  
 2      300.68  300.68  
 3      601.36  300.68  
 4      902.04  300.68  )

In [17]:
# Merge the customer and transaction data on 'CustomerID'.
# An inner join keeps only the CustomerIDs that appear in both frames.
data = pd.merge(customers, transactions, on='CustomerID', how='inner')

# Count missing values per column and keep the result; as a bare mid-cell
# expression it would be computed and then silently discarded.
missing_counts = data.isnull().sum()

# Fill gaps in numeric columns only. A blanket fillna(0) would also write the
# integer 0 into text/date columns (names, regions, dates), corrupting them.
numeric_cols = data.select_dtypes(include='number').columns
data[numeric_cols] = data[numeric_cols].fillna(0)

# Display the pre-fill missing-value counts as the cell output
missing_counts


In [18]:
# Per-row spend: units bought multiplied by the unit price
data['Total Spend'] = data['Quantity'].mul(data['Price'])

# Collapse the merged rows into one row per customer:
#   Total_Spend               - sum of the per-row spend
#   Average_Spend_Per_Product - mean of the per-row spend
#   Frequent_Products         - number of distinct ProductIDs bought
rows_by_customer = data.groupby('CustomerID')
customer_profile = rows_by_customer.agg(
    Total_Spend=pd.NamedAgg(column='Total Spend', aggfunc='sum'),
    Average_Spend_Per_Product=pd.NamedAgg(column='Total Spend', aggfunc='mean'),
    Frequent_Products=pd.NamedAgg(column='ProductID', aggfunc='nunique'),
).reset_index()

# Preview the per-customer profile
customer_profile.head()


Unnamed: 0,CustomerID,Total_Spend,Average_Spend_Per_Product,Frequent_Products
0,C0001,3354.52,670.904,5
1,C0002,1862.74,465.685,4
2,C0003,2725.38,681.345,4
3,C0004,5354.88,669.36,8
4,C0005,2034.24,678.08,3


In [19]:
# Columns that feed the similarity computation
feature_cols = ['Total_Spend', 'Average_Spend_Per_Product', 'Frequent_Products']

# Rescale each feature with StandardScaler so no single column dominates by magnitude;
# the scaled values overwrite the raw ones in customer_profile.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(customer_profile[feature_cols])
customer_profile[feature_cols] = scaled_values

# Preview the standardized profile
customer_profile.head()


Unnamed: 0,CustomerID,Total_Spend,Average_Spend_Per_Product,Frequent_Products
0,C0001,-0.061701,-0.070263,0.050047
1,C0002,-0.877744,-0.934933,-0.424204
2,C0003,-0.405857,-0.026271,-0.424204
3,C0004,1.032547,-0.076769,1.472798
4,C0005,-0.783929,-0.040028,-0.898455


In [20]:
# Pairwise cosine similarity between every pair of customer feature vectors
feature_matrix = customer_profile[['Total_Spend', 'Average_Spend_Per_Product', 'Frequent_Products']]
cosine_sim = cosine_similarity(feature_matrix)

# Label both axes with CustomerID so scores can be looked up by customer
customer_ids = customer_profile['CustomerID']
cosine_sim_df = pd.DataFrame(cosine_sim, index=customer_ids, columns=customer_ids)

# Preview the similarity matrix for the first few customers
cosine_sim_df.head()


CustomerID,C0001,C0002,C0003,C0004,C0005,C0006,C0007,C0008,C0009,C0010,...,C0191,C0192,C0193,C0194,C0195,C0196,C0197,C0198,C0199,C0200
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0001,1.0,0.688403,0.090776,0.080615,0.049136,-0.884356,-0.496867,0.598817,0.632631,0.713574,...,0.922559,0.635443,-0.41752,0.615274,0.175943,-0.929736,0.152934,0.359567,0.662058,-0.852558
C0002,0.688403,1.0,0.706421,-0.600101,0.686717,-0.728913,0.075793,-0.106457,0.996331,0.998924,...,0.898634,0.99596,0.190403,-0.054029,-0.527867,-0.710944,0.770384,0.914697,0.998948,-0.943863
C0003,0.090776,0.706421,1.0,-0.9847,0.998888,-0.030361,0.759189,-0.743217,0.724012,0.672842,...,0.347706,0.767123,0.829348,-0.724291,-0.963574,-0.010998,0.994218,0.920075,0.738129,-0.434918
C0004,0.080615,-0.600101,-0.9847,1.0,-0.991495,-0.104984,-0.833446,0.846146,-0.629389,-0.562719,...,-0.199467,-0.668821,-0.892057,0.826866,0.995368,-0.135747,-0.971306,-0.868365,-0.635796,0.302179
C0005,0.049136,0.686717,0.998888,-0.991495,1.0,-0.003612,0.775056,-0.770353,0.708032,0.652321,...,0.316169,0.749124,0.843359,-0.750072,-0.974404,0.019977,0.992401,0.913578,0.719276,-0.408734


In [21]:
# Function to get the top-N most similar customers for a given customer
def get_top_lookalikes(customer_id, top_n=3, similarity=None):
    """Return the `top_n` customers most similar to `customer_id`.

    Parameters
    ----------
    customer_id : label
        Column label of the customer to look up in the similarity matrix.
    top_n : int, default 3
        Maximum number of lookalikes to return. Must be non-negative.
    similarity : pandas.DataFrame, optional
        Similarity matrix labelled by customer ID on both axes. When omitted,
        the notebook-level `cosine_sim_df` is used, which keeps existing
        calls working unchanged.

    Returns
    -------
    pandas.Series
        Similarity scores indexed by the other customers' IDs, highest first,
        excluding `customer_id` itself.

    Raises
    ------
    KeyError
        If `customer_id` is not a column of the similarity matrix.
    ValueError
        If `top_n` is negative.
    """
    if similarity is None:
        # Fall back to the matrix built earlier in the notebook
        similarity = cosine_sim_df

    if top_n < 0:
        # head(-n) would silently return "all but the last n" rows
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    if customer_id not in similarity.columns:
        raise KeyError(f"Unknown customer_id: {customer_id!r}")

    # Exclude the customer themselves (their self-similarity is always the maximum)
    scores = similarity[customer_id].drop(labels=customer_id, errors='ignore')

    # A stable sort makes the order of tied scores deterministic across runs
    ranked = scores.sort_values(ascending=False, kind='stable')

    return ranked.head(top_n)

# Example: the three closest matches for customer C0001
get_top_lookalikes('C0001', top_n=3)


CustomerID
C0137    0.996315
C0152    0.981365
C0172    0.962110
Name: C0001, dtype: float64

In [22]:
# Generate top 3 lookalikes for customers C0001 to C0020.
# The targets are selected by ID rather than by taking the first 20 rows:
# a positional slice only matches C0001-C0020 if every one of those customers
# is present in customer_profile and the rows happen to be in ID order.
target_ids = [f'C{number:04d}' for number in range(1, 21)]
profiled_ids = set(customer_profile['CustomerID'])

# Targets absent from customer_profile are skipped, since they have no
# feature vector to compare.
lookalike_map = {
    target_id: get_top_lookalikes(target_id)
    for target_id in target_ids
    if target_id in profiled_ids
}

# Display the lookalike map
lookalike_map


{'C0001': CustomerID
 C0137    0.996315
 C0152    0.981365
 C0172    0.962110
 Name: C0001, dtype: float64,
 'C0002': CustomerID
 C0029    0.999666
 C0199    0.998948
 C0010    0.998924
 Name: C0002, dtype: float64,
 'C0003': CustomerID
 C0178    0.999525
 C0005    0.998888
 C0144    0.997969
 Name: C0003, dtype: float64,
 'C0004': CustomerID
 C0021    0.999686
 C0075    0.999451
 C0067    0.999392
 Name: C0004, dtype: float64,
 'C0005': CustomerID
 C0073    0.999479
 C0063    0.999043
 C0159    0.998895
 Name: C0005, dtype: float64,
 'C0006': CustomerID
 C0079    0.999985
 C0117    0.998543
 C0196    0.993353
 Name: C0006, dtype: float64,
 'C0007': CustomerID
 C0085    0.999795
 C0140    0.998049
 C0070    0.994156
 Name: C0007, dtype: float64,
 'C0008': CustomerID
 C0194    0.995606
 C0154    0.995072
 C0179    0.990622
 Name: C0008, dtype: float64,
 'C0009': CustomerID
 C0077    0.999835
 C0032    0.998028
 C0083    0.997536
 Name: C0009, dtype: float64,
 'C0010': CustomerID
 C0029 

In [23]:
# Flatten lookalike_map into one [customer, lookalike, score] record per match
lookalike_list = [
    [source_id, match_id, match_score]
    for source_id, matches in lookalike_map.items()
    for match_id, match_score in matches.items()
]

# Tabulate the records under the column names used in the output file
output_columns = ['CustomerID', 'Recommended_CustomerID', 'Similarity_Score']
lookalike_df = pd.DataFrame(lookalike_list, columns=output_columns)

# Write "Lookalike.csv" without the positional index column
lookalike_df.to_csv('Lookalike.csv', index=False)

# Preview the rows that were written
lookalike_df.head()


Unnamed: 0,CustomerID,Recommended_CustomerID,Similarity_Score
0,C0001,C0137,0.996315
1,C0001,C0152,0.981365
2,C0001,C0172,0.96211
3,C0002,C0029,0.999666
4,C0002,C0199,0.998948
