In [37]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [38]:
# Check for missing values and handle them
print("Missing values before handling:")
print(features.isna().sum()) 

Missing values before handling:
CustomerID           0
age                  0
is_male              0
total_spent          0
transaction_count    0
avg_spent            0
Category             1
dtype: int64


In [39]:
# Fill missing values with column mean for numerical columns
features = features.fillna(features.mean())


  features = features.fillna(features.mean())


In [40]:
# Checking if there are still missing values
print("Missing values after handling:")
print(features.isna().sum())

Missing values after handling:
CustomerID           0
age                  0
is_male              0
total_spent          0
transaction_count    0
avg_spent            0
Category             1
dtype: int64


In [41]:
# Creating the feature matrix for similarity calculation
feature_matrix = features[['age', 'is_male', 'total_spent', 'transaction_count', 'avg_spent']]

In [42]:
# Calculating cosine similarity
similarity_matrix = cosine_similarity(feature_matrix)

In [43]:
# Getting top 3 similar customers for each of the first 20 customers (C0001 to C0020)
lookalike_dict = {}

In [44]:
for customer_id in range(1, 21):  # For C0001 to C0020
    customer_idx = customer_id - 1  # Adjust index for 0-based indexing
    similarity_scores = similarity_matrix[customer_idx]
    
    # Getting indices of top 3 similar customers, excluding the customer itself
    similar_customer_indices = similarity_scores.argsort()[-4:-1]  # Top 3, excluding self (self is at index 0)
    
    # Preparing the list of top 3 similar customers with their similarity scores
    similar_customers = [(f'C{idx+1:04d}', similarity_scores[idx]) for idx in similar_customer_indices]
    
    # Storing in the dictionary
    lookalike_dict[f'C{customer_id:04d}'] = similar_customers

In [45]:
# Saving it to a CSV file
lookalike_df = pd.DataFrame.from_dict(lookalike_dict, orient='index')
lookalike_df.to_csv('Lookalike.csv', header=False)


In [46]:
# Displaying the Lookalike map
print(lookalike_dict)

{'C0001': [('C0011', 0.9577255957240236), ('C0180', 0.9972870485112197), ('C0152', 0.9996980180004738)], 'C0002': [('C0010', 0.9978769624919364), ('C0199', 0.9982970556285776), ('C0029', 0.9993699721524977)], 'C0003': [('C0177', 0.9817865067669984), ('C0146', 0.990333559548811), ('C0035', 0.9922593409857258)], 'C0004': [('C0108', 0.9813819623640224), ('C0113', 0.981475933953967), ('C0173', 0.9896276476026519)], 'C0005': [('C0112', 0.9980720438403259), ('C0159', 0.9994778705378558), ('C0073', 0.9997529400836948)], 'C0006': [('C0044', 0.9450364709036342), ('C0066', 0.9690347819656097), ('C0117', 0.9967639001336546)], 'C0007': [('C0070', 0.9480766355460091), ('C0135', 0.9485295474782991), ('C0176', 0.978262343947951)], 'C0008': [('C0098', 0.9319696531971403), ('C0093', 0.9381622452120447), ('C0084', 0.9929556413227604)], 'C0009': [('C0097', 0.9875640117232274), ('C0043', 0.9962880646785033), ('C0077', 0.9998319470328094)], 'C0010': [('C0002', 0.9978769624919364), ('C0025', 0.9986952684646