In [3]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Load the three source tables. DATA_DIR is the single place where the
# location is spelled out, so relocating the notebook means editing one line.
DATA_DIR = '/content/sample_data/task'

customers = pd.read_csv(f'{DATA_DIR}/Customers.csv')
products = pd.read_csv(f'{DATA_DIR}/Products.csv')
transaction = pd.read_csv(f'{DATA_DIR}/Transactions.csv')

# Attach customer and product attributes to every transaction. Both merges are
# inner joins (the pandas default), so transactions whose CustomerID/ProductID
# has no match in the lookup table are dropped.
# validate="many_to_one" makes pandas raise MergeError if an ID appears more
# than once in the lookup table, instead of silently duplicating transaction
# rows and inflating the per-customer sums computed from `data`.
data = (
    transaction
    .merge(customers, on="CustomerID", validate="many_to_one")
    .merge(products, on="ProductID", validate="many_to_one")
)

# Feature engineering: collapse the merged transactions into one row per
# customer holding total spend, total units, region and dominant category.
def _most_common(values):
    """Return the first entry of ``values.mode()`` (mode() may hold several values on a tie)."""
    return values.mode()[0]


customer_profiles = (
    data.groupby("CustomerID")
    .agg(
        TotalValue=("TotalValue", "sum"),
        Quantity=("Quantity", "sum"),
        Region=("Region", "first"),
        Category=("Category", _most_common),
    )
    .reset_index()
)

# Encode categorical features for easy modeling.
# Region and Category become one-hot indicator columns; the remaining numeric
# columns (TotalValue, Quantity) are passed through unchanged.
# sparse_threshold=0 forces a dense array: OneHotEncoder emits sparse output by
# default and ColumnTransformer keeps the stacked result sparse whenever its
# density falls below the threshold (0.3 by default). The StandardScaler below
# centers the data and raises on sparse input, so without this setting whether
# the cell runs depends on how many distinct regions/categories the data has.
encoder = ColumnTransformer(
    [
        ("region_enc", OneHotEncoder(), ["Region"]),
        ("category_enc", OneHotEncoder(), ["Category"]),
    ],
    remainder='passthrough',
    sparse_threshold=0,
)

# Exclude CustomerID from features
customer_features = customer_profiles.drop(columns=["CustomerID"])
customer_matrix = encoder.fit_transform(customer_features)

# Normalize features: every column (indicator and numeric alike) is rescaled
# to zero mean and unit variance.
scaler = StandardScaler()
customer_matrix_scaled = scaler.fit_transform(customer_matrix)

# Compute similarity: entry [i, j] is the cosine similarity between the scaled
# feature vectors of the customers in rows i and j of customer_profiles.
similarity_matrix = cosine_similarity(customer_matrix_scaled)

# Get top 3 similar customers
TOP_K = 3        # number of look-alikes kept per customer
N_TARGETS = 20   # only the first 20 customers (in customer_profiles row order) get recommendations

# Pull the IDs out once as a plain list so that every lookup below is purely
# positional and cheap, rather than a label-based Series lookup per element.
customer_ids = customer_profiles["CustomerID"].tolist()

# Maps each target CustomerID to a list of (other CustomerID, similarity)
# pairs, most similar first.
lookalike_map = {}
for i, customer_id in enumerate(customer_ids[:N_TARGETS]):
    similarity_scores = similarity_matrix[i]
    # Rank every other row by its similarity to row i, best first. The customer
    # itself (j == i) is excluded. list.sort is stable, so rows with equal
    # scores keep their ascending row order.
    candidates = [j for j in range(len(similarity_scores)) if j != i]
    candidates.sort(key=lambda j: similarity_scores[j], reverse=True)
    similar_customers = [(customer_ids[j], similarity_scores[j]) for j in candidates[:TOP_K]]
    lookalike_map[customer_id] = similar_customers

# Create Lookalike.csv file: one row per target customer, with the look-alike
# list stored as the text form of a Python list of (CustomerID, score) tuples.
# The pairs are coerced to built-in str/float before formatting because str()
# of a list uses each element's repr, and NumPy >= 2 renders its scalars as
# "np.float64(0.99...)", which would leak into the file.
lookalike_df = pd.DataFrame({
    "CustomerID": list(lookalike_map.keys()),
    "Lookalikes": [
        str([(str(other_id), float(score)) for other_id, score in matches])
        for matches in lookalike_map.values()
    ],
})
lookalike_df.to_csv("Lookalike.csv", index=False)

print("Lookalike model complete. Results saved to Lookalike.csv")


Lookalike model complete. Results saved to Lookalike.csv


In [4]:
# Read the exported file back from disk and preview its first five rows.
df = pd.read_csv("Lookalike.csv")
df.head(n=5)

Unnamed: 0,CustomerID,Lookalikes
0,C0001,"[('C0184', 0.9982842666091768), ('C0048', 0.99..."
1,C0002,"[('C0088', 0.9989518323935593), ('C0092', 0.98..."
2,C0003,"[('C0076', 0.9832630233514876), ('C0052', 0.97..."
3,C0004,"[('C0169', 0.9822343186427105), ('C0087', 0.97..."
4,C0005,"[('C0186', 0.9990316040148837), ('C0140', 0.99..."


In [5]:
# Dimensions of the re-read file as (rows, columns).
df.shape

(20, 2)

In [6]:
# Preview the last rows of the re-read file.
df.tail()

Unnamed: 0,CustomerID,Lookalikes
15,C0016,"[('C0183', 0.9941505985159991), ('C0067', 0.97..."
16,C0017,"[('C0041', 0.9747995486740378), ('C0057', 0.96..."
17,C0018,"[('C0122', 0.9969280407197494), ('C0046', 0.98..."
18,C0019,"[('C0073', 0.9980212842326817), ('C0167', 0.98..."
19,C0020,"[('C0157', 0.9428502187556475), ('C0050', 0.92..."
