Plan for the Lookalike Model:

Data Preparation:
Merge the Customers.csv, Transactions.csv, and Products.csv datasets to create a complete view of customer transactions and profiles.
Aggregate transaction data (e.g., total spending, most common product category, frequency of transactions) for each customer.

Feature Engineering:
Create features such as total spend, average transaction value, most purchased categories, and region encoding.


Similarity Calculation:
Use a distance metric like cosine similarity or Euclidean distance on the feature vectors.
Model Implementation:

Recommend the top 3 similar customers for the first 20 customers based on similarity scores.

In [24]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load datasets (ensure these are loaded in your environment)
transactions_df = pd.read_csv('Transactions.csv')
customers_df = pd.read_csv('Customers.csv')
products_df = pd.read_csv('Products.csv')

# Merge customer, transaction, and product data
data = pd.merge(transactions_df, customers_df, on='CustomerID')
data = pd.merge(data, products_df, on='ProductID')

# Aggregate transaction data by customer
customer_features = data.groupby('CustomerID').agg({
    'TotalValue': ['sum', 'mean'],
    'TransactionID': 'count',
    'Category': lambda x: x.mode()[0] if not x.mode().empty else 'Unknown'
}).reset_index()

customer_features.columns = ['CustomerID', 'TotalSpend', 'AvgTransactionValue', 'TransactionCount', 'TopCategory']

# One-hot encode categorical features (Region, TopCategory)
# handle_unknown='ignore' to avoid errors with unseen categories during prediction
region_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_region = region_encoder.fit_transform(customers_df[['Region']])

category_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_category = category_encoder.fit_transform(customer_features[['TopCategory']])

# Combine features into a single DataFrame
customer_features = pd.merge(customer_features, customers_df[['CustomerID', 'Region']], on='CustomerID')
customer_features = pd.concat(
    [customer_features, pd.DataFrame(encoded_region, columns=region_encoder.get_feature_names_out(['Region'])),
     pd.DataFrame(encoded_category, columns=category_encoder.get_feature_names_out(['TopCategory']))], axis=1
).drop(['Region', 'TopCategory'], axis=1)

# Handle NaN values before calculating cosine similarity
# Impute NaN values with the mean of each column (a common strategy)
for column in customer_features.select_dtypes(include=np.number).columns:
    customer_features[column] = customer_features[column].fillna(customer_features[column].mean())

# Compute similarity using cosine similarity
feature_matrix = customer_features.drop('CustomerID', axis=1).values
similarity_matrix = cosine_similarity(feature_matrix)

# Extract top 3 lookalikes for the first 20 customers
lookalike_results = {}

for i, customer_id in enumerate(customer_features['CustomerID'][:20]):
    # Get similarity scores for this customer
    similarities = list(enumerate(similarity_matrix[i]))
    # Exclude self-comparison and sort by similarity score
    top_3_similar = sorted(similarities[1:], key=lambda x: x[1], reverse=True)[:3]
    # Map to customer IDs and scores
    lookalike_results[customer_id] = [(customer_features['CustomerID'][j], score) for j, score in top_3_similar]

# Save results to Lookalike.csv
lookalike_df = pd.DataFrame([
    {'CustomerID': cust_id, 'Lookalikes': str(lookalikes)}
    for cust_id, lookalikes in lookalike_results.items()
])

lookalike_df.to_csv('Lookalike.csv', index=False)

print("Lookalike Model completed successfully. Results saved in Lookalike.csv.")


Lookalike Model completed successfully. Results saved in Lookalike.csv.


sample for my another approch

In [21]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load datasets (ensure these are loaded in your environment)
transactions_df = pd.read_csv('Transactions.csv')
customers_df = pd.read_csv('Customers.csv')
products_df = pd.read_csv('Products.csv')

# Merge customer, transaction, and product data
data = pd.merge(transactions_df, customers_df, on='CustomerID')
data = pd.merge(data, products_df, on='ProductID')

# Aggregate transaction data by customer
customer_features = data.groupby('CustomerID').agg({
    'TotalValue': ['sum', 'mean'],
    'TransactionID': 'count',
    'Category': lambda x: x.mode()[0] if not x.mode().empty else 'Unknown'
}).reset_index()

customer_features.columns = ['CustomerID', 'TotalSpend', 'AvgTransactionValue', 'TransactionCount', 'TopCategory']

# One-hot encode categorical features (Region, TopCategory)
region_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_region = region_encoder.fit_transform(customers_df[['Region']])

category_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_category = category_encoder.fit_transform(customer_features[['TopCategory']])

# Combine features into a single DataFrame
customer_features = pd.merge(customer_features, customers_df[['CustomerID', 'Region']], on='CustomerID')
customer_features = pd.concat(
    [customer_features, pd.DataFrame(encoded_region, columns=region_encoder.get_feature_names_out(['Region'])),
     pd.DataFrame(encoded_category, columns=category_encoder.get_feature_names_out(['TopCategory']))], axis=1
).drop(['Region', 'TopCategory'], axis=1)

# Handle NaN values before calculating cosine similarity
for column in customer_features.select_dtypes(include=np.number).columns:
    customer_features[column] = customer_features[column].fillna(customer_features[column].mean())

# Normalize features to ensure fair similarity comparisons
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features.drop('CustomerID', axis=1))

# Compute similarity using cosine similarity
similarity_matrix = cosine_similarity(scaled_features)

# Extract top 3 lookalikes for the first 20 customers
lookalike_results = {}

for i, customer_id in enumerate(customer_features['CustomerID'][:20]):
    # Get similarity scores for this customer
    similarities = list(enumerate(similarity_matrix[i]))
    # Exclude self-comparison and sort by similarity score
    top_3_similar = sorted(similarities, key=lambda x: x[1], reverse=True)[1:4]
    # Map to customer IDs and scores
    lookalike_results[customer_id] = [(customer_features['CustomerID'][j], round(score, 4)) for j, score in top_3_similar]

# Save results to Lookalike.csv
lookalike_df = pd.DataFrame([
    {'CustomerID': cust_id, 'Lookalikes': str(lookalikes)}
    for cust_id, lookalikes in lookalike_results.items()
])

lookalike_df.to_csv('Lookalike.csv', index=False)

# Display sample results for validation
print("Sample Lookalike Results:")
print(lookalike_df.head())
print("\nLookalike Model completed successfully. Results saved in Lookalike.csv.")

Sample Lookalike Results:
  CustomerID                                         Lookalikes
0      C0001  [('C0048', 0.9843), ('C0039', 0.9711), ('C0091...
1      C0002  [('C0088', 0.9881), ('C0134', 0.9776), ('C0106...
2      C0003  [('C0052', 0.9955), ('C0152', 0.9799), ('C0189...
3      C0004  [('C0155', 0.9791), ('C0165', 0.9737), ('C0169...
4      C0005  [('C0146', 0.9853), ('C0007', 0.9678), ('C0140...

Lookalike Model completed successfully. Results saved in Lookalike.csv.
