In [7]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

# Load dataset
customers_df = pd.read_csv(r"Downloads/Customers.csv")
products_df = pd.read_csv(r"Downloads/Products.csv")  
transactions_df = pd.read_csv(r"Downloads/Transactions.csv")

In [8]:
# Merging two tables
customer_profile = pd.merge(customers_df, transactions_df, on='CustomerID', how='left')
customer_profile1 = pd.merge(customer_profile, products_df, on='ProductID', how='left')


In [9]:
customer_profile1.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,TransactionID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y
0,C0001,Lawrence Carroll,South America,2022-07-10,T00015,P054,2024-01-19 03:12:55,2.0,114.6,57.3,SoundWave Cookbook,Books,57.3
1,C0001,Lawrence Carroll,South America,2022-07-10,T00932,P022,2024-09-17 09:01:18,3.0,412.62,137.54,HomeSense Wall Art,Home Decor,137.54
2,C0001,Lawrence Carroll,South America,2022-07-10,T00085,P096,2024-04-08 00:01:00,2.0,614.94,307.47,SoundWave Headphones,Electronics,307.47
3,C0001,Lawrence Carroll,South America,2022-07-10,T00445,P083,2024-05-07 03:11:44,2.0,911.44,455.72,ActiveWear Smartwatch,Electronics,455.72
4,C0001,Lawrence Carroll,South America,2022-07-10,T00436,P029,2024-11-02 17:04:16,3.0,1300.92,433.64,TechPro Headphones,Electronics,433.64


In [10]:
# Aggregating transaction and product-level data
customer_metrics = customer_profile1.groupby('CustomerID').agg(
    total_spend=('TotalValue', 'sum'),
    avg_spend=('TotalValue', 'mean'),
    total_quantity=('Quantity', 'sum'),
    unique_products=('ProductID', 'nunique')
).reset_index()

In [11]:
# Add customer profile features
customer_features = pd.merge(customer_metrics, customers_df, on='CustomerID', how='left')

In [12]:
# Encode categorical features (e.g., Region)
customer_features = pd.get_dummies(customer_features, columns=['Region'], drop_first=True)

In [18]:
# Create a similarity matrix using cosine similarity
feature_matrix = customer_features.drop(columns=['CustomerID'])
print(feature_matrix.dtypes)
#similarity_matrix = cosine_similarity(feature_matrix)


total_spend             float64
avg_spend               float64
total_quantity          float64
unique_products           int64
CustomerName             object
SignupDate               object
Region_Europe              bool
Region_North America       bool
Region_South America       bool
dtype: object


In [19]:
feature_matrix = feature_matrix.apply(pd.to_numeric, errors='coerce')  # Convert to numeric


In [20]:
feature_matrix.isnull().sum()

total_spend               0
avg_spend                 1
total_quantity            0
unique_products           0
CustomerName            200
SignupDate              200
Region_Europe             0
Region_North America      0
Region_South America      0
dtype: int64

In [21]:
feature_matrix = feature_matrix.fillna(0) 

In [35]:
from sklearn.metrics.pairwise import cosine_similarity

print(feature_matrix.dtypes)  # Verify all columns are numeric



total_spend             float64
avg_spend               float64
total_quantity          float64
unique_products         float64
CustomerName             object
SignupDate               object
Region_Europe              bool
Region_North America       bool
Region_South America       bool
dtype: object


In [26]:
from sklearn.impute import SimpleImputer
from sklearn.neighbors import NearestNeighbors

# Create an imputer to fill missing values with the mean (you can also use median or most frequent)
imputer = SimpleImputer(strategy='mean')  # Or strategy='median', 'most_frequent'
X_imputed = imputer.fit_transform(feature_matrix)

# Now use the NearestNeighbors model
nn = NearestNeighbors(n_neighbors=3)
nn.fit(X_imputed)


In [28]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [31]:
features = customer_features.drop(columns=['CustomerID'])  # Drop non-numeric columns
features = pd.get_dummies(features, drop_first=True)  # One-hot encode categorical data
scaler = StandardScaler()
scaled_features = scaler.fit_transform(feature_matrix)


In [32]:
# Apply K-Means Clustering
kmeans = KMeans(n_clusters=5, random_state=42)
customer_features['Cluster'] = kmeans.fit_predict(scaled_features)





In [33]:
# Find Lookalikes for the first 20 customers
lookalikes = {}
for cust_id in customer_features['CustomerID'][:20]:
    cluster = customer_features.loc[customer_features['CustomerID'] == cust_id, 'Cluster'].values[0]
    similar_customers = customer_features[customer_features['Cluster'] == cluster]['CustomerID'].tolist()
    lookalikes[cust_id] = [c for c in similar_customers if c != cust_id][:3]  # Top 3 excluding self



In [34]:
# Print lookalikes
print(lookalikes)

{'C0001': ['C0003', 'C0004', 'C0006'], 'C0002': ['C0005', 'C0007', 'C0025'], 'C0003': ['C0001', 'C0004', 'C0006'], 'C0004': ['C0001', 'C0003', 'C0006'], 'C0005': ['C0002', 'C0007', 'C0025'], 'C0006': ['C0001', 'C0003', 'C0004'], 'C0007': ['C0002', 'C0005', 'C0025'], 'C0008': ['C0017', 'C0021', 'C0022'], 'C0009': ['C0010', 'C0014', 'C0019'], 'C0010': ['C0009', 'C0014', 'C0019'], 'C0011': ['C0001', 'C0003', 'C0004'], 'C0012': ['C0001', 'C0003', 'C0004'], 'C0013': ['C0001', 'C0003', 'C0004'], 'C0014': ['C0009', 'C0010', 'C0019'], 'C0015': ['C0016', 'C0018', 'C0020'], 'C0016': ['C0015', 'C0018', 'C0020'], 'C0017': ['C0008', 'C0021', 'C0022'], 'C0018': ['C0015', 'C0016', 'C0020'], 'C0019': ['C0009', 'C0010', 'C0014'], 'C0020': ['C0015', 'C0016', 'C0018']}


In [44]:
lookalikes_df = pd.DataFrame.from_dict(lookalikes, orient='index', columns=['Lookalike1', 'Lookalike2', 'Lookalike3'])

# Save the DataFrame to a CSV file
lookalikes_df.to_csv('customer_lookalikes.csv')