In [1]:
import numpy as np
import pandas as pd
import matplotlib
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score


## Read the data

In [2]:
# Load the three input tables, in order, from CSV files in the working directory.
Cust_data, Prod_data, Trans_data = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

## Merge and Aggregate the Data per Customer

In [3]:

# Attach product attributes to every transaction, then customer attributes.
transactions = Trans_data.merge(Prod_data, on="ProductID", how="left")
data = transactions.merge(Cust_data, on="CustomerID", how="left")

# One row per customer: total spend, number of transactions, and number of
# distinct products bought.
customer_features = (
    data.groupby("CustomerID")
    .agg(
        total_spent=pd.NamedAgg(column="TotalValue", aggfunc="sum"),
        transaction_count=pd.NamedAgg(column="TransactionID", aggfunc="count"),
        distinct_products=pd.NamedAgg(column="ProductID", aggfunc="nunique"),
    )
    .reset_index()
)

# Rescale every feature column (everything after CustomerID) to the [0, 1] range.
scaler = MinMaxScaler()
customer_features_scaled = scaler.fit_transform(customer_features.iloc[:, 1:])

## Display some samples 

In [4]:
# Preview the first rows of the per-customer aggregate table.
customer_features.head()

Unnamed: 0,CustomerID,total_spent,transaction_count,distinct_products
0,C0001,3354.52,5,5
1,C0002,1862.74,4,4
2,C0003,2725.38,4,4
3,C0004,5354.88,8,8
4,C0005,2034.24,3,3


In [5]:
# Show the transactions/products left-merge built earlier. Price_x / Price_y
# appear because both merged inputs carry a Price column.
transactions

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68
...,...,...,...,...,...,...,...,...,...,...
995,T00496,C0118,P037,2024-10-24 08:30:27,1,459.86,459.86,SoundWave Smartwatch,Electronics,459.86
996,T00759,C0059,P037,2024-06-04 02:15:24,3,1379.58,459.86,SoundWave Smartwatch,Electronics,459.86
997,T00922,C0018,P037,2024-04-05 13:05:32,4,1839.44,459.86,SoundWave Smartwatch,Electronics,459.86
998,T00959,C0115,P037,2024-09-29 10:16:02,2,919.72,459.86,SoundWave Smartwatch,Electronics,459.86


In [6]:
# Show the MinMax-scaled feature matrix; its columns follow customer_features.iloc[:, 1:].
customer_features_scaled

array([[0.30894178, 0.4       , 0.44444444],
       [0.16809501, 0.3       , 0.33333333],
       [0.24954138, 0.3       , 0.33333333],
       [0.49780626, 0.7       , 0.77777778],
       [0.18428723, 0.2       , 0.22222222],
       [0.39137101, 0.3       , 0.33333333],
       [0.2357983 , 0.2       , 0.22222222],
       [0.39552906, 0.9       , 1.        ],
       [0.07686723, 0.2       , 0.22222222],
       [0.15438686, 0.3       , 0.33333333],
       [0.34439282, 0.4       , 0.44444444],
       [0.48613465, 0.6       , 0.55555556],
       [0.5587683 , 0.6       , 0.66666667],
       [0.02231032, 0.        , 0.        ],
       [0.10150772, 0.1       , 0.11111111],
       [0.35802827, 0.4       , 0.44444444],
       [0.44105704, 0.7       , 0.77777778],
       [0.44370349, 0.4       , 0.44444444],
       [0.19859869, 0.5       , 0.55555556],
       [0.07931353, 0.        , 0.        ],
       [0.50394703, 0.7       , 0.77777778],
       [0.43173353, 0.5       , 0.55555556],
       [0.

## Compute cosine similarity and find the 3 most similar customer IDs

In [None]:
# Pairwise cosine similarity between all customers, labelled by CustomerID on both axes.
customer_ids = customer_features["CustomerID"]
similarity_matrix = cosine_similarity(customer_features_scaled)
similarity_df = pd.DataFrame(similarity_matrix, index=customer_ids, columns=customer_ids)

# Generate lookalike results: customer ID -> [(similar customer ID, score), ...]
lookalike_results = {}
for customer_id in similarity_df.index:
    # Drop the customer's own entry explicitly instead of assuming it sorts first:
    # ties, floating-point rounding, or an all-zero scaled row can put another
    # customer ahead of the self-similarity, which would list a customer as its
    # own lookalike.
    similar_customers = similarity_df.loc[customer_id].drop(customer_id).nlargest(3)
    lookalike_results[customer_id] = list(zip(similar_customers.index, similar_customers.values))

## Display Result

In [28]:
# NOTE(review): the saved output below shows Index/array tuples, which does not match
# the (customer ID, score) pairs built by the loop -- re-run this cell to refresh it.
lookalike_results  # Dictionary mapping each customer ID to its most similar customers

{'C0001': [(Index(['C0173', 'C0137', 'C0152'], dtype='object', name='CustomerID'),),
  (array([1.        , 0.99999624, 0.9999924 ]),)],
 'C0002': [(Index(['C0103', 'C0024', 'C0034'], dtype='object', name='CustomerID'),),
  (array([0.99999961, 0.99999889, 0.99999661]),)],
 'C0003': [(Index(['C0155', 'C0107', 'C0178'], dtype='object', name='CustomerID'),),
  (array([0.99999971, 0.99999777, 0.99999645]),)],
 'C0004': [(Index(['C0164', 'C0021', 'C0166'], dtype='object', name='CustomerID'),),
  (array([0.99999953, 0.9999886 , 0.99994149]),)],
 'C0005': [(Index(['C0193', 'C0100', 'C0013'], dtype='object', name='CustomerID'),),
  (array([0.99999999, 0.99999286, 0.99998868]),)],
 'C0006': [(Index(['C0079', 'C0148', 'C0114'], dtype='object', name='CustomerID'),),
  (array([0.99999591, 0.99990557, 0.99988547]),)],
 'C0007': [(Index(['C0082', 'C0085', 'C0171'], dtype='object', name='CustomerID'),),
  (array([0.99999999, 0.99999354, 0.99997827]),)],
 'C0008': [(Index(['C0047', 'C0111', 'C0157'], d

In [31]:
# Generate lookalike results for the first 20 customers.
# NOTE: this rebinds `lookalike_results`, so from here on it only holds those 20 customers.
lookalike_results = {}
for customer_id in similarity_df.index[:20]:
    # Exclude the customer's own entry explicitly; relying on it sorting first
    # breaks on ties or floating-point rounding of the self-similarity.
    similar_customers = similarity_df.loc[customer_id].drop(customer_id).nlargest(3)
    lookalike_results[customer_id] = list(zip(similar_customers.index, similar_customers.values))
# The misspelled, never-used `lookalike_datafeame` frame was removed; the export
# DataFrame is built separately from `lookalike_results`.

In [32]:
# Create a DataFrame for the results: one row per customer, with its lookalikes
# serialized as the text of a list of (customer ID, score) tuples.
lookalike_dataframe = pd.DataFrame({
    'cust_id': list(lookalike_results),
    'lookalikes': [
        # Cast scores to built-in floats first: under NumPy >= 2, str() of a NumPy
        # scalar inside a list renders as "np.float64(...)", which would leak into the CSV.
        str([(similar_id, float(score)) for similar_id, score in matches])
        for matches in lookalike_results.values()
    ],
})

# Save to CSV
lookalike_dataframe.to_csv('Lookalike_Mapping.csv', index=False)


## Display one sample

In [10]:
# Show the (customer ID, similarity score) pairs stored for customer C0001.
lookalike_results['C0001']

[('C0173', np.float64(0.999999999087016)),
 ('C0137', np.float64(0.9999962407175139)),
 ('C0152', np.float64(0.9999924026959218))]

In [15]:
def get_lookalike_customers(new_customer_features, old_customer_features, top_n=3):
    """Find the existing customers most similar to a new customer.

    The new customer's row is combined with the existing customers, every feature
    column is MinMax-scaled over that combined table, and the new customer is
    compared with each existing customer by cosine similarity.

    Parameters
    ----------
    new_customer_features : pandas.Series, dict, or list
        One customer row holding "CustomerID" plus the same feature columns as
        ``old_customer_features`` (a list is matched to the columns by position).
    old_customer_features : pandas.DataFrame
        Existing customers: a "CustomerID" column plus numeric feature columns.
        This frame is not modified.
    top_n : int, default 3
        Number of most similar existing customers to return.

    Returns
    -------
    dict
        ``{new_customer_id: [(existing_customer_id, similarity), ...]}`` ordered
        from most to least similar; the list is empty when there are no existing
        customers.
    """
    id_column = "CustomerID"

    # Build the new row as its own frame rather than appending to the caller's
    # DataFrame, so the argument is never mutated.
    new_row = pd.DataFrame([new_customer_features], columns=old_customer_features.columns)
    customer_id = new_row[id_column].iloc[0]
    if old_customer_features.empty:
        return {customer_id: []}

    combined = pd.concat([old_customer_features, new_row], ignore_index=True)

    # Scale the features of the rows actually passed in, instead of reading a
    # pre-computed global matrix that may not correspond to these inputs.
    feature_values = combined.drop(columns=id_column).astype(float)
    scaled = MinMaxScaler().fit_transform(feature_values)

    # Compare the new customer (last row) only against the existing customers,
    # so the customer can never be returned as its own lookalike.
    scores = cosine_similarity(scaled[-1:], scaled[:-1])[0]
    similar_customers = pd.Series(
        scores, index=old_customer_features[id_column].to_numpy()
    ).nlargest(top_n)
    return {customer_id: list(zip(similar_customers.index, similar_customers.values))}
    



In [None]:
# Treat the last aggregated customer as the "new" customer and the rest as existing ones.
new_customer_features = customer_features.iloc[-1]
# Take an explicit copy: an iloc slice handed to a helper that writes rows into it
# triggers SettingWithCopy behaviour and ties the result to `customer_features`.
old_customer_features = customer_features.iloc[:-1].copy()
lookalike = get_lookalike_customers(new_customer_features, old_customer_features)

In [17]:
# Show the dictionary returned by get_lookalike_customers for the held-out customer.
lookalike

{'C0200': [('C0018', np.float64(0.9999971875357765)),
  ('C0141', np.float64(0.9999949744327875)),
  ('C0042', np.float64(0.9999932041386305))]}

In [18]:
# `lookalike_results` is rebuilt for only the first 20 customers before the CSV export,
# so on a fresh top-to-bottom run it has no 'C0200' key. Recompute the reference
# lookalikes directly from the full similarity matrix instead.
reference_matches = similarity_df.loc['C0200'].drop('C0200').nlargest(3)
list(zip(reference_matches.index, reference_matches.values))

[('C0018', np.float64(0.9999971875357765)),
 ('C0141', np.float64(0.9999949744327875)),
 ('C0042', np.float64(0.9999932041386305))]