>>>> # Lookalike Model

In [53]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.preprocessing import StandardScaler

**Loading the datasets**

In [57]:
# Read the three input tables from CSV files in the current working directory.
# The generator is consumed left to right, so the files are read in the same
# order as the names they are unpacked into.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

In [58]:
# Preview the first rows of the customers table to check the columns that were loaded.
customers.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [59]:
# Preview the first rows of the products table to check the columns that were loaded.
products.head()

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


In [60]:
# Preview the first rows of the transactions table to check the columns that were loaded.
transactions.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


**Merging transactions with customers on 'CustomerID', then with products on 'ProductID'**

In [63]:
# Enrich every transaction with its customer's attributes and then with its
# product's attributes. Both joins are inner joins, so a transaction is kept
# only when its CustomerID and its ProductID are found in the lookup tables.
# Non-key columns that exist on both sides of a join get pandas' default
# `_x` / `_y` suffixes.
merged_data = (
    transactions
    .merge(customers, on='CustomerID', how='inner')
    .merge(products, on='ProductID', how='inner')
)

In [64]:
# Build one spending profile row per customer:
#   total_spending        - sum of TotalValue over the customer's transactions
#   avg_transaction_value - mean TotalValue per transaction
#   product_count         - number of distinct ProductIDs purchased
# reset_index() turns the CustomerID group key back into a regular column.
customer_spending = (
    merged_data
    .groupby('CustomerID')
    .agg(
        total_spending=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
        avg_transaction_value=pd.NamedAgg(column='TotalValue', aggfunc='mean'),
        product_count=pd.NamedAgg(column='ProductID', aggfunc='nunique'),
    )
    .reset_index()
)

**Creating product preferences by counting how many times a customer bought each product**

In [70]:
# Customer x product purchase-count table: rows are CustomerIDs, columns are
# ProductIDs, and each cell is the number of transaction rows for that pair.
# Pairs that never occur are filled with 0 instead of NaN.
product_preferences = (
    merged_data
    .groupby(['CustomerID', 'ProductID'])
    .size()
    .unstack(fill_value=0)
)

**Merge customer spending data with product preferences**

In [73]:
# Left-join the per-product purchase counts onto the spending profile so that
# every customer in customer_spending keeps exactly one row. The right-hand
# table is matched through its index level named 'CustomerID'.
customer_data = customer_spending.merge(
    product_preferences,
    how='left',
    on='CustomerID',
)

### Normalizing relevant features

In [78]:
# Standardise the three spending features in place so that no single feature
# dominates the similarity metrics because of its scale.
# NOTE: only these three columns feed the metrics below; the per-product count
# columns that were merged into customer_data are not part of the comparison.
features_to_normalize = ['total_spending', 'avg_transaction_value', 'product_count']
scaler = StandardScaler()
customer_data[features_to_normalize] = scaler.fit_transform(customer_data[features_to_normalize])

# Pairwise cosine similarity between customers (higher = more alike),
# labelled by CustomerID on both axes.
cosine_similarity_matrix = cosine_similarity(customer_data[features_to_normalize].to_numpy())
cosine_similarity_df = pd.DataFrame(
    data=cosine_similarity_matrix,
    index=customer_data['CustomerID'],
    columns=customer_data['CustomerID'],
)

# Pairwise Euclidean distance between customers (lower = more alike),
# labelled by CustomerID on both axes.
euclidean_distance_matrix = euclidean_distances(customer_data[features_to_normalize].to_numpy())
euclidean_distance_df = pd.DataFrame(
    data=euclidean_distance_matrix,
    index=customer_data['CustomerID'],
    columns=customer_data['CustomerID'],
)

**Initializing dictionaries to store the lookalike data**

In [81]:
# Maps CustomerID -> (list of 3 lookalike CustomerIDs, list of their 3 scores),
# ordered from most to least similar under the respective metric.
lookalikes_cosine = {}
lookalikes_euclidean = {}

# For each customer, find the top 3 most similar customers using Cosine Similarity and Euclidean Distance
for customer_id in customer_data['CustomerID']:
    # Remove the customer's own entry by label rather than assuming it sorts
    # first: when another customer ties with the self-score (identical or
    # proportional feature vectors, or floating-point rounding) the self entry
    # is not guaranteed to be at position 0, and positional slicing could then
    # list the customer as their own lookalike.
    cosine_scores = cosine_similarity_df[customer_id].drop(labels=customer_id)

    # Highest cosine similarity first; keep the 3 best matches.
    similar_customers_cosine = cosine_scores.sort_values(ascending=False).iloc[:3]
    lookalikes_cosine[customer_id] = (
        similar_customers_cosine.index.tolist(),
        similar_customers_cosine.values.tolist(),
    )

    # Deliberately NOT named `euclidean_distances`: that name is the imported
    # scikit-learn function, and rebinding it to a Series here would make the
    # similarity cell fail when re-run in the same kernel.
    customer_distances = euclidean_distance_df[customer_id].drop(labels=customer_id)

    # Smallest Euclidean distance first; keep the 3 closest customers.
    closest_customers_euclidean = customer_distances.sort_values(ascending=True).iloc[:3]
    lookalikes_euclidean[customer_id] = (
        closest_customers_euclidean.index.tolist(),
        closest_customers_euclidean.values.tolist(),
    )

**Building the lookalike rows (customer, three lookalike IDs and scores) for Cosine Similarity**

In [88]:
# Flatten each customer's cosine lookalikes into one row:
# [CustomerID, id1, score1, id2, score2, id3, score3].
# Indexing positions 0..2 explicitly means a customer with fewer than three
# lookalikes raises IndexError instead of producing a short row.
lookalike_data_cosine = [
    [customer_id] + [
        entry
        for rank in range(3)
        for entry in (lookalike_ids[rank], scores[rank])
    ]
    for customer_id, (lookalike_ids, scores) in lookalikes_cosine.items()
]

**Building the lookalike rows (customer, three lookalike IDs and scores) for Euclidean Distance**

In [92]:
# Flatten each customer's Euclidean lookalikes into one row:
# [CustomerID, id1, score1, id2, score2, id3, score3].
# Indexing positions 0..2 explicitly means a customer with fewer than three
# lookalikes raises IndexError instead of producing a short row.
lookalike_data_euclidean = [
    [customer_id] + [
        entry
        for rank in range(3)
        for entry in (lookalike_ids[rank], scores[rank])
    ]
    for customer_id, (lookalike_ids, scores) in lookalikes_euclidean.items()
]

In [94]:
# Both result tables share one layout: the customer followed by three
# (lookalike id, score) pairs, best match first.
lookalike_columns = [
    'CustomerID',
    'LookalikeID1', 'Score1',
    'LookalikeID2', 'Score2',
    'LookalikeID3', 'Score3',
]
lookalike_df_cosine = pd.DataFrame(data=lookalike_data_cosine, columns=lookalike_columns)
lookalike_df_euclidean = pd.DataFrame(data=lookalike_data_euclidean, columns=lookalike_columns)

**Check the lookalike DataFrame for Cosine Similarity**

In [98]:
# Print the heading and the cosine lookalike table (plain-text repr), one per line.
print("Lookalikes based on Cosine Similarity:", lookalike_df_cosine, sep="\n")

Lookalikes based on Cosine Similarity:
    CustomerID LookalikeID1    Score1 LookalikeID2    Score2 LookalikeID3  \
0        C0001        C0137  0.996315        C0152  0.981365        C0172   
1        C0002        C0029  0.999666        C0199  0.998948        C0010   
2        C0003        C0178  0.999525        C0005  0.998888        C0144   
3        C0004        C0021  0.999686        C0075  0.999451        C0067   
4        C0005        C0073  0.999479        C0063  0.999043        C0159   
..         ...          ...       ...          ...       ...          ...   
194      C0196        C0006  0.993353        C0079  0.993231        C0117   
195      C0197        C0131  0.999855        C0112  0.999390        C0130   
196      C0198        C0058  0.998830        C0014  0.993840        C0128   
197      C0199        C0031  0.999933        C0192  0.999030        C0002   
198      C0200        C0018  1.000000        C0170  0.999990        C0187   

       Score3  
0    0.962110  
1   

**Checking the lookalike DataFrame for Euclidean Distance**

In [102]:
# Print the heading and the Euclidean lookalike table (plain-text repr), one per line.
print("Lookalikes based on Euclidean Distance:", lookalike_df_euclidean, sep="\n")

Lookalikes based on Euclidean Distance:
    CustomerID LookalikeID1    Score1 LookalikeID2    Score2 LookalikeID3  \
0        C0001        C0137  0.022072        C0152  0.031486        C0056   
1        C0002        C0029  0.091239        C0199  0.138324        C0031   
2        C0003        C0178  0.019644        C0035  0.163522        C0146   
3        C0004        C0021  0.049389        C0093  0.342971        C0173   
4        C0005        C0073  0.039640        C0159  0.057531        C0112   
..         ...          ...       ...          ...       ...          ...   
194      C0196        C0168  0.481669        C0066  0.653311        C0138   
195      C0197        C0131  0.022865        C0112  0.046513        C0036   
196      C0198        C0128  0.345290        C0015  0.491141        C0062   
197      C0199        C0031  0.030979        C0192  0.110906        C0002   
198      C0200        C0018  0.023358        C0187  0.146841        C0170   

       Score3  
0    0.172380  
1  

### Saving the lookalike data to CSV

In [106]:
# Tag each result table (in place) with the metric that produced it so the
# rows stay distinguishable once the two tables are stacked.
for result_frame, metric_name in (
    (lookalike_df_cosine, 'Cosine'),
    (lookalike_df_euclidean, 'Euclidean'),
):
    result_frame['SimilarityMetric'] = metric_name

# Stack the cosine rows on top of the Euclidean rows.
combined_df = pd.concat([lookalike_df_cosine, lookalike_df_euclidean], axis=0)

# Write everything to a single CSV; the row index is omitted from the file.
combined_df.to_csv('Nallapu_Naveen_Lookalike.csv', index=False)