In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Load the customers, transactions and products tables from CSV files in the
# current working directory (read in that order).
customers, transactions, products = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Transactions.csv', 'Products.csv')
)

In [3]:
# Attach customer attributes and then product attributes to every transaction.
# Both joins are inner joins (the pandas default, spelled out here), so a
# transaction without a matching customer or product is dropped, and a customer
# with no transactions does not appear in merged_df at all.
# A column name present on both sides of a merge comes back suffixed
# '_x' (left side) / '_y' (right side).
merged_df = (
    transactions
    .merge(customers, on='CustomerID', how='inner')
    .merge(products, on='ProductID', how='inner')
)


In [5]:
# Check the columns of the merged DataFrame.
# Worth inspecting because a column name shared by both sides of a merge is
# returned with pandas' default '_x' (left) / '_y' (right) suffixes, and the
# aggregation step must reference the suffixed name.
print(merged_df.columns)


Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price_x', 'CustomerName', 'Region',
       'SignupDate', 'ProductName', 'Category', 'Price_y'],
      dtype='object')


In [8]:
# Collapse the merged transaction rows into one feature row per customer:
#   TotalValue        -> sum over the customer's transactions
#   Quantity          -> sum over the customer's transactions
#   Price_x           -> mean of the LEFT-hand 'Price' column of the merge
#                        (presumably the price recorded in Transactions.csv,
#                        not the Products.csv one, which is 'Price_y' -- verify)
#   Category / Region -> most frequent value for that customer
def _most_common(values):
    """Return the most frequent value; ties resolve to the first sorted mode."""
    return values.mode()[0]


customer_features = (
    merged_df
    .groupby('CustomerID')
    .agg({
        'TotalValue': 'sum',
        'Quantity': 'sum',
        'Price_x': 'mean',
        'Category': _most_common,
        'Region': _most_common,
    })
    .reset_index()  # bring CustomerID back as an ordinary column
)


In [9]:
# One-hot encode the categorical columns. drop_first=True omits the first level
# of each column, so that level is represented by all of its dummies being 0.
# NOTE: this rebinds customer_features, replacing 'Category' and 'Region' with
# dummy columns; re-running this cell without first re-running the aggregation
# cell will fail because those two columns no longer exist.
categorical_columns = ['Category', 'Region']
customer_features = pd.get_dummies(
    customer_features,
    columns=categorical_columns,
    drop_first=True,
)


In [10]:
# Standardize every feature column. CustomerID is an identifier, not a feature,
# so it is removed before scaling; the rows of the scaled array stay in the same
# order as the rows of customer_features.
feature_matrix = customer_features.drop(columns=['CustomerID'])
scaler = StandardScaler()
customer_features_scaled = scaler.fit_transform(feature_matrix)


In [11]:
# Compute similarity using cosine similarity.
# Called with a single argument, so every row of customer_features_scaled is
# compared against every other row; entry [i, j] is the similarity between the
# customers at row positions i and j of customer_features.
similarity_matrix = cosine_similarity(customer_features_scaled)


In [12]:
# Function to get the top 3 (by default) most similar customers
def get_top_3_lookalikes(customer_id, customer_features_df, similarity_matrix, top_n=3):
    """Return the customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id
        Value to look up in ``customer_features_df['CustomerID']``.
    customer_features_df : pandas.DataFrame
        Must contain a 'CustomerID' column. Row *position* k of this frame must
        correspond to row/column k of ``similarity_matrix``.
    similarity_matrix
        Square, indexable matrix of pairwise similarity scores.
    top_n : int, default 3
        Number of lookalikes to return.

    Returns
    -------
    list of (CustomerID, score) tuples
        Ordered from most to least similar, scores rounded to 2 decimals.
        The queried customer itself is never included. Ties keep the row order
        of ``customer_features_df``.

    Raises
    ------
    IndexError
        If ``customer_id`` is not present in ``customer_features_df``.
    """
    all_ids = customer_features_df['CustomerID'].tolist()

    # Locate the customer by row position rather than by index label, so the
    # lookup stays aligned with the matrix even if the frame's index is not the
    # default 0..n-1 range.
    try:
        own_position = all_ids.index(customer_id)
    except ValueError:
        raise IndexError(
            f"CustomerID {customer_id!r} not found in customer_features_df"
        ) from None

    # Exclude the customer explicitly instead of assuming it sorts first:
    # another customer with an identical profile ties at the top score and could
    # otherwise push the customer into its own lookalike list.
    candidates = [
        (position, score)
        for position, score in enumerate(similarity_matrix[own_position])
        if position != own_position
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)  # stable sort

    return [(all_ids[position], round(score, 2)) for position, score in candidates[:top_n]]


In [13]:
# Generate lookalike recommendations for the first 20 customers.
# The IDs come from the customers table, but feature rows exist only for
# customers that survived the inner merge with transactions. A customer with no
# transactions has no row in customer_features and would make the lookup raise,
# so such customers are skipped and reported instead of aborting the whole run.
known_customer_ids = set(customer_features['CustomerID'])
lookalike_results = {}
skipped_customer_ids = []

for cust_id in customers['CustomerID'].head(20):
    if cust_id not in known_customer_ids:
        skipped_customer_ids.append(cust_id)
        continue
    lookalike_results[cust_id] = get_top_3_lookalikes(cust_id, customer_features, similarity_matrix)

if skipped_customer_ids:
    print(f"Skipped customers with no feature row (no transactions): {skipped_customer_ids}")


In [15]:
# Save results to the Lookalike CSV.
# Rows are keyed by the target CustomerID (written as the index column); each
# LookalikeN cell holds one (CustomerID, similarity score) tuple.
output_path = 'NavyaSri_SatyaShanmukhi_Lookalike.csv'
lookalike_df = pd.DataFrame.from_dict(
    lookalike_results,
    orient='index',
    columns=['Lookalike1', 'Lookalike2', 'Lookalike3'],
)
lookalike_df.to_csv(output_path)

# Report the path that was actually written; the message previously named a
# placeholder file that does not match the file on disk.
print(f"Lookalike recommendations saved to {output_path}")


Lookalike recommendations saved to FirstName_LastName_Lookalike.csv
