In [72]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

customers = pd.read_csv(r"C:\Users\monalisa\IdeaProjects\Zeotap\Customers.csv")
products = pd.read_csv(r"C:\Users\monalisa\IdeaProjects\Zeotap\Transactions.csv")
transactions = pd.read_csv(r"C:\Users\monalisa\IdeaProjects\Zeotap\Products.csv")



In [80]:
merged_data = transactions.merge(products, on="ProductID", how="left")
merged_data = merged_data.merge(customers, on="CustomerID", how="left")


In [82]:
merged_data.dropna(subset=["CustomerID", "ProductID", "ProductName", "Region", "TotalValue"], inplace=True)


In [85]:
user_product_matrix = merged_data.pivot_table(index="CustomerID", columns="ProductName", values="Quantity", aggfunc="sum",
    fill_value=0
)


In [87]:
#reset index for manipulation
user_product_matrix.reset_index(inplace=True)

# Combine customer profile data with transaction data for similarity
customer_profiles = customers.set_index("CustomerID")
user_product_matrix = user_product_matrix.set_index("CustomerID")
combined_matrix = user_product_matrix.join(customer_profiles, how="left")

# Fill any remaining missing values with 0
combined_matrix.fillna(0, inplace=True)


In [97]:
print(merged_data.columns)

Index(['ProductID', 'ProductName', 'Category', 'Price_x', 'TransactionID',
       'CustomerID', 'TransactionDate', 'Quantity', 'TotalValue', 'Price_y',
       'CustomerName', 'Region', 'SignupDate'],
      dtype='object')


In [101]:
if "Region" not in combined_matrix.columns:
    print("Error: 'Region' column not found. Please check the merged data.")
else:
    # Convert 'Region' to dummy variables
    combined_matrix = pd.get_dummies(combined_matrix, columns=["Region"], drop_first=True)

# Ensure only numeric columns are used for similarity calculation
numeric_matrix = combined_matrix.select_dtypes(include=[int, float])


Error: 'Region' column not found. Please check the merged data.


In [103]:
numeric_matrix.fillna(0, inplace=True)


In [108]:
similarity_matrix = cosine_similarity(numeric_matrix)
similarity_df = pd.DataFrame(similarity_matrix, index=combined_matrix.index, columns=combined_matrix.index)
lookalike_results = {}

for customer in combined_matrix.index[:20]:
    similar_customers = similarity_df[customer].sort_values(ascending=False).iloc[1:4]
    lookalike_results[customer] = list(zip(similar_customers.index, similar_customers.values))


In [118]:
lookalike_data = []
for cust_id, similar in lookalike_results.items():
    for sim_cust_id, score in similar:
        lookalike_data.append({
            "CustomerID": cust_id,
            "SimilarCustomerID": sim_cust_id,
            "SimilarityScore": score
        })

lookalike_df = pd.DataFrame(lookalike_data)
lookalike_df.to_csv("Monalisa_V_Lookalike.csv", index=False)

print("\nLookalike Model Completed. Results saved in 'Monalisa_V_Lookalike.csv'.")


Lookalike Model Completed. Results saved in 'Monalisa_V_Lookalike.csv'.
