In [7]:
from IPython.core.display import display, HTML

html_content = """
<h1>Task 2: Lookalike Model</h1>


<p>Build a Lookalike Model that takes a user's information as input and recommends 3 similar customers based on their profile and transaction history. The model should:
● Use both customer and product information.
● Assign a similarity score to each recommended customer.</p>
"""

display(HTML(html_content))
import pandas as pd

# Load datasets
customers = pd.read_csv("Customers.csv")
transactions = pd.read_csv("Transactions.csv")
products = pd.read_csv("Products.csv")



# Merge transactions with customer and product information
data = transactions.merge(customers, on="CustomerID", how="left").merge(products, on="ProductID", how="left")

# Verify the merged data structure
# print("Merged Data Columns:", data.columns)
# print(data.head())

# Check for valid columns before aggregation
required_columns = ["TotalValue", "Quantity", "Category"]
missing_columns = [col for col in required_columns if col not in data.columns]
if missing_columns:
    print(f"Warning: Missing columns for aggregation: {missing_columns}")

# Perform aggregation on available columns
customer_profile = data.groupby("CustomerID").agg({
    "TotalValue": "sum",        # Total spending
    "Quantity": "sum",          # Total quantity purchased
    "Category": lambda x: ','.join(x.dropna().unique()),  # Unique product categories
}).reset_index()

# print(customer_profile.head())

from sklearn.preprocessing import LabelEncoder

# Encode categorical features (e.g., Category)
if "Category" in customer_profile.columns:
    encoder = LabelEncoder()
    customer_profile["Category"] = encoder.fit_transform(customer_profile["Category"])

# Check if 'TotalValue' exists; if not, calculate it
if "TotalValue" not in data.columns:
    data["TotalValue"] = data["Quantity"] * data["Price"]

# Perform aggregation
customer_profile = data.groupby("CustomerID").agg({
    "TotalValue": "sum",         # Total spending
    "Quantity": "sum",           # Total quantity purchased
    "Category": lambda x: ','.join(x.dropna().unique()),  # Unique product categories
}).reset_index()

# Verify columns
print("Updated Customer Profile Columns:", customer_profile.columns)
# Ensure "Category" is encoded only if it exists
if "Category" in customer_profile.columns:
    from sklearn.preprocessing import LabelEncoder
    encoder = LabelEncoder()
    customer_profile["Category"] = encoder.fit_transform(customer_profile["Category"])
else:
    print("Warning: 'Category' column not found in customer_profile.")

# Select features for similarity computation (check for column existence)
valid_features = [col for col in ["TotalValue", "Quantity", "Category"] if col in customer_profile.columns]

if not valid_features:
    raise ValueError("No valid columns found for similarity computation.")

features = customer_profile[valid_features]

# Compute cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
similarity_matrix = cosine_similarity(features)

# Convert to DataFrame
similarity_df = pd.DataFrame(similarity_matrix, index=customer_profile["CustomerID"], columns=customer_profile["CustomerID"])

# print("Customer Profile Columns:", customer_profile.columns)
# print(customer_profile.head())
valid_features = [col for col in ["TotalValue", "Quantity", "Category"] if col in customer_profile.columns]

if not valid_features:
    raise ValueError("No valid columns found for similarity computation.")
similarity_matrix = cosine_similarity(features)

# Convert to DataFrame
similarity_df = pd.DataFrame(similarity_matrix, index=customer_profile["CustomerID"], columns=customer_profile["CustomerID"])
print(similarity_df)

  from IPython.core.display import display, HTML


Updated Customer Profile Columns: Index(['CustomerID', 'TotalValue', 'Quantity', 'Category'], dtype='object')
CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000  0.999742  0.999888  0.999999  0.999761  0.999999   
C0002       0.999742  1.000000  0.999970  0.999706  0.999998  0.999709   
C0003       0.999888  0.999970  1.000000  0.999864  0.999974  0.999866   
C0004       0.999999  0.999706  0.999864  1.000000  0.999725  0.999999   
C0005       0.999761  0.999998  0.999974  0.999725  1.000000  0.999730   
...              ...       ...       ...       ...       ...       ...   
C0196       0.999994  0.999657  0.999829  0.999997  0.999681  0.999998   
C0197       0.999728  1.000000  0.999965  0.999691  0.999998  0.999695   
C0198       0.999443  0.999940  0.999828  0.999389  0.999934  0.999398   
C0199       0.999879  0.999974  1.000000  0.999854  0.999979  0.999856   
C0