In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# **Step 1: Data Preparation**

In [3]:
# Load the datasets
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

In [4]:
# Convert date columns to datetime format
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])

In [5]:
# Merge datasets to create a comprehensive view
merged_data = transactions.merge(customers, on='CustomerID').merge(products, on='ProductID')

In [6]:
# Display the first few rows of the merged dataset
print(merged_data.head())

  TransactionID CustomerID ProductID     TransactionDate  Quantity  \
0        T00001      C0199      P067 2024-08-25 12:38:23         1   
1        T00112      C0146      P067 2024-05-27 22:23:54         1   
2        T00166      C0127      P067 2024-04-25 07:38:55         1   
3        T00272      C0087      P067 2024-03-26 22:55:37         2   
4        T00363      C0070      P067 2024-03-21 15:10:10         3   

   TotalValue  Price_x     CustomerName         Region SignupDate  \
0      300.68   300.68   Andrea Jenkins         Europe 2022-12-03   
1      300.68   300.68  Brittany Harvey           Asia 2024-09-04   
2      300.68   300.68  Kathryn Stevens         Europe 2024-04-04   
3      601.36   300.68  Travis Campbell  South America 2024-04-11   
4      902.04   300.68    Timothy Perez         Europe 2022-03-15   

                       ProductName     Category  Price_y  
0  ComfortLiving Bluetooth Speaker  Electronics   300.68  
1  ComfortLiving Bluetooth Speaker  Electronic


# **Step 2: Feature Engineering**
## We will create features that represent customer profiles and their transaction history. This can include:
### 1. Total number of transactions
### 2. Total spending
### 3. Average transaction value
### 4. Most purchased product category
### 5. Recency of last purchase
python

In [7]:
# Feature engineering
customer_features = merged_data.groupby('CustomerID').agg({
    'TransactionID': 'count',  # Total transactions
    'TotalValue': 'sum',       # Total spending
    'TransactionDate': 'max'   # Last purchase date
}).rename(columns={'TransactionID': 'TotalTransactions', 'TotalValue': 'TotalSpending'})

In [8]:
# Convert TransactionDate to datetime (if not already done)
customer_features['TransactionDate'] = pd.to_datetime(customer_features['TransactionDate'])

In [9]:
# Calculate average transaction value
customer_features['AverageTransactionValue'] = customer_features['TotalSpending'] / customer_features['TotalTransactions']

In [10]:
# Calculate recency
current_date = pd.to_datetime('today')
customer_features['Recency'] = (current_date - customer_features['TransactionDate']).dt.days

In [11]:
# Merge with customer demographics
customer_features = customer_features.merge(customers[['CustomerID', 'Region']], on='CustomerID')

In [12]:
# Display the features
print(customer_features.head())

  CustomerID  TotalTransactions  TotalSpending     TransactionDate  \
0      C0001                  5        3354.52 2024-11-02 17:04:16   
1      C0002                  4        1862.74 2024-12-03 01:41:41   
2      C0003                  4        2725.38 2024-08-24 18:54:04   
3      C0004                  8        5354.88 2024-12-23 14:13:52   
4      C0005                  3        2034.24 2024-11-04 00:30:22   

   AverageTransactionValue  Recency         Region  
0                  670.904       86  South America  
1                  465.685       55           Asia  
2                  681.345      156  South America  
3                  669.360       35  South America  
4                  678.080       84           Asia  


# **Step 3: Similarity Calculation**

In [13]:
# Select features for similarity calculation
features = customer_features[['TotalTransactions', 'TotalSpending', 'AverageTransactionValue', 'Recency']]
features_scaled = StandardScaler().fit_transform(features)

In [14]:
# Calculate cosine similarity
similarity_matrix = cosine_similarity(features_scaled)
similarity_df = pd.DataFrame(similarity_matrix, index=customer_features.index, columns=customer_features.index)

# **Step 4: Generate Recommendations & Output Results**

In [15]:
# Generate recommendations for the first 20 customers
lookalike_recommendations = []

for customer_id in customer_features.index[:20]:
    # Get similarity scores for the current customer
    similar_scores = similarity_df.loc[customer_id].sort_values(ascending=False)
    
    # Get the top 3 similar customers (excluding the customer itself)
    top_similar = similar_scores[similar_scores.index != customer_id].head(3)
    
    # Store the results in a structured way
    for similar_cust_id, score in zip(top_similar.index, top_similar.values):
        lookalike_recommendations.append({
            'CustomerID': customer_id,
            'LookalikeID': similar_cust_id,
            'SimilarityScore': score
        })

# Convert to DataFrame for saving
lookalike_df = pd.DataFrame(lookalike_recommendations)

# Display the recommendations
print(lookalike_df.head())

# Save the lookalike recommendations to a CSV file
lookalike_df.to_csv('Piyush_Kinage_Lookalike.csv', index=False)

   CustomerID  LookalikeID  SimilarityScore
0           0           55         0.994772
1           0          188         0.988805
2           0          189         0.986770
3           1           30         0.998011
4           1           28         0.989579
