In [1]:
# importing libraries
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler


In [3]:
# load datasets
customers = pd.read_csv(r"C:\Users\brnan\Downloads\Customers.csv")
products = pd.read_csv(r"C:\Users\brnan\Downloads\Products.csv")
transactions = pd.read_csv(r"C:\Users\brnan\Downloads\Transactions.csv")

In [5]:
# Display shapes of datasets
print(f"Customers: {customers.shape}")
print(f"Products: {products.shape}")
print(f"Transactions: {transactions.shape}")

Customers: (200, 4)
Products: (100, 4)
Transactions: (1000, 7)


In [4]:
# Merge datasets
merged_data = transactions.merge(customers, on='CustomerID').merge(products, on='ProductID')


In [6]:
print(f"Merged Data: {merged_data.shape}")

Merged Data: (1000, 13)


* customer feature preparation
  - Aggregate features for each customer:
Combine transaction and product data to create a customer feature set

In [32]:
print(merged_data.head())


  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue     CustomerName         Region  SignupDate  \
0      300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68  Brittany Harvey           Asia  2024-09-04   
2      300.68  Kathryn Stevens         Europe  2024-04-04   
3      601.36  Travis Campbell  South America  2024-04-11   
4      902.04    Timothy Perez         Europe  2022-03-15   

                       ProductName     Category   Price  
0  ComfortLiving Bluetooth Speaker  Electronics  300.68  
1  ComfortLiving Bluetooth Speaker  Electronics  300.68  
2  ComfortLiving Bluetooth Speak

In [18]:
#for merging 'price x' and 'price y'
merged_data['Price'] = merged_data['Price_x'].combine_first(merged_data['Price_y'])
merged_data= merged_data.drop(['Price_x', 'Price_y'], axis=1)

In [19]:
# Aggregate transaction features for customers
customer_features = merged_data.groupby('CustomerID').agg({
    'TotalValue': 'mean',          # Average transaction value
    'Quantity': 'sum',             # Total products purchased
    'Price': 'mean',               # Average product price purchased
    'Category': lambda x: ','.join(x.unique())  # Concatenate unique product categories
}).reset_index()

In [20]:
# Encode categorical data (e.g., categories)
customer_features = pd.concat([customer_features, pd.get_dummies(customer_features['Category'])], axis=1)
customer_features.drop(columns=['Category'], inplace=True)

In [21]:
# Standardize numerical features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features.iloc[:, 1:])

In [22]:
# Display final feature set
print(customer_features.head())

  CustomerID  TotalValue  Quantity       Price  Books  Books,Clothing  \
0      C0001     670.904        12  278.334000  False           False   
1      C0002     465.685        10  208.920000  False           False   
2      C0003     681.345        14  195.707500  False           False   
3      C0004     669.360        23  240.636250  False           False   
4      C0005     678.080         7  291.603333  False           False   

   Books,Clothing,Electronics  Books,Clothing,Home Decor,Electronics  \
0                       False                                  False   
1                       False                                  False   
2                       False                                  False   
3                       False                                  False   
4                       False                                  False   

   Books,Electronics  Books,Electronics,Clothing  ...  \
0              False                       False  ...   
1             

# computing similarity scores

In [23]:
# Compute similarity matrix
similarity_matrix = cosine_similarity(scaled_features)

In [24]:
# Convert to DataFrame for readability
similarity_df = pd.DataFrame(similarity_matrix, index=customer_features['CustomerID'], columns=customer_features['CustomerID'])

In [25]:
# Display the similarity matrix
print(similarity_df.head())

CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000 -0.011792 -0.013804 -0.025803 -0.016797 -0.025678   
C0002      -0.011792  1.000000 -0.000739 -0.021304 -0.012656 -0.063572   
C0003      -0.013804 -0.000739  1.000000 -0.003071 -0.023643 -0.040602   
C0004      -0.025803 -0.021304 -0.003071  1.000000 -0.070953 -0.058445   
C0005      -0.016797 -0.012656 -0.023643 -0.070953  1.000000 -0.023371   

CustomerID     C0007     C0008     C0009     C0010  ...     C0191     C0192  \
CustomerID                                          ...                       
C0001      -0.016398 -0.021408 -0.007867 -0.021982  ... -0.023458 -0.018001   
C0002      -0.036123  0.000178  0.008669  0.025945  ... -0.013936 -0.003227   
C0003      -0.037141 -0.001582 -0.016580  0.013886  ... -0.024996 -0.023518   
C0004      -0.073905  0.032733 -0.060433 -0.010791  ... -0.052148 -0.067277   
C0005  

# recommend similar customers

In [26]:
# Function to recommend similar customers
def get_similar_customers(customer_id, top_n=3):
    # Get similarity scores for the given customer
    similar_scores = similarity_df[customer_id]
    # Sort by similarity score in descending order and exclude the customer itself
    top_similar = similar_scores.sort_values(ascending=False).iloc[1:top_n+1]
    # Return the top N similar customers with their scores
    return top_similar.index.tolist(), top_similar.values.tolist()

In [27]:
# Test the function
test_customer = customer_features['CustomerID'].iloc[0]
similar_customers, scores = get_similar_customers(test_customer)
print(f"Top similar customers for {test_customer}: {list(zip(similar_customers, scores))}")

Top similar customers for C0001: [('C0035', 0.9961697279374134), ('C0065', 0.9563303320111434), ('C0058', -0.0073516258757234665)]


# Recommendations for the First 20 Customers

In [28]:
lookalike_recommendations = []

for customer_id in customer_features['CustomerID'][:20]:
    similar_customers, scores = get_similar_customers(customer_id)
    # Append the recommendations to the list
    lookalike_recommendations.append({
        "CustomerID": customer_id,
        "Recommendations": list(zip(similar_customers, scores))
    })

In [29]:
recommendations_df = pd.DataFrame(lookalike_recommendations)

In [33]:
# Save to CSV
recommendations_df.to_csv('lookalike.csv', index=False)
print("Lookalike recommendations saved to CSV: Lookalike.csv")

Lookalike recommendations saved to CSV: Lookalike.csv
