Data Science Internship at Zeotap-Task2

In [2]:
import os
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Loading the datasets into pandas data frames from the csv files.
# The folder defaults to the original download location and can be overridden
# with the LOOKALIKE_DATA_DIR environment variable, so the notebook can run on
# a machine where the files live somewhere else without editing the code.
DATA_DIR = Path(os.environ.get("LOOKALIKE_DATA_DIR", r"C:\Users\91996\Downloads"))

customers_df = pd.read_csv(DATA_DIR / "Customers.csv")
products_df = pd.read_csv(DATA_DIR / "Products.csv")
transactions_df = pd.read_csv(DATA_DIR / "Transactions - Transactions.csv")

# Data Preprocessing

# Convert the 'SignupDate' and 'TransactionDate' columns to datetime values
customers_df = customers_df.assign(SignupDate=pd.to_datetime(customers_df['SignupDate']))
transactions_df = transactions_df.assign(
    TransactionDate=pd.to_datetime(transactions_df['TransactionDate'])
)

# Reference date used for recency: the most recent transaction in the data
today = transactions_df['TransactionDate'].max()

# Attach each product's category and price to the transactions that mention it
product_info = products_df[['ProductID', 'Category', 'Price']]
transactions_df = pd.merge(transactions_df, product_info, on='ProductID')

# Build the per-customer 'Recency', 'Frequency' and 'Monetary' (RFM) features

# Recency: days between the reference date and the customer's latest transaction
recency_df = (
    transactions_df.groupby('CustomerID', as_index=False)['TransactionDate']
    .max()
    .assign(Recency=lambda latest: (today - latest['TransactionDate']).dt.days)
    .drop(columns=['TransactionDate'])
)

# Frequency: number of distinct transactions per customer
frequency_df = (
    transactions_df.groupby('CustomerID')['TransactionID']
    .nunique()
    .rename('Frequency')
    .reset_index()
)

# Monetary: total transaction value per customer
monetary_df = (
    transactions_df.groupby('CustomerID')['TotalValue']
    .sum()
    .rename('Monetary')
    .reset_index()
)

# Combine the three RFM tables, then join them onto each customer's region.
# These are inner joins, so only customers present in every table are kept.
rfm_df = pd.merge(recency_df, frequency_df, on='CustomerID')
rfm_df = pd.merge(rfm_df, monetary_df, on='CustomerID')
customer_region = customers_df[['CustomerID', 'Region']]
customer_features_df = pd.merge(customer_region, rfm_df, on='CustomerID')

# One-hot encode categorical 'Region' using OneHotEncoder.
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output`, and 1.4
# removed the old name, so try the new keyword first and fall back to the old
# one for older installations.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)
region_encoded = encoder.fit_transform(customer_features_df[['Region']])

# Reuse the feature frame's index so the column-wise concat lines rows up
# one-to-one instead of depending on both frames having a default index.
region_df = pd.DataFrame(
    region_encoded,
    columns=encoder.get_feature_names_out(['Region']),
    index=customer_features_df.index,
)
customer_features_df = pd.concat([customer_features_df, region_df], axis=1).drop(columns=['Region'])

# Standardizing the RFM features so that no single one dominates the similarity
rfm_columns = ['Recency', 'Frequency', 'Monetary']
scaler = StandardScaler()
rfm_features = customer_features_df[rfm_columns]
scaled_rfm = scaler.fit_transform(rfm_features)

# Wrap the scaled array in a frame and write it back over the raw RFM columns
scaled_rfm_df = pd.DataFrame(scaled_rfm, columns=rfm_columns)
customer_features_df[rfm_columns] = scaled_rfm_df

# Calculating the cosine similarity between every pair of customers,
# using every feature column except the identifier
feature_matrix = customer_features_df.drop(columns=['CustomerID'])
similarity_matrix = cosine_similarity(feature_matrix)

# Generate the Lookalike Model recommendations
lookalikes = {}

# Row i of similarity_matrix belongs to the customer at position i of this array
customer_ids = customer_features_df['CustomerID'].to_numpy()

# For customers C0001 to C0020, finding the top 3 most similar customers
for i in range(20):
    customer_id = f'C{i+1:04}'
    positions = np.flatnonzero(customer_ids == customer_id)
    if positions.size == 0:
        # A customer with no row in customer_features_df (for example, one
        # without transactions) cannot be scored; report and skip it instead
        # of failing with an IndexError.
        print(f"Skipping {customer_id}: no feature row available")
        continue
    customer_idx = positions[0]
    similarity_scores = similarity_matrix[customer_idx]

    # Rank all customers by similarity (descending) and remove the customer
    # itself by position. Assuming "self is always ranked first" is unsafe:
    # another customer can tie at the top score, which would leave the customer
    # in its own lookalike list and push out a genuine neighbour.
    ranked = np.argsort(similarity_scores)[::-1]
    similar_customers = ranked[ranked != customer_idx][:3]
    lookalikes[customer_id] = [(customer_ids[idx], similarity_scores[idx]) for idx in similar_customers]

# Creating csv file to store the result: one row per (customer, lookalike) pair
output_path = 'Manan_Gandhi_Lookalike.csv'
lookalike_records = [
    {'CustomerID': customer_id, 'LookalikeCustomerID': similar_customer, 'SimilarityScore': score}
    for customer_id, similar_customers in lookalikes.items()
    for similar_customer, score in similar_customers
]

# Explicit columns keep the CSV header intact even when there are no records
lookalike_df = pd.DataFrame(
    lookalike_records,
    columns=['CustomerID', 'LookalikeCustomerID', 'SimilarityScore'],
)
lookalike_df.to_csv(output_path, index=False)

# Report the file that was actually written
print(f"Lookalike model generated and saved to {output_path}")

# We have built a lookalike model that ranks customers by the cosine similarity of the selected features (standardized RFM values plus one-hot encoded region).


Lookalike model generated and saved to Lookalike.csv


