In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import librosa
import librosa.display
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, log_loss
import matplotlib.pyplot as plt
import seaborn as sns
import os

## 1. Data Loading and Merging

We'll start by loading our two datasets:
1. `customer_social_profiles.csv`: Contains social media engagement data
2. `customer_transactions.csv`: Contains transaction history

We'll need to:
1. Load both datasets
2. Clean and preprocess the data
3. Merge the datasets based on customer ID
4. Perform feature engineering

In [3]:
# Load the two raw tables that the rest of the notebook merges on customer ID
social_profiles = pd.read_csv('../data/tables/customer_social_profiles.csv')
transactions = pd.read_csv('../data/tables/customer_transactions.csv')

# Summarise each table. DataFrame.info() writes its report to stdout itself and
# returns None, so it is called directly instead of being wrapped in print()
# (which would append a stray "None" line after each summary).
print("Customer Social Profiles Dataset:")
social_profiles.info()
print("\nCustomer Transactions Dataset:")
transactions.info()

Customer Social Profiles Dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155 entries, 0 to 154
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   customer_id_new          155 non-null    object 
 1   social_media_platform    155 non-null    object 
 2   engagement_score         155 non-null    int64  
 3   purchase_interest_score  155 non-null    float64
 4   review_sentiment         155 non-null    object 
dtypes: float64(1), int64(1), object(3)
memory usage: 6.2+ KB
None

Customer Transactions Dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   customer_id_legacy  150 non-null    int64  
 1   transaction_id      150 non-null    int64  
 2   purchase_amount     150 non-null    int64  
 3   purchase_date       150 non-nul

In [None]:
# Clean and preprocess data
# Bring both ID columns to a common integer format: the first character of the
# social-profile ID (the 'A' prefix) is dropped before the integer cast.
social_profiles['customer_id'] = social_profiles['customer_id_new'].str[1:].astype(int)
transactions['customer_id'] = transactions['customer_id_legacy'].astype(int)

# Merge datasets; the inner join keeps only customers present in both tables
merged_data = pd.merge(
    social_profiles,
    transactions,
    on='customer_id',
    how='inner'
)

# Feature engineering
# Convert date to datetime
merged_data['purchase_date'] = pd.to_datetime(merged_data['purchase_date'])

# Create additional features
# A zero purchase_amount makes the ratio +/-inf. fillna() does not treat inf as
# missing, so convert it to NaN here and let the imputation below handle it.
merged_data['engagement_purchase_ratio'] = (
    merged_data['engagement_score'] / merged_data['purchase_amount']
).replace([np.inf, -np.inf], np.nan)
# Sentiment labels outside this mapping become NaN and are imputed below
merged_data['sentiment_numeric'] = merged_data['review_sentiment'].map({'Positive': 1, 'Neutral': 0, 'Negative': -1})

# Display merged dataset info (pre-imputation, so the non-null counts show what
# is about to be filled). info() prints its own report and returns None, so it
# is not wrapped in print().
print("Merged Dataset Information:")
merged_data.info()

# Fill missing values
# Assign the filled column back instead of calling fillna(inplace=True) on
# merged_data[col]: the in-place form operates on a temporary object, which
# pandas warns about and which stops updating the frame under Copy-on-Write.
for col in merged_data.columns:
    column = merged_data[col]
    if not column.isna().any():
        continue  # nothing to impute

    if pd.api.types.is_numeric_dtype(column):
        # Use median for robustness (handles skewed distributions better)
        fill_value = column.median()
    else:
        # For categorical columns, fill with mode (most common value)
        modes = column.mode()
        if modes.empty:
            continue  # column is entirely missing; there is no mode to use
        fill_value = modes.iloc[0]

    merged_data[col] = column.fillna(fill_value)

# Save merged dataset
merged_data.to_csv('../data/tables/merged_customer_data.csv', index=False)
print("\nMerged dataset saved to tables folder")


In [None]:
# Load the merged dataset written by the preprocessing cell
data = pd.read_csv('../data/tables/merged_customer_data.csv')

# Prepare features and target
features = ['engagement_score', 'purchase_interest_score', 'sentiment_numeric', 
            'purchase_amount', 'customer_rating', 'engagement_purchase_ratio', 
            'social_media_platform']
X = data[features]

# One-hot encode categorical variables (X.columns is reused later to align
# single-customer rows with the training layout)
X = pd.get_dummies(X, columns=['social_media_platform'])

# Encode the target variable as integer class labels
le = LabelEncoder()
y = le.fit_transform(data['product_category'])

# Split the data (fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
y_pred_proba = model.predict_proba(X_test)
# predict_proba has one column per class in model.classes_. Pass those labels
# explicitly: without them log_loss infers the classes from y_test alone and
# raises when the (small) test split happens to be missing one of them.
loss = log_loss(y_test, y_pred_proba, labels=model.classes_)

# Print evaluation metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"F1-Score: {f1:.2f}")
print(f"Log Loss: {loss:.2f}")

# Function to predict product for a new customer
def predict_product(customer_data):
    """Return the product category the trained model predicts for one customer.

    Args:
        customer_data: Mapping of feature name to value for a single customer;
            it must contain a 'social_media_platform' entry, which is one-hot
            encoded here.

    Returns:
        The predicted label, decoded back to its original value with the
        module-level label encoder ``le``.

    Relies on the module-level ``X`` (training feature frame), ``model`` and
    ``le`` having been created by the training cell.
    """
    # Build a one-row frame, expand the platform into dummy columns, then force
    # the exact training column layout; dummies absent from this row become 0.
    encoded_row = pd.get_dummies(
        pd.DataFrame([customer_data]),
        columns=['social_media_platform'],
    ).reindex(columns=X.columns, fill_value=0)

    predicted_labels = model.predict(encoded_row)
    return le.inverse_transform(predicted_labels)[0]

# Function to get aggregated customer data based on identified ID
def get_customer_data(customer_id, current_platform=None):
    """Aggregate one customer's rows of ``data`` into a single feature record.

    Args:
        customer_id: Value matched against the 'customer_id' column of the
            module-level ``data`` frame.
        current_platform: Optional platform name. When given and the customer
            has rows for it, only those rows are aggregated. When given but the
            customer has no rows for it, all of the customer's rows are
            aggregated and the returned platform is still ``current_platform``.

    Returns:
        dict with the mean of each numeric feature plus 'social_media_platform'
        (``current_platform`` if provided, otherwise the customer's most common
        platform).

    Raises:
        KeyError: If ``data`` has no rows for ``customer_id``.
    """
    # Filter data for the identified customer
    customer_rows = data[data['customer_id'] == customer_id]
    if customer_rows.empty:
        # Fail with a clear message instead of a cryptic "KeyError: 0" from
        # mode()[0] below, or a silent all-NaN record when a platform is given.
        raise KeyError(f"No records found for customer_id {customer_id}")

    if current_platform and current_platform in customer_rows['social_media_platform'].values:
        # Filter by current platform if provided and available
        customer_rows = customer_rows[customer_rows['social_media_platform'] == current_platform]

    # Aggregate numerical features (mean values)
    numeric_features = (
        'engagement_score',
        'purchase_interest_score',
        'sentiment_numeric',
        'purchase_amount',
        'customer_rating',
        'engagement_purchase_ratio',
    )
    aggregated_data = {name: customer_rows[name].mean() for name in numeric_features}

    # Prefer the caller-supplied platform; otherwise use the most frequent one
    if current_platform:
        aggregated_data['social_media_platform'] = current_platform
    else:
        aggregated_data['social_media_platform'] = customer_rows['social_media_platform'].mode()[0]
    return aggregated_data

# Simulate facial recognition and run recommendation
if __name__ == "__main__":
    # End-to-end demo: a hard-coded ID stands in for the output of a facial
    # recognition model (replace with the actual model output).
    identified_customer_id = 190  # integer form of a profile ID such as 'A190'

    # Aggregate this customer's rows into one feature record and show it
    customer_data = get_customer_data(identified_customer_id)
    print(f"Customer Data: {customer_data}")
    
    # Run the record through the trained classifier and report the result
    recommended_product = predict_product(customer_data)
    print(f"Recommended Product: {recommended_product}")