# Part 3: Data Analysis Task

## 1. Loading the Dataset
We start by loading the dataset and examining the first few rows.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the raw CSV export into a DataFrame
csv_path = 'dataset.csv'
df = pd.read_csv(csv_path)

# Preview the top of the table (five rows)
df.head(n=5)

## 2. Validation & Cleaning
- Handling missing values (e.g., filling Revenue with 0 for non-purchases).
- Converting `Timestamp` to datetime objects.
- Handling invalid timestamps: values that cannot be parsed are coerced to `NaT`, and those rows are dropped.
- Renaming columns for consistency.

In [None]:
# Rename columns to standard names
df.rename(columns={'User ID': 'User_ID', 'Product Category': 'Product_Category'}, inplace=True)

# Validate: a purchase is expected to carry a revenue figure
is_purchase = df['Action'] == 'purchased'
revenue_missing = df['Revenue'].isna()
if (is_purchase & revenue_missing).any():
    print('Warning: Purchases with missing revenue found!')

# Handle missing Revenue: fill with 0 for non-purchases only.
# Purchases with missing revenue are left as NaN so the rows flagged by the
# warning above stay identifiable instead of turning into 0-revenue sales.
df.loc[~is_purchase & revenue_missing, 'Revenue'] = 0

# Convert Timestamp to datetime, parsing day first (DD-MM-YYYY);
# values that cannot be parsed become NaT instead of raising
df['Timestamp'] = pd.to_datetime(df['Timestamp'], dayfirst=True, errors='coerce')

# Remove rows with invalid timestamps and report how many were lost.
# .copy() gives an independent frame, so columns added later are not
# assigned onto a filtered view of the original data.
rows_before = len(df)
df = df.dropna(subset=['Timestamp']).copy()
rows_dropped = rows_before - len(df)
if rows_dropped:
    print(f'Dropped {rows_dropped} row(s) with invalid timestamps.')

# Check cleaned info
df.info()

## 3. Analysis

### A. Most Popular Product Categories
We determine popularity by the total number of interactions (views, adds, purchases).

In [None]:
# Tally how many rows (interactions) each product category has, most frequent first
popular_categories = df['Product_Category'].value_counts()
print("Popular Categories (by interaction count):", popular_categories, sep="\n")

# Bar chart of the same counts
category_labels = popular_categories.index
interaction_counts = popular_categories.values

plt.figure(figsize=(8, 5))
# NOTE(review): newer seaborn releases deprecate `palette` without `hue` - confirm
# the installed version before changing this call.
sns.barplot(x=category_labels, y=interaction_counts, palette='viridis')
plt.ylabel('Interactions')
plt.title('Product Category Popularity')
plt.show()

### B. Conversion Rate
Conversion Rate = (Unique Users who Purchased / Total Unique Users) * 100

In [None]:
# Distinct users seen anywhere in the cleaned data
total_users = df['User_ID'].nunique()

# Distinct users with at least one purchase row
# Note: Action is 'purchased' in this dataset
purchasers = df.loc[df['Action'] == 'purchased', 'User_ID'].nunique()

# Percentage of users who purchased. When the cleaned frame has no users
# (e.g. every row was dropped), report 0% instead of raising ZeroDivisionError.
conversion_rate = (purchasers / total_users) * 100 if total_users else 0.0

print(f"Total Users: {total_users}")
print(f"Purchasers: {purchasers}")
print(f"Conversion Rate: {conversion_rate:.2f}%")

### C. User Behavior Trends Over Time
We count the number of recorded actions on each calendar day and plot the daily totals.

In [None]:
# Derive a calendar-day column from each event's timestamp
df['Date'] = df['Timestamp'].dt.date

# Per-day tally of Action entries (count() skips null values)
daily_trends = df['Action'].groupby(df['Date']).count()

# Line chart of the daily totals
days = daily_trends.index
actions_per_day = daily_trends.values

plt.figure(figsize=(10, 6))
sns.lineplot(x=days, y=actions_per_day, marker='o')
plt.xlabel('Date')
plt.ylabel('Number of Actions')
plt.title('Daily User Activity Trends')
plt.xticks(rotation=45)  # slant the date labels
plt.tight_layout()
plt.show()

## 4. Recommendations
1. **Improve Checkout Flow**: If conversion is low, investigate drop-off after 'added_to_cart'.
2. **Category Focus**: Promote high-engagement categories on the homepage.
3. **Re-engagement**: Target users who viewed but didn't purchase with emails.