In [8]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load datasets

# Single place to point the notebook at the folder that holds the CSV exports;
# change this one line when running on another machine.
DATA_DIR = "C:/Users/Arun/OneDrive/Desktop/intern"

# Inline sample of customer records: ID, name, region and sign-up date (stored as text here).
# NOTE(review): only these four customers exist in `customers_data`, so anything joined
# against it is limited to them — swap in the full customer table if one is available.
customers_data = pd.DataFrame({
    "CustomerID": ["C0001", "C0002", "C0003", "C0004"],  # Include your data here
    "CustomerName": ["Lawrence Carroll", "Elizabeth Lutz", "Michael Rivera", "Kathleen Rodriguez"],
    "Region": ["South America", "Asia", "South America", "South America"],
    "SignupDate": ["7/10/2022", "2/13/2022", "3/7/2024", "10/9/2022"]
})

# Both CSV locations are derived from DATA_DIR so they cannot drift apart.
transactions_file = f"{DATA_DIR}/Transactions.csv"
products_file = f"{DATA_DIR}/Products.csv"

# Read both CSV files with pandas' default parsing options, transactions first.
transactions_data, products_data = (
    pd.read_csv(csv_path) for csv_path in (transactions_file, products_file)
)

In [None]:
# Preview data
# Print the first rows of each table under its own heading. The leading "\n" on the
# later headings leaves a blank line between consecutive previews.
preview_sections = (
    ("Customers Data:", customers_data),
    ("\nTransactions Data:", transactions_data),
    ("\nProducts Data:", products_data),
)
for heading, table in preview_sections:
    print(heading)
    print(table.head())

In [None]:
# Convert necessary columns to datetime format
# Each (frame, column) pair is parsed with pandas' default date inference and written
# back onto the same frame, replacing the original column.
for frame, date_column in (
    (customers_data, "SignupDate"),
    (transactions_data, "TransactionDate"),
):
    frame[date_column] = pd.to_datetime(frame[date_column])

In [None]:
# Task 1: Exploratory Data Analysis (EDA)

# 1. Basic statistics
# Summarise the transactions table with pandas' default describe() output and print it
# directly under a heading.
transaction_summary = transactions_data.describe()
print("\nBasic Statistics for Transactions:", transaction_summary, sep="\n")


In [None]:
# 2. Visualize transactions over time
# Bucket every transaction into its calendar month, then total TotalValue per bucket.
transaction_month = transactions_data.TransactionDate.dt.to_period("M")
monthly_totals = transactions_data.groupby(transaction_month)[["TotalValue"]].sum()

# Draw onto an explicitly created axes so the figure size set here is the one used.
fig, ax = plt.subplots(figsize=(10, 6))
monthly_totals.plot(kind='bar', legend=None, ax=ax)
ax.set_title("Monthly Transaction Value")
ax.set_xlabel("Month")
ax.set_ylabel("Total Value ($)")
ax.tick_params(axis="x", labelrotation=45)
fig.tight_layout()
plt.show()

In [None]:
# 3. Popular products
# Total units sold per ProductID, ranked from highest to lowest, keeping the first ten.
popular_products = (
    transactions_data
    .groupby("ProductID")["Quantity"]
    .sum()
    .sort_values(ascending=False)
    .head(10)
)
print("\nTop 10 Popular Products:", popular_products, sep="\n")

In [None]:
# Merge product names for better visualization
# Keep only the two ranking columns before joining. On a first run this selects
# everything reset_index() produced; on a re-run (when `popular_products` is already the
# merged frame) it drops the previously merged product columns and the extra "index"
# column, so the cell rebuilds the same frame instead of suffixed duplicates
# (ProductName_x / ProductName_y) that would break the plot below.
top_quantities = popular_products.reset_index()[["ProductID", "Quantity"]]
popular_products = pd.merge(top_quantities, products_data, on="ProductID")

fig, ax = plt.subplots(figsize=(10, 6))
# NOTE(review): recent seaborn releases warn about `palette` without `hue` — confirm
# against the installed version before changing this call.
# NOTE(review): if several ProductIDs share a ProductName, barplot combines them into a
# single bar — verify that the names in products_data are unique.
sns.barplot(data=popular_products, x="ProductName", y="Quantity", palette="viridis", ax=ax)
ax.set_title("Top 10 Popular Products")
ax.set_xlabel("Product")
ax.set_ylabel("Quantity Sold")
ax.tick_params(axis="x", labelrotation=45)
fig.tight_layout()
plt.show()

In [None]:
# 4. Customer regions analysis
# Count customers per region and show each region's share as a pie slice.
customer_regions = customers_data["Region"].value_counts()

fig, ax = plt.subplots(figsize=(8, 5))
customer_regions.plot.pie(
    ax=ax,
    autopct='%1.1f%%',
    startangle=90,
    colors=sns.color_palette("Set3"),
)
ax.set_title("Customer Distribution by Region")
ax.set_ylabel("")  # blank out the y-axis label pandas would otherwise show beside the pie
plt.show()


In [None]:
# 5. Transactions by region
# Attach customer attributes to every transaction via CustomerID. merge() defaults to an
# inner join, so transactions whose CustomerID is absent from customers_data are left out.
transactions_by_region = transactions_data.merge(customers_data, on="CustomerID")

# Total transaction value per region, largest first.
value_per_region = transactions_by_region.groupby("Region")["TotalValue"].sum()
regional_transactions = value_per_region.sort_values(ascending=False)
print("\nTransactions by Region:", regional_transactions, sep="\n")

In [None]:
# Bar chart of the per-region totals computed in `regional_transactions`.
fig, ax = plt.subplots(figsize=(8, 5))
regional_transactions.plot(kind="bar", color="skyblue", ax=ax)
ax.set_title("Total Transaction Value by Region")
ax.set_xlabel("Region")
ax.set_ylabel("Total Value ($)")
ax.tick_params(axis="x", labelrotation=0)  # keep region names horizontal
fig.tight_layout()
plt.show()

In [None]:
# Insights
# NOTE(review): these statements are hand-written text, not values computed from the
# frames in this notebook — re-check them whenever the underlying data changes.
eda_insights = [
    "The dataset shows that the majority of transactions are concentrated in North America and Asia.",
    "The top 10 products account for a significant share of total sales, with Product X leading.",
    "Customer acquisition has been growing steadily since 2022, especially in Europe.",
    "Seasonal trends in transactions are evident, with peaks during certain months.",
    "Regions like South America contribute significantly to total revenue despite fewer customers.",
]

# Print each insight on its own line, numbered from 1.
for number, text in enumerate(eda_insights, start=1):
    print("Insight {}: {}".format(number, text))
