In [34]:
import pandas as pd
import numpy as np
from faker import Faker
import random
import datetime

# Set a seed for reproducibility
np.random.seed(42)

# Function to generate random dates
def generate_random_dates(start_date, end_date, num_dates):
    date_list = [start_date + datetime.timedelta(days=random.randint(0, (end_date - start_date).days)) for _ in range(num_dates)]
    return date_list

# Generate fake data using Faker library
fake = Faker()

# Number of orders to generate
num_orders = 100

# Generate random data for each column
order_ids = list(range(1, num_orders + 1))
customer_ids = [fake.uuid4() for _ in range(num_orders)]
order_dates = generate_random_dates(datetime.date(2023, 1, 1), datetime.date(2023, 12, 31), num_orders)
product_ids = list(range(1, 101))  # Assuming there are 100 unique products
product_names = [fake.word() for _ in range(100)]  # Assuming there are 100 unique product names
product_prices = [round(random.uniform(5, 100), 2) for _ in range(100)]  # Random prices between 5 and 100
quantities = [random.randint(1, 10) for _ in range(num_orders)]

# Create a DataFrame
data = {
    'order_id': order_ids,
    'customer_id': customer_ids,
    'order_date': order_dates,
    'product_id': np.random.choice(product_ids, num_orders),
    'product_name': np.random.choice(product_names, num_orders),
    'product_price': np.random.choice(product_prices, num_orders),
    'quantity': quantities
}

df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
df.to_csv('orders.csv', index=False)


In [35]:
import pandas as pd

# Read the CSV file
df = pd.read_csv('orders.csv')

# Convert 'order_date' column to datetime type
df['order_date'] = pd.to_datetime(df['order_date'])

# Task 1: Compute the total revenue generated by the online store for each month
df['month'] = df['order_date'].dt.to_period('M')
monthly_revenue = df.groupby('month')['product_price'].sum()

# Task 2: Compute the total revenue generated by each product
product_revenue = df.groupby('product_name')['product_price'].sum()

# Task 3: Compute the total revenue generated by each customer
customer_revenue = df.groupby('customer_id')['product_price'].sum()

# Task 4: Identify the top 10 customers by revenue generated
top_customers = customer_revenue.nlargest(10)

# Display the results
print("Task 1: Total revenue generated by the online store for each month")
print(monthly_revenue)
print("\nTask 2: Total revenue generated by each product")
print(product_revenue)
print("\nTask 3: Total revenue generated by each customer")
print(customer_revenue)
print("\nTask 4: Top 10 customers by revenue generated")
print(top_customers)


Task 1: Total revenue generated by the online store for each month
month
2023-01    432.26
2023-02    331.03
2023-03    334.32
2023-04    502.05
2023-05    372.47
2023-06    387.91
2023-07    634.76
2023-08    323.28
2023-09    519.78
2023-10    359.37
2023-11    734.79
2023-12    283.45
Freq: M, Name: product_price, dtype: float64

Task 2: Total revenue generated by each product
product_name
against           28.86
along            256.49
any               81.91
argue             92.22
back              21.30
bank              98.77
begin             67.36
behind           231.20
candidate         17.00
century           16.46
choose            79.47
common           117.12
concern           35.70
cover             60.15
different         98.65
dream             71.26
easy              21.31
election          45.93
especially         7.84
every             37.56
face             125.40
field             35.74
follow            62.42
full              12.79
have              66.66
her 

In [40]:
%run test_script.py

.....
----------------------------------------------------------------------
Ran 5 tests in 0.026s

OK
