# Comprehensive Data Analysis for Fictitious Online Bookstore

DATASETS

The cells below generate synthetic datasets and save each one as a CSV file.

In [None]:
!pip install faker

Collecting faker
  Downloading Faker-25.8.0-py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: faker
Successfully installed faker-25.8.0


## Sentiment Analysis on Customer Reviews

In [None]:
import pandas as pd
import numpy as np
import random
from faker import Faker

fake = Faker()

# Seed both random sources used in this cell so that Restart & Run All
# reproduces the same dataset. Faker keeps its own generator, separate from
# the `random` module, so each one is seeded explicitly.
REVIEWS_SEED = 42
random.seed(REVIEWS_SEED)
Faker.seed(REVIEWS_SEED)

# Parameters
num_records = 10000        # number of reviews to generate
num_unique_books = 10000   # size of the book catalogue the reviews are drawn from
first_book_id = 1001       # catalogue IDs run from 1001 to 1001 + num_unique_books - 1
genre_choices = ["Mystery", "Self-help", "Science Fiction", "Romance", "Cooking"]
ratings = [1, 2, 3, 4, 5]

# Book catalogue: position i in each list describes the same book, whose ID is
# first_book_id + i. Titles are random sentences and are not guaranteed unique.
book_ids = list(range(first_book_id, first_book_id + num_unique_books))
book_titles = [fake.sentence(nb_words=4) for _ in range(num_unique_books)]
authors = [fake.name() for _ in range(num_unique_books)]
genres = [random.choice(genre_choices) for _ in range(num_unique_books)]

# Pick one catalogue entry per review. Taking ID, title, author and genre from
# the same entry keeps them consistent: a given BookID always has the same
# title, author and genre, and a book may be reviewed more than once.
reviewed_books = [random.randrange(num_unique_books) for _ in range(num_records)]

# Generate data
data = {
    "ReviewID": range(1, num_records + 1),
    "BookID": [book_ids[i] for i in reviewed_books],
    "BookTitle": [book_titles[i] for i in reviewed_books],
    "Author": [authors[i] for i in reviewed_books],
    "Genre": [genres[i] for i in reviewed_books],
    "ReviewText": [fake.text(max_nb_chars=200) for _ in range(num_records)],
    "Rating": [random.choice(ratings) for _ in range(num_records)],
}

reviews_df = pd.DataFrame(data)

# Sanity check: every BookID maps to exactly one title/author/genre combination.
assert (
    reviews_df.groupby("BookID")[["BookTitle", "Author", "Genre"]].nunique().le(1).all().all()
), "a BookID is associated with more than one title/author/genre"

reviews_df.to_csv("customer_reviews.csv", index=False)

print(f"Dataset with {num_records:,} records created and saved as 'customer_reviews.csv'.")


Dataset with 10,000 records created and saved as 'customer_reviews.csv'.


## Customer Churn Analysis

In [None]:
import pandas as pd
import numpy as np
import random
from faker import Faker

fake = Faker()

# Parameters
CHURN_SEED = 43
num_customers = 10000

# Seed both random sources used in this cell so the dataset is identical on
# every Restart & Run All. A dedicated NumPy generator avoids depending on
# (or disturbing) the global np.random state.
Faker.seed(CHURN_SEED)
churn_rng = np.random.default_rng(CHURN_SEED)

# Generate data
# Note: the upper bound of Generator.integers is exclusive, so the ranges are
# Age 18-69, PurchaseFrequency 1-19 and TimeSinceLastPurchase 1-364.
data = {
    "CustomerID": range(1, num_customers + 1),
    "Age": churn_rng.integers(18, 70, size=num_customers),
    "Gender": churn_rng.choice(["M", "F"], size=num_customers),
    "Location": [f"{fake.city()}, {fake.state()}" for _ in range(num_customers)],
    "PurchaseFrequency": churn_rng.integers(1, 20, size=num_customers),
    "AvgOrderValue": np.round(churn_rng.uniform(10, 200, size=num_customers), 2),
    "TimeSinceLastPurchase": churn_rng.integers(1, 365, size=num_customers),
    # Churn is drawn independently of every other column (pure noise label).
    "Churn": churn_rng.choice([0, 1], size=num_customers),
}

churn_df = pd.DataFrame(data)
churn_df.to_csv("customer_churn.csv", index=False)

print(f"Dataset with {num_customers:,} records created and saved as 'customer_churn.csv'.")


Dataset with 10,000 records created and saved as 'customer_churn.csv'.


## Employee Retention Analysis

In [None]:
# Parameters
EMPLOYEE_SEED = 44
num_employees = 10000
departments = ["Sales", "IT", "HR", "Marketing", "Finance"]
yes_no = ["Yes", "No"]

# A dedicated, seeded generator makes this dataset reproducible on every
# Restart & Run All and keeps all sampling in one (vectorised) random source.
employee_rng = np.random.default_rng(EMPLOYEE_SEED)

# Generate data
# Note: the upper bound of Generator.integers is exclusive, so the ranges are
# Age 18-64, Tenure 1-9 and TrainingHours 0-99.
data = {
    "EmployeeID": range(1, num_employees + 1),
    "Age": employee_rng.integers(18, 65, size=num_employees),
    "Gender": employee_rng.choice(["M", "F"], size=num_employees),
    "Department": employee_rng.choice(departments, size=num_employees),
    "Tenure": employee_rng.integers(1, 10, size=num_employees),
    # Scores are uniform on [1, 5) rounded to one decimal place.
    "PerformanceScore": np.round(employee_rng.uniform(1, 5, size=num_employees), 1),
    "JobSatisfaction": np.round(employee_rng.uniform(1, 5, size=num_employees), 1),
    "EngagementScore": np.round(employee_rng.uniform(1, 5, size=num_employees), 1),
    "TrainingHours": employee_rng.integers(0, 100, size=num_employees),
    "PromotionLastYear": employee_rng.choice(yes_no, size=num_employees),
    # Turnover is drawn independently of every other column (pure noise label).
    "Turnover": employee_rng.choice([0, 1], size=num_employees),
}

employees_df = pd.DataFrame(data)
employees_df.to_csv("employee_retention.csv", index=False)


## Sales Forecasting

In [None]:
from datetime import datetime, timedelta

# Parameters
SALES_SEED = 45
start_date = datetime(2020, 1, 1)
end_date = datetime(2024, 6, 20)
date_range = pd.date_range(start_date, end_date)  # one entry per calendar day, both ends included
num_days = len(date_range)
num_books = 5
promotion_flags = [True, False]
seasons = ["Winter", "Spring", "Summer", "Fall"]

# A dedicated, seeded generator makes this dataset reproducible on every
# Restart & Run All.
sales_rng = np.random.default_rng(SALES_SEED)

# The books being tracked. IDs are sampled without replacement from the
# 1001-11000 range so that no two tracked books can share an ID.
sales_book_ids = sales_rng.choice(np.arange(1001, 11001), size=num_books, replace=False)
sales_book_titles = book_titles[:num_books]

# Generate data
# Layout: one row per (date, book) pair. Each date is repeated num_books times
# in a row (d0, d0, ..., d1, d1, ...) while the book columns cycle through the
# tracked books (b0, b1, ..., b0, b1, ...), so every book has exactly one row
# for every day and rows are in chronological order.
dates = date_range.repeat(num_books)
num_rows = num_days * num_books

data = {
    "Date": dates,
    "BookID": np.tile(sales_book_ids, num_days),
    "BookTitle": np.tile(sales_book_titles, num_days),
    "DailySales": sales_rng.integers(1, 50, size=num_rows),  # 1-49; upper bound exclusive
    "Promotions": sales_rng.choice(promotion_flags, size=num_rows),
    # Season is derived from the row's own date: month % 12 // 3 maps
    # Dec/Jan/Feb -> 0 (Winter), Mar-May -> 1, Jun-Aug -> 2, Sep-Nov -> 3.
    "Season": [seasons[month % 12 // 3] for month in dates.month],
}

sales_df = pd.DataFrame(data)

# Sanity check: each (date, book) pair appears exactly once.
assert not sales_df.duplicated(["Date", "BookID"]).any(), "duplicate (Date, BookID) rows"

sales_df.to_csv("sales_forecasting.csv", index=False)