# Exploratory Data Analysis on Synthetic Customer Transaction Data

In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Seed NumPy's global random generator so the synthetic data generated below
# is the same on every run.
np.random.seed(42)


In [None]:

# Build a synthetic customer-transaction table with n rows.
n = 500
data = {
    # Ages 18-64 (randint's upper bound is exclusive). Stored as float from the
    # start because NaN is injected into this column below; writing NaN into
    # an integer column relies on silent upcasting, which recent pandas
    # versions deprecate.
    "Age": np.random.randint(18, 65, n).astype(float),
    "PurchaseAmount": np.round(np.random.normal(2500, 800, n), 2),
    "TransactionCount": np.random.randint(1, 20, n),
    "Region": np.random.choice(["North", "South", "East", "West"], n),
}
df = pd.DataFrame(data)

# Inject missing values to exercise the cleaning step. replace=False makes the
# sampled row labels distinct, so exactly 10 PurchaseAmount and 8 Age entries
# are blanked (sampling with replacement could pick the same row twice).
df.loc[np.random.choice(df.index, 10, replace=False), "PurchaseAmount"] = np.nan
df.loc[np.random.choice(df.index, 8, replace=False), "Age"] = np.nan

df.head()


In [None]:

# Print a summary of the frame: column dtypes and non-null counts per column.
df.info()


In [None]:

# Impute missing values: median for Age, mean for PurchaseAmount.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on df[col]: that form operates on a temporary Series
# (chained assignment), which warns on pandas 2.x and leaves df unchanged
# under Copy-on-Write.
df["Age"] = df["Age"].fillna(df["Age"].median())
df["PurchaseAmount"] = df["PurchaseAmount"].fillna(df["PurchaseAmount"].mean())

# Count the remaining nulls per column to verify the imputation.
df.isnull().sum()


In [None]:

# Show descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns.
df.describe()


In [None]:

# Histogram of PurchaseAmount (30 bins) with a kernel density estimate overlaid.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df["PurchaseAmount"], bins=30, kde=True, ax=ax)
ax.set_title("Distribution of Purchase Amount")
plt.show()


In [None]:

# Mean PurchaseAmount per Region, kept as a two-column frame for plotting.
region_avg = df.groupby("Region", as_index=False)["PurchaseAmount"].mean()

# Bar chart with one bar per region.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=region_avg, x="Region", y="PurchaseAmount", ax=ax)
ax.set_title("Average Purchase Amount by Region")
plt.show()



### Analysis Summary

This exploratory data analysis was conducted on a synthetic dataset of 500 customer
transaction records containing the numerical variables Age, PurchaseAmount, and
TransactionCount, along with a categorical Region variable. Missing Age values were
imputed with the column median, and missing PurchaseAmount values with the column mean.

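Descriptive statistics and visualizations revealed spending variability and
regional differences in average purchase behavior, highlighting the importance
of EDA before advanced modeling. Note that because the data are randomly
generated independently of Region, the regional differences seen here reflect
sampling noise rather than a real effect.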
