1. Import Libraries

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Optional: to suppress warnings for cleaner output
# NOTE: called without a category or module filter, this silences every warning
# raised after this point (deprecation notices included). Comment it out while
# debugging or after upgrading libraries.
import warnings
warnings.filterwarnings("ignore")


2. Load Dataset

In [None]:
# Path of the CSV to analyse -- point this at your own file
file_path = 'ecommerce_data.csv'

# Read the CSV into a DataFrame that the remaining cells refer to as `df`
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows (rendered because it is the cell's last expression)
df.head(n=5)


3. Data Exploration

In [None]:
# A notebook cell only renders the value of its LAST bare expression, so the
# intermediate previews are passed to display() explicitly; otherwise head(),
# tail() and describe() would be computed and silently thrown away.
# `display` is provided by the IPython kernel without an import.

# Display the first few rows of the dataset
display(df.head())

# Display the last few rows of the dataset
display(df.tail())

# Get a concise summary of the DataFrame (info() prints directly to stdout)
df.info()

# Display basic statistics of numeric columns
display(df.describe())

# Check for missing values (count of nulls per column)
missing_values = df.isnull().sum()
print("Missing values in each column:\n", missing_values)

# Check for duplicate rows (rows identical to an earlier row)
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")




In [None]:
# Replace 'column_name' with the actual column name (this is the only place to edit)
target_column = 'column_name'

# Guard the lookup so an unedited placeholder reports itself instead of
# raising KeyError and aborting a "Run All".
if target_column in df.columns:
    unique_values = df[target_column].unique()
    print(f"Unique values in '{target_column}':\n{unique_values}")
else:
    print(f"Column '{target_column}' not found in the dataset; skipping.")

In [None]:
# Replace 'column_name' with the actual column name (this is the only place to edit)
target_column = 'column_name'

# Guard the lookup so an unedited placeholder reports itself instead of
# raising KeyError and aborting a "Run All".
if target_column in df.columns:
    # nunique() does not count missing values by default
    unique_count = df[target_column].nunique()
    print(f"Number of unique values in '{target_column}': {unique_count}")
else:
    print(f"Column '{target_column}' not found in the dataset; skipping.")


In [None]:
# Replace 'column_name' with the actual column name (this is the only place to edit)
target_column = 'column_name'

# Guard the lookup so an unedited placeholder reports itself instead of
# raising KeyError and aborting a "Run All".
if target_column in df.columns:
    # Frequency of each distinct value, most common first
    value_counts = df[target_column].value_counts()
    print(f"Value counts in '{target_column}':\n{value_counts}")
else:
    print(f"Column '{target_column}' not found in the dataset; skipping.")


In [None]:
# Check unique combinations across two or more columns.
# Replace the placeholder names in this list (the only place to edit).
combo_columns = ['column1', 'column2']

# Report absent columns instead of letting the selection raise KeyError.
missing_columns = [name for name in combo_columns if name not in df.columns]

if not missing_columns:
    unique_combinations = df[combo_columns].drop_duplicates()
    combo_label = " and ".join(f"'{name}'" for name in combo_columns)
    print(f"Unique combinations of {combo_label}:\n{unique_combinations}")
else:
    print(f"Columns not found in the dataset, skipping: {missing_columns}")


In [None]:
# Per-column overview: distinct-value count plus a small sample of the values
separator = "-" * 50

for col in df.columns:
    column_values = df[col]
    print(f"Column: {col}")
    print(f"Number of unique values: {column_values.nunique()}")
    # Only the first 10 distinct values are shown to keep the output short
    print(f"Unique values:\n{column_values.unique()[:10]}")
    print(separator)


In [None]:
# Replace 'column_name' with your categorical column (this is the only place to edit)
target_column = 'column_name'

if target_column in df.columns:
    # Count once and reuse the result for both lookups
    frequencies = df[target_column].value_counts()

    if frequencies.empty:
        # value_counts() drops missing values, so an all-missing column yields
        # an empty result, on which idxmax()/idxmin() would raise.
        print(f"Column '{target_column}' has no non-missing values.")
    else:
        most_frequent = frequencies.idxmax()
        least_frequent = frequencies.idxmin()

        print(f"Most frequent value in '{target_column}': {most_frequent}")
        print(f"Least frequent value in '{target_column}': {least_frequent}")
else:
    print(f"Column '{target_column}' not found in the dataset; skipping.")


EXAMPLE (extra) for product and review data

for Unique Product IDs

In [None]:
# Skip, rather than abort with KeyError, when the dataset has no 'product_id' column
if 'product_id' in df.columns:
    product_ids = df['product_id']
    print(f"Number of unique products: {product_ids.nunique()}")
    # unique() keeps order of first appearance, so this shows the first 10
    # distinct IDs encountered -- it is not a ranking.
    print(f"Unique Product IDs:\n{product_ids.unique()[:10]}")
else:
    print("Column 'product_id' not found in the dataset; skipping.")


for Unique Reviewers

In [None]:
# Skip, rather than abort with KeyError, when the dataset has no 'reviewer_id' column
if 'reviewer_id' in df.columns:
    # Number of distinct reviewer IDs (missing IDs are not counted)
    print(f"Number of unique reviewers: {df['reviewer_id'].nunique()}")
else:
    print("Column 'reviewer_id' not found in the dataset; skipping.")


for Unique Ratings

In [None]:
# Skip, rather than abort with KeyError, when the dataset has no 'rating' column
if 'rating' in df.columns:
    ratings = df['rating']
    # Distinct rating values, then how often each one occurs
    print(f"Unique ratings: {ratings.unique()}")
    print(f"Rating counts:\n{ratings.value_counts()}")
else:
    print("Column 'rating' not found in the dataset; skipping.")


4. Data Cleaning

In [None]:
# Clean in one chained pass:
#   1. drop_duplicates() removes rows identical to an earlier row
#   2. dropna() removes every row that still has a missing value
#      (alternative: fill the gaps with `df.fillna(value)` instead)
df = df.drop_duplicates().dropna()


5. Check the Distribution of the Data

In [None]:
# Histogram grid: one panel per numeric column
hist_options = dict(figsize=(10, 8), bins=30, edgecolor='black')
df.hist(**hist_options)
plt.tight_layout()
plt.show()

# Frequency table for a categorical column
# (replace 'category_column' with the actual column name)
category_name = 'category_column'
if category_name in df.columns:
    category_counts = df[category_name].value_counts()
    print(category_counts)


6. Univariate Analysis

In [None]:
# Replace the two placeholders below with real column names (the only place to edit)
numeric_col = 'numeric_column'
categorical_col = 'categorical_column'

# Each plot is guarded so an unedited placeholder prints a note instead of
# raising and aborting a "Run All".

# Numeric column distribution (histogram with a KDE overlay)
if numeric_col in df.columns:
    sns.histplot(df[numeric_col], kde=True)
    plt.title('Distribution of Numeric Column')
    plt.show()
else:
    print(f"Column '{numeric_col}' not found; skipping the distribution plot.")

# Categorical column count plot, bars ordered from most to least frequent
if categorical_col in df.columns:
    category_order = df[categorical_col].value_counts().index
    sns.countplot(x=categorical_col, data=df, order=category_order)
    plt.title('Count of Categories')
    plt.xticks(rotation=45)
    plt.show()
else:
    print(f"Column '{categorical_col}' not found; skipping the count plot.")


7. Multivariate Analysis

In [None]:
# Replace the placeholders below with real column names (the only place to edit)
x_numeric_col = 'numeric_column_1'
y_numeric_col = 'numeric_column_2'
box_category_col = 'categorical_column'
box_numeric_col = 'numeric_column'

# Correlation heatmap for numeric columns.
# Select the numeric columns explicitly: since pandas 2.0, DataFrame.corr()
# no longer drops non-numeric columns on its own and raises on text columns.
numeric_df = df.select_dtypes(include='number')
if numeric_df.shape[1] >= 2:
    plt.figure(figsize=(8, 6))
    sns.heatmap(numeric_df.corr(), annot=True, fmt=".2f", cmap='coolwarm')
    plt.title('Correlation Heatmap')
    plt.show()
else:
    print("Fewer than two numeric columns; skipping the correlation heatmap.")

# Scatter plot for two numeric variables
if x_numeric_col in df.columns and y_numeric_col in df.columns:
    sns.scatterplot(x=x_numeric_col, y=y_numeric_col, data=df)
    plt.title('Scatter Plot')
    plt.show()
else:
    print(f"Columns '{x_numeric_col}' / '{y_numeric_col}' not found; skipping the scatter plot.")

# Box plot for a categorical and numeric column
if box_category_col in df.columns and box_numeric_col in df.columns:
    sns.boxplot(x=box_category_col, y=box_numeric_col, data=df)
    plt.title('Box Plot')
    plt.xticks(rotation=45)
    plt.show()
else:
    print(f"Columns '{box_category_col}' / '{box_numeric_col}' not found; skipping the box plot.")


8. Analyzing Product Reviews (if applicable)

In [None]:
# Replace 'review_column' with the actual name of the review-text column
review_col = 'review_column'

# Everything in this cell needs the review column, so the word cloud lives
# inside the same guard as the length analysis (previously it ran
# unconditionally and raised KeyError when the column was absent).
if review_col in df.columns:
    # Check the length of reviews.
    # NOTE: adds a 'review_length' column to df; .str.len() assumes the column
    # holds text -- TODO confirm for your dataset.
    df['review_length'] = df[review_col].str.len()

    sns.histplot(df['review_length'], kde=True)
    plt.title('Distribution of Review Length')
    plt.show()

    # Word Cloud of reviews (optional): wordcloud is an extra dependency, so a
    # missing installation only skips this part instead of failing the cell.
    try:
        from wordcloud import WordCloud
    except ImportError:
        print("Package 'wordcloud' is not installed; skipping the word cloud.")
    else:
        # astype(str) keeps join() from failing on non-string entries
        review_text = ' '.join(df[review_col].dropna().astype(str))

        if review_text.strip():
            wordcloud = WordCloud(width=800, height=400, background_color='white').generate(review_text)
            plt.figure(figsize=(10, 5))
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis('off')
            plt.title('Word Cloud of Reviews')
            plt.show()
        else:
            print("No review text available; skipping the word cloud.")


9. Grouping and Aggregations

In [None]:
# Example: Average rating per product, best-rated products first
rating_columns = ('product_id', 'rating')
if all(name in df.columns for name in rating_columns):
    mean_by_product = df.groupby('product_id')['rating'].mean()
    avg_rating = mean_by_product.sort_values(ascending=False)
    print(avg_rating.head())

# Example: Total sales by category, largest total first
sales_columns = ('category_column', 'sales_column')
if all(name in df.columns for name in sales_columns):
    sum_by_category = df.groupby('category_column')['sales_column'].sum()
    total_sales = sum_by_category.sort_values(ascending=False)
    print(total_sales)


10. Save Cleaned Data (Optional)

In [None]:
# Write the current state of df to a new CSV; index=False leaves the row index out of the file
output_path = 'cleaned_ecommerce_data.csv'
df.to_csv(output_path, index=False)
print("Cleaned data saved successfully!")
