# Walmart Purchase Behavior Analysis

This notebook analyzes Walmart customer purchase behavior based on gender, age, occupation, marital status, and product category.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the purchase records from disk into the working frame `df`,
# which every later cell in this notebook operates on.
df = pd.read_csv(filepath_or_buffer='walmart_data.txt')
# Preview the first five rows as the cell's output.
df.head(n=5)

In [None]:
# info() prints its summary (columns, dtypes, non-null counts) as a side
# effect rather than returning it, so it can sit above the final expression.
df.info()
# Per-column count of missing entries; displayed as the cell's result.
df.isna().sum()

In [None]:
# A notebook cell only auto-displays its LAST expression, so a bare
# `df.shape` above `df.dtypes` is silently discarded. Print it explicitly
# so that both the (rows, columns) tuple and the dtypes are visible.
print(df.shape)
# Column dtypes; shown as the cell's result.
df.dtypes

In [None]:
# Store these two label columns with the pandas `category` dtype.
# The casts are applied in the same order as before and are safe to re-run.
for column in ('City_Category', 'Gender'):
    df[column] = df[column].astype('category')

In [None]:
# Summary statistics for the frame. With default arguments describe()
# reports on the numeric columns when any are present, so the category
# and string columns are not summarized here.
df.describe()

In [None]:
# Only the last expression of a cell is auto-displayed, so the frequency
# table has to be printed explicitly or it is computed and thrown away.
print(df['Product_Category'].value_counts())
# Number of distinct values per column; shown as the cell's result.
df.nunique()

## Univariate Analysis - Continuous and Categorical Variables

In [None]:
# Histogram (with a KDE overlay) of how long customers have stayed in
# their current city.
# NOTE(review): a KDE assumes a numeric column - confirm the dtype of
# Stay_In_Current_City_Years in the loaded data.
stay_ax = sns.histplot(data=df, x='Stay_In_Current_City_Years', kde=True)
# histplot draws on the current axes and returns them, so labelling the
# returned Axes is equivalent to the pyplot state-machine calls.
stay_ax.set_title('Stay Range')
stay_ax.set_xlabel('Stay_In_Current_City_Years')
stay_ax.set_ylabel('Count')
plt.show()

In [None]:
# Distribution of the purchase amount with a KDE overlay.
# The y-axis is labelled 'Density', but the default histogram statistic is
# a raw count; request stat='density' so the bars (and the KDE curve drawn
# on the same scale) actually match the label.
sns.displot(df['Purchase'], kde=True, bins=30, stat='density')
# displot is figure-level; the pyplot calls below act on the current axes
# of the figure it just created.
plt.title('Distribution of Purchase Amount')
plt.xlabel('Purchase Amount')
plt.ylabel('Density')
plt.show()

In [None]:
# Number of records per age group, with the groups laid out in sorted
# label order along the x-axis.
age_levels = sorted(df['Age'].unique())
age_ax = sns.countplot(data=df, x='Age', order=age_levels)
age_ax.set_title('Distribution of Age')
age_ax.set_xlabel('Age Groups')
age_ax.set_ylabel('Count')
plt.show()

## Bivariate Analysis - Categorical Variables and Numeric Correlations

In [None]:
# Compare the spread of purchase amounts between the gender groups.
gender_ax = sns.boxplot(data=df, x='Gender', y='Purchase')
# boxplot returns the Axes it drew on; title it directly.
gender_ax.set_title('Purchase Distribution by Gender')
plt.show()

In [None]:
# Compare the spread of purchase amounts across age groups.
# Pin the x-axis to the sorted age-group labels so the boxes appear in the
# same ascending order as the age count plot, rather than relying on
# seaborn's default ordering of the column's values.
age_groups = sorted(df['Age'].unique())
sns.boxplot(x='Age', y='Purchase', data=df, order=age_groups)
plt.title('Purchase Distribution by Age Group')
plt.show()

In [None]:
# Pairwise correlation between the numeric columns only.
numeric_columns = df.select_dtypes(include='number')
corr = numeric_columns.corr()

# Draw the matrix as an annotated heatmap on an explicitly created axes.
fig, heat_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', ax=heat_ax)
heat_ax.set_title('Correlation Heatmap')
plt.show()

## Missing Value & Outlier Detection

In [None]:
# Build the null mask once and derive both summaries from it.
null_mask = df.isnull()
missing_values = null_mask.sum()            # absolute count per column
missing_percentage = null_mask.mean() * 100  # share of rows, in percent

# One row per column of `df`, pairing the count with its percentage.
missing_data = pd.DataFrame(
    {
        'Missing Values': missing_values,
        'Percentage': missing_percentage,
    }
)
missing_data

In [None]:
# IQR-based outlier detection over the int64/float64 columns.
# NOTE(review): every int64/float64 column is included; if some of them
# are identifiers or coded categories, the IQR rule is not meaningful for
# them - confirm which columns should take part.
numerical = df.select_dtypes(include=['float64', 'int64']).columns

# Per-column quartiles and the usual 1.5 * IQR fences.
Q1 = df[numerical].quantile(0.25)
Q3 = df[numerical].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Boolean frame: True where a value falls outside its column's fences.
outside_fences = (df[numerical] < lower_bound) | (df[numerical] > upper_bound)

# Reduce the boolean MASK itself to one flag per row. The previous version
# indexed `df` with the mask first and then called .any(axis=1) on the
# resulting values, which tests the truthiness of the data (so an outlier
# whose value is 0 was not flagged) instead of whether the row was flagged.
outlier = outside_fences.any(axis=1)

# Keep the filtered rows under a name so later cells can use them; `df`
# itself is left unchanged. The frame is shown as the cell's result.
df_no_outliers = df[~outlier]
df_no_outliers

## Final Insights
- Males spend more than females.
- 26-35 age group spends the most.
- Married customers tend to spend slightly more.
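- The boxplots above support the gender and age observations; the marital-status finding and any confidence intervals still need to be backed by explicit computations, since none appear in the cells above.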

These insights can help Walmart tailor marketing and pricing strategies accordingly.