# Data Analysis and Visualization Examples

This notebook demonstrates common data analysis and visualization techniques using Python libraries including Pandas, Seaborn, and Matplotlib.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Set style and seed
# set_theme() is seaborn's preferred entry point; sns.set() is only a legacy
# alias for it. This notebook already needs seaborn >= 0.11 (histplot), so
# set_theme() is always available here.
sns.set_theme(style="whitegrid")
# Seed NumPy's global RNG so the synthetic data below is reproducible on
# Restart & Run All.
np.random.seed(1)

## 1. Working with DataFrames

Let's start by creating a simple DataFrame with categorical data:

In [None]:
# Build a five-row demo frame: an integer ID, a fruit label used later as a
# categorical column, and an integer score.
data = dict(
    ID=[101, 102, 103, 104, 105],
    Category=['Apple', 'Banana', 'Apple', 'Orange', 'Banana'],
    Score=[88, 92, 85, 90, 87],
)

df = pd.DataFrame.from_dict(data)
# Header and frame are emitted on separate lines, header first.
print("Original DataFrame:", df, sep="\n")

### Categorical Data Operations

In [None]:
# Re-type the fruit labels as a pandas categorical column (in place on df)
category_column = df['Category'].astype('category')
df['Category'] = category_column

# The distinct labels are exposed through the .cat accessor
print("Categories:", df['Category'].cat.categories)

# Frequency of each label, printed under a header preceded by a blank line
category_counts = df['Category'].value_counts()
print("\nCategory Counts:", category_counts, sep="\n")

## 2. Data Joining Example

Demonstrating a full outer join between two DataFrames:

In [None]:
# Two small frames whose 'key' columns overlap only on 'B' and 'C'
df1 = pd.DataFrame(dict(key=['A', 'B', 'C'], val1=[1, 2, 3]))
df2 = pd.DataFrame(dict(key=['B', 'C', 'D'], val2=[4, 5, 6]))

# Outer join: every key from either frame is kept; a key missing on one side
# gets NaN in that side's value column.
result = df1.merge(df2, how='outer', on='key')
print("Full Outer Join Result:", result, sep="\n")

## 3. Visualization Examples

### Auto Data Visualizations

In [None]:
# Synthetic "auto" data: two independent uniform columns of equal length.
# 'disp' is drawn before 'acc', so the values depend on the global NumPy seed
# and on this draw order.
n_samples = 100
disp_values = np.random.uniform(100, 300, n_samples)
acc_values = np.random.uniform(10, 30, n_samples)
auto = pd.DataFrame({'disp': disp_values, 'acc': acc_values})

# Pairwise grid over the two synthetic columns
pair_columns = ['disp', 'acc']
sns.pairplot(auto.loc[:, pair_columns])
# pairplot leaves its figure current, so the title is attached via gcf();
# y slightly above 1 lifts it clear of the top row of subplots.
plt.gcf().suptitle("Pairplot: Displacement vs Acceleration", y=1.02)
plt.show()

### Box Plot Example

In [None]:
# Create data for boxplot
# Synthetic frame: 50 rows for each model year 70..79, with the cylinder count
# cycling through 4, 6, 8.
years = list(range(70, 80))
rows_per_year = 50
cyl_cycle = [4, 6, 8]

yr_values = [year for year in years for _ in range(rows_per_year)]
# Derive the 'cyl' length from 'yr' rather than hard-coding a repeat count:
# [4, 6, 8] * 167 yields 501 items against 500 years, and pd.DataFrame raises
# ValueError when the columns differ in length.
cyl_values = [cyl_cycle[i % len(cyl_cycle)] for i in range(len(yr_values))]

data = {
    'yr': yr_values,
    'cyl': cyl_values
}
auto_box = pd.DataFrame(data)

# Box plot of cylinder count for each model year, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=auto_box, x='yr', y='cyl', ax=ax)
ax.set_title('Distribution of Cylinders by Year')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Cylinders')
plt.show()

### Strip Plot Example

In [None]:
# Synthetic strip-plot data: 500 evenly spaced weights (2500, 2510, ...) paired
# with cylinder counts laid out in consecutive runs.
cyl_runs = [(4, 200), (6, 150), (8, 100), (3, 30), (5, 20)]
data = dict(
    wt=list(range(2500, 2500 + 500 * 10, 10)),
    cyl=[cyl for cyl, count in cyl_runs for _ in range(count)],
)
auto_strip = pd.DataFrame(data)

# Strip plot of car weight, one row of points per cylinder count.
plt.figure(figsize=(12, 6))
# 'wt' and 'cyl' are both numeric, so seaborn cannot infer which one is the
# grouping variable and falls back to a vertical plot that treats every
# distinct weight on x as its own category. orient='h' makes 'cyl' (y) the
# categorical axis, matching the title and axis labels below.
sns.stripplot(x='wt', y='cyl', data=auto_strip, orient='h')
plt.title('Car Weight by Number of Cylinders')
plt.xlabel('Weight')
plt.ylabel('Number of Cylinders')
plt.show()

### Histogram Example

In [None]:
# Reshape the two columns to long form: one row per observation, with the
# source column name in 'variable' and the number in 'value'.
auto_melt = pd.melt(auto[['disp', 'acc']], var_name='variable', value_name='value')

# Side-by-side (dodged) histograms of both variables on a shared x axis
fig, ax = plt.subplots()
sns.histplot(
    data=auto_melt,
    x='value',
    hue='variable',
    multiple='dodge',
    bins=20,
    ax=ax,
)
ax.set_title("Distribution of Displacement and Acceleration")
plt.show()