# Pandas Fundamentals Notebook

This notebook covers the fundamental concepts of Pandas, including:
- Creating Series and DataFrames
- Indexing with loc and iloc
- Data cleaning techniques
- Aggregation methods

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## 1. Creating Series and DataFrames

In this section, we'll learn how to create Series and DataFrames from different data sources.

### 1.1 Creating Series

A Series is a one-dimensional labeled array capable of holding any data type.

In [None]:
# Build a Series straight from a plain Python list.
# No index is supplied, so pandas falls back to the default integer index.
data_list = [10, 20, 30, 40, 50]
series_from_list = pd.Series(data=data_list)

# One print call: heading, the Series, then an empty string for the trailing blank line
print("Series from list:", series_from_list, "", sep="\n")

In [None]:
# Build a Series from a mapping: keys become the index labels, values become the data
data_dict = dict(a=1, b=2, c=3, d=4)
series_from_dict = pd.Series(data=data_dict)

# Heading, the Series, then an empty string to reproduce the trailing blank line
print("Series from dictionary:", series_from_dict, "", sep="\n")

In [None]:
# Pair a list of values with explicit index labels instead of the default 0..n-1
data = [100, 200, 300, 400]
custom_index = ['first', 'second', 'third', 'fourth']
series_custom_index = pd.Series(data=data, index=custom_index)

# Heading, the Series, then an empty string to reproduce the trailing blank line
print("Series with custom index:", series_custom_index, "", sep="\n")

### 1.2 Creating DataFrames

A DataFrame is a two-dimensional, size-mutable, potentially heterogeneous tabular data structure.

In [None]:
# Column-oriented construction: each key is a column name and each list holds
# that column's values, one entry per row.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'Diana'],
    Age=[25, 30, 35, 28],
    City=['New York', 'London', 'Paris', 'Tokyo'],
    Salary=[50000, 60000, 70000, 55000],
)
df_from_dict = pd.DataFrame(data=data)

# Heading, the frame, then an empty string to reproduce the trailing blank line
print("DataFrame from dictionary:", df_from_dict, "", sep="\n")

In [None]:
# Row-oriented construction: each dict is one record, and its keys name the columns
data_list_of_dicts = [
    dict(Name='Eve', Age=32, City='Berlin', Salary=62000),
    dict(Name='Frank', Age=27, City='Sydney', Salary=58000),
    dict(Name='Grace', Age=31, City='Toronto', Salary=65000),
]
df_from_list_of_dicts = pd.DataFrame(data=data_list_of_dicts)

# Heading, the frame, then an empty string to reproduce the trailing blank line
print("DataFrame from list of dictionaries:", df_from_list_of_dicts, "", sep="\n")

In [None]:
# Build the 3x3 grid [[1, 2, 3], [4, 5, 6], [7, 8, 9]] and wrap it in a DataFrame.
# A bare array carries no labels, so the column names are supplied explicitly.
array_data = np.arange(1, 10).reshape(3, 3)
column_labels = ['Col1', 'Col2', 'Col3']
df_from_array = pd.DataFrame(data=array_data, columns=column_labels)

# Heading, the frame, then an empty string to reproduce the trailing blank line
print("DataFrame from numpy array:", df_from_array, "", sep="\n")

## 2. Indexing with loc and iloc

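This section contrasts label-based indexing (`loc`) with integer position-based indexing (`iloc`). One difference to watch for: a `loc` slice includes both endpoints, whereas an `iloc` slice excludes the stop position, just like a regular Python slice.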

In [None]:
# Small product catalogue reused by the loc / iloc / boolean-indexing examples.
# No index is passed, so rows get the default integer labels 0..4.
sample_data = dict(
    Product=['Laptop', 'Mouse', 'Keyboard', 'Monitor', 'Headphones'],
    Price=[999.99, 25.50, 75.00, 299.99, 150.00],
    Stock=[50, 200, 150, 30, 80],
    Category=['Electronics', 'Accessories', 'Accessories', 'Electronics', 'Audio'],
)
df_sample = pd.DataFrame(data=sample_data)

# Heading, the frame, then an empty string to reproduce the trailing blank line
print("Sample DataFrame for indexing:", df_sample, "", sep="\n")

### 2.1 Using loc (label-based indexing)

In [None]:
# .loc selects by index *label*. df_sample was built without an explicit index,
# so its labels are the default integers 0..4 and coincide with row positions.

# Select a single row by its label; the result is a Series keyed by column name
print("Row with index 2 using loc:")
print(df_sample.loc[2])
print()

# Select a single cell: row label first, then column label
print("Price of product at index 3:")
print(df_sample.loc[3, 'Price'])
print()

# Select multiple rows and columns using loc.
# Unlike ordinary Python slices, a .loc label slice includes BOTH endpoints,
# so 1:3 returns the rows labelled 1, 2 and 3.
# The heading names the columns actually selected (there is no 'Name' column).
print("Products 1-3 with Product and Price:")
print(df_sample.loc[1:3, ['Product', 'Price']])
print()

### 2.2 Using iloc (integer position-based indexing)

In [None]:
# .iloc selects purely by integer position (0-based), ignoring index labels.

# A single row by position; the result is a Series keyed by column name
print("Row at position 2 using iloc:", df_sample.iloc[2], "", sep="\n")

# A single cell: row position first, then column position
print("Value at row 3, column 1:", df_sample.iloc[3, 1], "", sep="\n")

# A rectangular selection. Positional slices exclude the stop value,
# so 1:4 covers rows 1, 2, 3 and :2 covers columns 0 and 1.
print("Rows 1-3, columns 0-1:", df_sample.iloc[1:4, :2], "", sep="\n")

### 2.3 Boolean indexing

In [None]:
# A boolean Series used as a row selector keeps only the rows where it is True.

# Single condition: price strictly above 100
expensive_products = df_sample.loc[df_sample['Price'].gt(100)]
print("Products with price > 100:", expensive_products, "", sep="\n")

# Two conditions combined element-wise with & (logical AND):
# price strictly above 50 and stock strictly below 100
filtered_products = df_sample.loc[
    df_sample['Price'].gt(50) & df_sample['Stock'].lt(100)
]
print("Products with price > 50 AND stock < 100:", filtered_products, "", sep="\n")

## 3. Data Cleaning Techniques

Handling missing values, data types, and basic cleaning operations.

In [None]:
# Create a DataFrame with missing values for cleaning examples.
# np.nan marks the gaps: one each in Age, City and Salary.
cleaning_data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
    'Age': [25, np.nan, 35, 28, 32],
    'City': ['New York', 'London', np.nan, 'Tokyo', 'Berlin'],
    'Salary': [50000, 60000, np.nan, 55000, 62000],
    'Department': ['HR', 'IT', 'Finance', 'HR', 'IT']
}
df_cleaning = pd.DataFrame(cleaning_data)
print("DataFrame with missing values:")
print(df_cleaning)
print()

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
print("Info about the DataFrame:")
df_cleaning.info()
print()

# isnull() gives a True/False frame; summing it counts the missing cells per column
print("Missing values count:")
print(df_cleaning.isnull().sum())
print()

### 3.1 Identifying and handling missing values

In [None]:
# Show every row that has at least one missing cell:
# isna() flags the gaps, any(axis="columns") collapses the flags to one per row.
print(
    "Rows with any missing values:",
    df_cleaning.loc[df_cleaning.isna().any(axis="columns")],
    "",
    sep="\n",
)

# Remove every row containing a missing value.
# dropna() returns a new frame, so df_cleaning itself is left untouched.
df_dropped = df_cleaning.dropna(axis=0, how="any")
print("DataFrame after dropping rows with missing values:", df_dropped, "", sep="\n")

In [None]:
# Fill missing values with mean for numeric columns.
# Work on a copy so df_cleaning keeps its NaNs and this cell can be re-run safely.
df_filled_mean = df_cleaning.copy()
numeric_cols = df_filled_mean.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    # Assign the filled column back instead of calling
    # df_filled_mean[col].fillna(..., inplace=True): that form operates on a
    # temporary Series, raises a FutureWarning in pandas 2.x, and leaves the
    # DataFrame unchanged once Copy-on-Write is enabled.
    df_filled_mean[col] = df_filled_mean[col].fillna(df_filled_mean[col].mean())

print("DataFrame after filling numeric missing values with mean:")
print(df_filled_mean)
print()

# Fill categorical missing values with mode
categorical_cols = df_filled_mean.select_dtypes(include=['object']).columns
for col in categorical_cols:
    # mode() returns a Series because several values can tie; it is empty when
    # the column holds no non-missing values, in which case nothing is filled.
    mode_val = df_filled_mean[col].mode()
    if not mode_val.empty:
        # Take the first mode by position and assign the result back (see above)
        df_filled_mean[col] = df_filled_mean[col].fillna(mode_val.iloc[0])

print("DataFrame after filling all missing values:")
print(df_filled_mean)
print()

## 4. Aggregation Methods

Using groupby, pivot tables, and other aggregation functions.

In [None]:
# Create sample data for aggregation examples
sales_data = {
    'Month': ['Jan', 'Feb', 'Mar', 'Jan', 'Feb', 'Mar', 'Jan', 'Feb', 'Mar'],
    'Region': ['North', 'North', 'North', 'South', 'South', 'South', 'East', 'East', 'East'],
    'Sales': [1000, 1200, 1100, 900, 950, 1050, 1100, 1150, 1200],
    'Units': [10, 12, 11, 9, 10, 11, 11, 12, 12]
}
df_sales = pd.DataFrame(sales_data)
print("Sales data for aggregation:")
print(df_sales)
print()

### 4.1 Using groupby for aggregations

In [None]:
# One grouping key: total Sales per Month
monthly_sales = df_sales.groupby('Month')['Sales'].agg('sum')
print("Total sales by month:", monthly_sales, "", sep="\n")

# Two grouping keys: the result is indexed by (Region, Month) pairs
regional_monthly_sales = df_sales.groupby(['Region', 'Month'])['Sales'].agg('sum')
print("Total sales by region and month:", regional_monthly_sales, "", sep="\n")

# Several statistics at once: a dict maps each column to the list of
# aggregations to compute for it, producing two-level column labels.
agg_results = df_sales.groupby('Region').agg(
    {'Sales': ['sum', 'mean', 'count'], 'Units': ['sum', 'mean']}
)
print("Multiple aggregations by region:", agg_results, "", sep="\n")

### 4.2 Using pivot_table

In [None]:
# Reshape the long sales table into a Region x Month grid of summed Sales.
# fill_value=0 is what any Region/Month combination without data would show.
pivot_result = df_sales.pivot_table(
    values='Sales', index='Region', columns='Month', aggfunc='sum', fill_value=0
)
print("Pivot table - Sales by Region and Month:", pivot_result, "", sep="\n")

# Same grid, but pivoting two value columns at once; the aggfunc dict names
# the aggregation to apply to each of them.
pivot_multi = df_sales.pivot_table(
    values=['Sales', 'Units'],
    index='Region',
    columns='Month',
    aggfunc={'Sales': 'sum', 'Units': 'sum'},
    fill_value=0,
)
print("Pivot table - Sales and Units by Region and Month:", pivot_multi, "", sep="\n")

## Exercises and Practice Problems

Try to solve the following exercises to reinforce your understanding:

### Exercise 1: Create a DataFrame
Create a DataFrame with the following data:
- Name: Alice, Bob, Charlie
- Age: 25, 30, 35
- City: New York, London, Paris

In [None]:
# Your solution here



### Exercise 2: Filtering
Filter the sales data to show only records where Sales > 1000.

In [None]:
# Your solution here



### Exercise 3: Aggregation
Calculate the average sales by region.

In [None]:
# Your solution here



---
This notebook covered the fundamental concepts of Pandas. You should now be familiar with:
1. Creating Series and DataFrames
2. Different indexing methods (loc, iloc, boolean indexing)
3. Data cleaning techniques
4. Aggregation methods (groupby, pivot_table)

Continue to the next notebook to apply these concepts to real-world data!