# Chapter 2 – Essential DataFrame Operations

This notebook accompanies Chapter 2 of the *Cookbook for Pandas*, demonstrating core DataFrame operations using a synthetic healthcare dataset.

In [2]:
# Imports
import pandas as pd

# Load dataset
df = pd.read_csv('data/health_data.csv', parse_dates=['AdmissionDate', 'DischargeDate'])
df.head()

Unnamed: 0,PatientID,Name,Age,Department,Diagnosis,AdmissionDate,DischargeDate,BloodPressure,HeartRate
0,1001,Norma Fisher,62,Oncology,Fracture,2023-01-28,2023-01-06,110,108
1,1002,Jorge Sullivan,65,Oncology,Fever,2023-01-15,2023-02-07,172,107
2,1003,Elizabeth Woods,82,Cardiology,Stroke,2023-02-11,2023-03-05,158,70
3,1004,Susan Wagner,85,Orthopedics,Stroke,2023-02-28,2023-02-15,112,105
4,1005,Peter Montgomery,85,Pediatrics,Fever,2023-01-02,2023-02-04,173,100


## Accessing and Modifying DataFrames

In [3]:
# Filter cardiology patients
df[df['Department'] == 'Cardiology']

# Update diagnosis for a specific patient
df.loc[df['PatientID'] == 1004, 'Diagnosis'] = 'Lung Cancer'
df.loc[df['PatientID'] == 1004]

Unnamed: 0,PatientID,Name,Age,Department,Diagnosis,AdmissionDate,DischargeDate,BloodPressure,HeartRate
3,1004,Susan Wagner,85,Orthopedics,Lung Cancer,2023-02-28,2023-02-15,112,105


## Adding, Dropping, and Renaming Columns

In [5]:
# Add a new 'RiskLevel' column
def calculate_risk(row):
    if row['BloodPressure'] > 140 or row['HeartRate'] > 100:
        return 'High'
    return 'Normal'

df['RiskLevel'] = df.apply(calculate_risk, axis=1)

# Drop 'Name' column for privacy
df = df.drop(columns='Name')

# Rename 'Diagnosis' to 'MedicalCondition'
df = df.rename(columns={'Diagnosis': 'MedicalCondition'})
df.head()

KeyError: "['Name'] not found in axis"

## Sorting and Reindexing

In [5]:
# Sort by admission date
df_sorted = df.sort_values(by='AdmissionDate')
df_sorted[['PatientID', 'AdmissionDate']].head()

Unnamed: 0,PatientID,AdmissionDate
16,1017,2023-01-01
93,1094,2023-01-02
4,1005,2023-01-02
11,1012,2023-01-03
27,1028,2023-01-03


## Applying Functions with apply(), map(), and applymap()

In [6]:
# Calculate length of stay
df['LengthOfStay'] = (df['DischargeDate'] - df['AdmissionDate']).dt.days

# Simplify medical condition
df['MedicalCondition'] = df['MedicalCondition'].map(lambda x: 'Other' if x not in ['Stroke', 'Hypertension'] else x)
df[['PatientID', 'LengthOfStay', 'MedicalCondition']].head()

Unnamed: 0,PatientID,LengthOfStay,MedicalCondition
0,1001,-22,Other
1,1002,23,Other
2,1003,22,Stroke
3,1004,-13,Other
4,1005,33,Other


## Handling Missing Data

In [7]:
# Introduce some missing values for demo
import numpy as np
df.loc[5:10, 'BloodPressure'] = np.nan

# Fill missing BloodPressure values with mean per department
df['BloodPressure'] = df.groupby('Department')['BloodPressure'].transform(lambda x: x.fillna(x.mean()))
df.head(12)

Unnamed: 0,PatientID,Age,Department,MedicalCondition,AdmissionDate,DischargeDate,BloodPressure,HeartRate,RiskLevel,LengthOfStay
0,1001,62,Oncology,Other,2023-01-28,2023-01-06,110.0,108,High,-22
1,1002,65,Oncology,Other,2023-01-15,2023-02-07,172.0,107,High,23
2,1003,82,Cardiology,Stroke,2023-02-11,2023-03-05,158.0,70,High,22
3,1004,85,Orthopedics,Other,2023-02-28,2023-02-15,112.0,105,High,-13
4,1005,85,Pediatrics,Other,2023-01-02,2023-02-04,173.0,100,High,33
5,1006,27,Oncology,Other,2023-02-06,2023-01-14,144.071429,90,Normal,-23
6,1007,39,Oncology,Other,2023-01-11,2023-01-13,144.071429,87,High,2
7,1008,54,Orthopedics,Other,2023-01-23,2023-02-26,133.888889,103,High,34
8,1009,88,Oncology,Other,2023-02-13,2023-02-09,144.071429,91,Normal,-4
9,1010,30,Orthopedics,Other,2023-02-10,2023-01-31,133.888889,80,Normal,-10


## Copying and Modifying DataFrames

In [8]:
# Create a copy for simulation
df_sim = df.copy()
df_sim['Age'] += 5
df_sim.head()

Unnamed: 0,PatientID,Age,Department,MedicalCondition,AdmissionDate,DischargeDate,BloodPressure,HeartRate,RiskLevel,LengthOfStay
0,1001,67,Oncology,Other,2023-01-28,2023-01-06,110.0,108,High,-22
1,1002,70,Oncology,Other,2023-01-15,2023-02-07,172.0,107,High,23
2,1003,87,Cardiology,Stroke,2023-02-11,2023-03-05,158.0,70,High,22
3,1004,90,Orthopedics,Other,2023-02-28,2023-02-15,112.0,105,High,-13
4,1005,90,Pediatrics,Other,2023-01-02,2023-02-04,173.0,100,High,33


## Recipes

In [9]:
# Recipe: Flag Critical Patients
df['CriticalFlag'] = df.apply(lambda row: row['BloodPressure'] > 140 or row['HeartRate'] > 100, axis=1)
df[['PatientID', 'BloodPressure', 'HeartRate', 'CriticalFlag']].head()

Unnamed: 0,PatientID,BloodPressure,HeartRate,CriticalFlag
0,1001,110.0,108,True
1,1002,172.0,107,True
2,1003,158.0,70,True
3,1004,112.0,105,True
4,1005,173.0,100,True
