In [None]:
# Date: 06-08-2025: 
# Topic: Pandas-II

In [1]:
import pandas as pd
import numpy as np

##                                                    Pandas- Day II

In [None]:
'''
Titanic Dataset (Data Science Dojo): https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv
Iris Dataset (Seaborn Data): https://raw.githubusercontent.com/mwaskom/seaborn-data/refs/heads/master/iris.csv
Kaggle Titanic Link: https://www.kaggle.com/datasets/yasserh/titanic-dataset/data

'''

In [None]:
# Dataset source links: two raw CSV files hosted on GitHub and one Kaggle page.
# Each literal is split after the host only to keep the lines short; the
# adjacent string pieces are joined by Python into the same single URL.
titanic_dataset_github_url = (
    "https://raw.githubusercontent.com/"
    "datasciencedojo/datasets/refs/heads/master/titanic.csv"
)
iris_dataset_github_url = (
    "https://raw.githubusercontent.com/"
    "mwaskom/seaborn-data/refs/heads/master/iris.csv"
)
kaggle_link = (
    "https://www.kaggle.com/"
    "datasets/yasserh/titanic-dataset/data"
)

In [2]:
import pandas as pd

# Define the URL for the Titanic dataset
titanic_dataset_github_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv"

# Read the CSV file into a Pandas DataFrame.
# On failure the friendly hint is printed and the exception is re-raised, so the
# cell stops here with the real cause instead of continuing and failing later
# with a NameError (or silently reusing a df_titanic left over in the kernel).
try:
    df_titanic = pd.read_csv(titanic_dataset_github_url)
except Exception as e:
    print(f"Error loading CSV file: {e}")
    print("Please check the URL or your internet connection.")
    raise
else:
    print("Titanic dataset loaded successfully!")

# --- Extracting Data with .iloc ---
#
# Form: .iloc[row_selection, column_selection]
# Both selectors are integer positions, and a slice stops BEFORE its end value:
#   slice(0, 3) -> rows 0, 1, 2      (the first 3 rows)
#   slice(0, 2) -> columns 0, 1      (the first 2 columns)
first_three_rows = slice(0, 3)
first_two_cols = slice(0, 2)

extracted_data = df_titanic.iloc[first_three_rows, first_two_cols]

print("\n--- First 3 Rows and First 2 Columns using .iloc ---")
print(extracted_data)

Titanic dataset loaded successfully!

--- First 3 Rows and First 2 Columns using .iloc ---
   PassengerId  Survived
0            1         0
1            2         1
2            3         1


In [3]:
# First three rows; columns chosen by an explicit list of positions (0 and 2)
df_titanic.iloc[:3, [0, 2]]

Unnamed: 0,PassengerId,Pclass
0,1,3
1,2,1
2,3,3


In [5]:
# Same selection with the position list reversed: the result's columns follow the list order
df_titanic.iloc[:3, [2, 0]]

Unnamed: 0,Pclass,PassengerId
0,3,1
1,1,2
2,3,3


In [None]:
'''
The General Rule of Thumb for loc and iloc
In Pandas:

iloc (integer-location based indexing): When you use slicing with iloc (e.g., df.iloc[0:5, 0:2]), 
it generally tries to return a view if possible, for efficiency. 
If you use fancy indexing (e.g., df.iloc[[0, 5], [0, 2]]), it will typically return a copy.


loc (label-based indexing): Similar to iloc, when you use slicing with loc (e.g., df.loc['row_start':'row_end', 'col_start':'col_end']),
it generally tries to return a view. If you use fancy indexing (e.g., df.loc[['row_label1', 'row_label2'], ['col_label1', 'col_label2']])
 or boolean indexing, it will typically return a copy.
 
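The behavior of returning a view versus a copy can be complex and depends on internal optimizations in Pandas, so it is not guaranteed. Pandas emits a SettingWithCopyWarning when you assign into an object that may be a copy of a selection, because the change might not reach the original DataFrame (the typical trigger is chained indexing such as df[mask]['col'] = value). Note: with Copy-on-Write (opt-in in pandas 2.x, the default from pandas 3.0) every selection behaves like a copy, so modifying it never changes the original.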

Why is this distinction important?
The core issue is whether modifying the returned object will also modify the original DataFrame.

If it's a View: Modifying the view will also modify the original DataFrame.
If it's a Copy: Modifying the copy will NOT modify the original DataFrame.

'''

In [None]:
'''You want a COPY when:

You intend to make changes to the selected subset of data, and you do not want those changes to affect the original DataFrame. This is the most common scenario for data analysis and cleaning, where you create intermediate processed versions of your data.
You are performing an operation that Pandas knows will result in a fragmented or non-contiguous memory layout if it were a view, so it defaults to a copy for performance and consistency.
You are okay with (or even want) a VIEW when:

You are only reading data from the subset and don't intend to modify it.
You specifically want to make changes to a subset of the original DataFrame in-place and are aware that the original will be affected. This can be more memory-efficient if you don't need a separate copy.
How to Guarantee a Copy
Because the view/copy behavior can sometimes be tricky, the best practice when you definitely need a copy (which is most of the time you're extracting data for separate manipulation) is to explicitly use the .copy() method:
'''


In [6]:
import pandas as pd

data = dict(col1=[1, 2, 3], col2=[4, 5, 6])
df = pd.DataFrame(data)

print("Original df:\n", df)

# Selection by label lists, then an explicit .copy(): independent of df no
# matter what pandas would have returned for the bare selection.
subset_copy_loc = df.loc[[0, 2], ['col1']].copy()
subset_copy_loc.iat[0, 0] = 999

# Positional slice of the first two rows (all columns); .copy() again makes
# the result independent even though a plain slice might be a view.
subset_copy_iloc = df.iloc[:2].copy()
subset_copy_iloc.iat[0, 0] = 888

# df is printed again after both writes to show it was not touched.
for heading, frame in (
    ("\nOriginal df after modifying copies (should be unchanged):\n", df),
    ("\nsubset_copy_loc:\n", subset_copy_loc),
    ("\nsubset_copy_iloc:\n", subset_copy_iloc),
):
    print(heading, frame)

Original df:
    col1  col2
0     1     4
1     2     5
2     3     6

Original df after modifying copies (should be unchanged):
    col1  col2
0     1     4
1     2     5
2     3     6

subset_copy_loc:
    col1
0   999
2     3

subset_copy_iloc:
    col1  col2
0   888     4
1     2     5


In [None]:
'''

As you can see, the df remains unchanged because we explicitly used .copy().

Summary:
iloc and loc (with slicing): Tend to return views, but not always guaranteed.
iloc and loc (with fancy indexing/boolean indexing): Tend to return copies.
When in doubt, or when you explicitly need to modify the subset without 
affecting the original, use .copy() after your selection. 
This is the safest and clearest approach to ensure you're working with independent data.

'''

In [8]:
# setting index by a feature of interest:


import pandas as pd

# Define the URL for the Titanic dataset
titanic_dataset_github_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv"

# Read the CSV file into a Pandas DataFrame.
# On failure the friendly hint is printed and the exception is re-raised, so the
# cell stops here with the real cause instead of continuing and failing later
# with a NameError (or silently reusing a df_titanic left over in the kernel).
try:
    df_titanic = pd.read_csv(titanic_dataset_github_url)
except Exception as e:
    print(f"Error loading CSV file: {e}")
    print("Please check the URL or your internet connection.")
    raise
else:
    print("Titanic dataset loaded successfully!")

# Use the 'Name' column as the row index; the result is bound to a new
# variable and df_titanic is printed alongside it for comparison.
df_by_name = df_titanic.set_index(keys='Name')

for banner, frame in (
    ("\n--- Original df_titanic Head (Default Index) ---", df_titanic),
    ("\n--- df_by_name Head (Name as Index) ---", df_by_name),
):
    print(banner)
    print(frame.head())

name_index = df_by_name.index
print(f"\nType of df_by_name index: {type(name_index)}")
print(f"Name of df_by_name index: {name_index.name}")


Titanic dataset loaded successfully!

--- Original df_titanic Head (Default Index) ---
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            1

In [None]:
'''
What df_titanic.set_index('Name') does:

Creates a New DataFrame: By default, set_index() returns a new DataFrame (df_by_name in this case) 
and does not modify the original df_titanic in place. If you wanted to modify df_titanic directly, 
you would use df_titanic.set_index('Name', inplace=True).

Sets the Index: The values from the specified column ('Name') are removed from their column position 
and are used to create a new index for the DataFrame.

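Row Labels: Each passenger's name now acts as the label for their corresponding row, which makes it
intuitive to look up information for a specific person directly by their name. Note that set_index()
does not check that the labels are unique; pass verify_integrity=True if duplicate names should raise an error.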


'''

In [None]:
'''

Why this is useful:
Label-based Lookup: You can now use .loc[] to directly access rows by passenger names, 
which is often more intuitive than remembering their original integer row number.

'''

# Example: label-based lookup of one passenger's row through the 'Name' index
passenger_record = df_by_name.loc['Braund, Mr. Owen Harris']
print(passenger_record)

PassengerId            1
Survived               0
Pclass                 3
Sex                 male
Age                 22.0
SibSp                  1
Parch                  0
Ticket         A/5 21171
Fare                7.25
Cabin                NaN
Embarked               S
Name: Braund, Mr. Owen Harris, dtype: object


In [None]:
'''
Meaningful Access: For datasets where a specific column naturally serves as an identifier, 
setting it as the index provides a more semantic way to interact with your data.

Joining/Merging: When combining DataFrames, having common columns as indices can often simplify
 merge operations.

Now that df_by_name has 'Name' as its index, you can perform label-based lookups much more 
easily for individual passengers!

'''

In [None]:
# .reset_index()

# How to reverse the set_index() operation, bringing the column that was promoted to the index back into
# the DataFrame as a regular data column, and reverting to the default numerical (positional) index.

In [11]:
import pandas as pd

# Define the URL for the Titanic dataset
titanic_dataset_github_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv"

# Read the CSV file into a Pandas DataFrame.
# On failure the friendly hint is printed and the exception is re-raised, so the
# cell stops here with the real cause instead of continuing and failing later
# with a NameError (or silently reusing a df_titanic left over in the kernel).
try:
    df_titanic = pd.read_csv(titanic_dataset_github_url)
except Exception as e:
    print(f"Error loading CSV file: {e}")
    print("Please check the URL or your internet connection.")
    raise
else:
    print("Titanic dataset loaded successfully!")

# 1. Create df_by_name with 'Name' as the index (as you did previously)
df_by_name = df_titanic.set_index('Name')

print("\n--- df_by_name (Name as Index) Head ---")
print(df_by_name.head())
print(f"df_by_name index type: {type(df_by_name.index)}")
print(f"Is 'Name' a column in df_by_name? {'Name' in df_by_name.columns}")

# 2. Convert 'Name' back to a data column and reset to default integer index
# By default, reset_index() will convert the current index into a column
# and create a new default integer index (0, 1, 2, ...)
df_reset = df_by_name.reset_index()

print("\n--- df_reset (After reset_index() - Name is now a column) Head ---")
print(df_reset.head())
print(f"df_reset index type: {type(df_reset.index)}")
print(f"Is 'Name' a column in df_reset? {'Name' in df_reset.columns}")

# 3. Round-trip check.
# reset_index() inserts the restored column at position 0, so 'Name' is now the
# FIRST column of df_reset rather than sitting in its original position.
# DataFrame.equals() compares the column labels in order, so a direct
# df_titanic.equals(df_reset) is False. Re-select the columns in the original
# order before comparing; this is expected to print True.
print("\nAre df_titanic and df_reset identical once the original column order is restored?")
print(df_titanic.equals(df_reset[df_titanic.columns]))

Titanic dataset loaded successfully!

--- df_by_name (Name as Index) Head ---
                                                    PassengerId  Survived  \
Name                                                                        
Braund, Mr. Owen Harris                                       1         0   
Cumings, Mrs. John Bradley (Florence Briggs Tha...            2         1   
Heikkinen, Miss. Laina                                        3         1   
Futrelle, Mrs. Jacques Heath (Lily May Peel)                  4         1   
Allen, Mr. William Henry                                      5         0   

                                                    Pclass     Sex   Age  \
Name                                                                       
Braund, Mr. Owen Harris                                  3    male  22.0   
Cumings, Mrs. John Bradley (Florence Briggs Tha...       1  female  38.0   
Heikkinen, Miss. Laina                                   3  female  26.0   
Fu

In [12]:
# .reindex
# ===============
import pandas as pd

# --- 1. Define Sales Data for January and February ---

# January Sales Data: Stores A, B, C
jan_sales_data = {
    'Store A': 1500,
    'Store B': 2000,
    'Store C': 1800
}
s_jan = pd.Series(jan_sales_data, name='January Sales')

print("--- January Sales ---")
print(s_jan)
print("-" * 30)

# February Sales Data: Store B was closed, so no sales entry for it
feb_sales_data = {
    'Store A': 1700,
    'Store C': 1950
}
s_feb = pd.Series(feb_sales_data, name='February Sales')

print("\n--- February Sales (Store B is missing) ---")
print(s_feb)
print("-" * 30)

# --- 2. Define a Common Index ---
# This index includes all stores present across both months
common_index = ['Store A', 'Store B', 'Store C']

# --- 3. Reindex February Sales to be Compatible with January Sales ---
# We use reindex() to align s_feb with the common_index.
# For any index label in common_index that is NOT in s_feb (like 'Store B'),
# fill its value with 0, because the store was closed.
s_feb_reindexed = s_feb.reindex(common_index, fill_value=0)

print("\n--- February Sales Reindexed (using common_index and fill_value=0) ---")
print(s_feb_reindexed)
print("-" * 30)

# --- 4. Find the Difference in Sales Accurately Between Two Months ---
# Now that both Series have the same index and missing values are handled (filled with 0),
# we can perform direct element-wise subtraction.
sales_difference = s_feb_reindexed - s_jan

print("\n--- Sales Difference (February - January) ---")
print(sales_difference)
print("-" * 30)

# --- 5. Interpret the Result ---
# The wording is derived from the data (sign of the change, and whether the
# store had a February entry at all), so it stays correct if the numbers above
# are edited.
print("\n--- Interpretation of Results ---")
for store, change in sales_difference.items():
    closure_note = " (due to closure)" if store not in s_feb.index else ""
    if change < 0:
        print(f"{store}: Sales decreased by ${abs(change):.2f}{closure_note}")
    else:
        print(f"{store}: Increased sales by ${change:.2f}{closure_note}")

--- January Sales ---
Store A    1500
Store B    2000
Store C    1800
Name: January Sales, dtype: int64
------------------------------

--- February Sales (Store B is missing) ---
Store A    1700
Store C    1950
Name: February Sales, dtype: int64
------------------------------

--- February Sales Reindexed (using common_index and fill_value=0) ---
Store A    1700
Store B       0
Store C    1950
Name: February Sales, dtype: int64
------------------------------

--- Sales Difference (February - January) ---
Store A     200
Store B   -2000
Store C     150
dtype: int64
------------------------------

--- Interpretation of Results ---
Store A: Increased sales by $200.00
Store B: Sales decreased by $2000.00 (due to closure)
Store C: Increased sales by $150.00


In [None]:
'''

Explanation:
Initial Series (s_jan, s_feb): Notice that s_feb naturally doesn't have an entry for 
'Store B' because it was closed.
Attempting Direct Subtraction (Without reindex): If you tried s_feb - s_jan directly, 
Pandas would align them by index. For 'Store B', since it exists in s_jan but not s_feb, 
it would result in NaN in the difference, which isn't what we want for "closed" sales (we want 0).

'''

In [None]:
# What would happen without reindex() for Store B:
# print(s_feb - s_jan)
# Store A     200.0
# Store B     NaN  <-- Problem!
# Store C     150.0
# dtype: float64

In [None]:
'''
common_index: We explicitly define all the labels we expect to see in our final aligned data.


s_feb.reindex(common_index, fill_value=0):
This creates a new Series based on s_feb.
It aligns s_feb to common_index.
For 'Store B', which is in common_index but not in s_feb, reindex inserts this label and
assigns it the fill_value of 0. This accurately represents "zero sales" for the closed store.
Existing labels ('Store A', 'Store C') retain their values.


sales_difference = s_feb_reindexed - s_jan: Now, both s_feb_reindexed and s_jan have the exact same index (['Store A', 'Store B', 'Store C']), allowing for a correct element-wise subtraction that accounts for Store B's closure as zero sales.
This example clearly shows how reindex() with fill_value is essential for performing accurate calculations across datasets that might have misaligned or missing entries for certain labels.

'''

In [None]:
#  what if January has store A, B and C but Feb has stores A, C and D.

'''

Okay, this is a more complex and very realistic scenario for data alignment! When 
neither dataset contains all the unique labels present across both, you need to reindex 
both Series (or DataFrames) to a comprehensive common index.

Here's how to handle it:


'''

In [13]:
import pandas as pd

# --- 1. Define Sales Data for January and February ---

# January Sales Data: Stores A, B, C
jan_sales_data = {
    'Store A': 1500,
    'Store B': 2000,
    'Store C': 1800
}
s_jan = pd.Series(jan_sales_data, name='January Sales')

print("--- January Sales ---")
print(s_jan)
print("-" * 30)

# February Sales Data: Stores A, C, D (Store B is gone, Store D is new)
feb_sales_data = {
    'Store A': 1700,
    'Store C': 1950,
    'Store D': 1200 # New store D
}
s_feb = pd.Series(feb_sales_data, name='February Sales')

print("\n--- February Sales (Store B is missing, Store D is new) ---")
print(s_feb)
print("-" * 30)

# --- 2. Determine the Common Index ---
# The common index should be the union of all store names from both months.
# We can get this by taking the union of their indices.
common_index = s_jan.index.union(s_feb.index)

print("\n--- Common Index for both months ---")
print(common_index.tolist()) # Convert to list for cleaner printing
print("-" * 30)

# --- 3. Reindex BOTH January and February Sales to the Common Index ---
# For any store that didn't exist in a given month, its sales should be 0.

s_jan_reindexed = s_jan.reindex(common_index, fill_value=0)
s_feb_reindexed = s_feb.reindex(common_index, fill_value=0)

print("\n--- January Sales Reindexed (with Store D filled as 0) ---")
print(s_jan_reindexed)
print("-" * 30)

print("\n--- February Sales Reindexed (with Store B filled as 0) ---")
print(s_feb_reindexed)
print("-" * 30)

# --- 4. Find the Difference in Sales Accurately Between Two Months ---
# Now that both Series are perfectly aligned with the same index and 0s for missing data,
# we can perform direct element-wise subtraction.
sales_difference = s_feb_reindexed - s_jan_reindexed

print("\n--- Sales Difference (February - January) ---")
print(sales_difference)
print("-" * 30)

# --- 5. Interpret the Result ---
# The wording is derived from the data: the sign of the change picks the verb,
# and membership in the ORIGINAL (un-reindexed) indexes tells us whether a
# store closed or is new, so the text stays correct if the numbers are edited.
print("\n--- Interpretation of Results ---")
for store, change in sales_difference.items():
    if store not in s_feb.index:
        note = " (closed in Feb)"
    elif store not in s_jan.index:
        note = " (new in Feb)"
    else:
        note = ""
    if change < 0:
        print(f"{store}: Sales decreased by ${abs(change):.2f}{note}")
    else:
        print(f"{store}: Increased sales by ${change:.2f}{note}")

--- January Sales ---
Store A    1500
Store B    2000
Store C    1800
Name: January Sales, dtype: int64
------------------------------

--- February Sales (Store B is missing, Store D is new) ---
Store A    1700
Store C    1950
Store D    1200
Name: February Sales, dtype: int64
------------------------------

--- Common Index for both months ---
['Store A', 'Store B', 'Store C', 'Store D']
------------------------------

--- January Sales Reindexed (with Store D filled as 0) ---
Store A    1500
Store B    2000
Store C    1800
Store D       0
Name: January Sales, dtype: int64
------------------------------

--- February Sales Reindexed (with Store B filled as 0) ---
Store A    1700
Store B       0
Store C    1950
Store D    1200
Name: February Sales, dtype: int64
------------------------------

--- Sales Difference (February - January) ---
Store A     200
Store B   -2000
Store C     150
Store D    1200
dtype: int64
------------------------------

--- Interpretation of Results ---
Store 

In [None]:
'''

Explanation of Changes:

Modified s_feb: Now it explicitly includes 'Store D' and implicitly lacks 'Store B'.

common_index = s_jan.index.union(s_feb.index): This is the crucial part.
s_jan.index gives Index(['Store A', 'Store B', 'Store C'], dtype='object').
s_feb.index gives Index(['Store A', 'Store C', 'Store D'], dtype='object').

The .union() method on these indexes efficiently computes the unique labels that are present in 
either index, resulting in Index(['Store A', 'Store B', 'Store C', 'Store D'], dtype='object'). 
This creates our comprehensive common index.

Reindex Both Series: We apply reindex(common_index, fill_value=0) to both s_jan and s_feb.
s_jan_reindexed now has 'Store D' with a value of 0 (since it had no sales in Jan).
s_feb_reindexed now has 'Store B' with a value of 0 (since it had no sales in Feb).

Accurate Subtraction: With both Series now perfectly aligned and all relevant "missing" data 
explicitly represented as 0s, the subtraction s_feb_reindexed - s_jan_reindexed gives 
accurate differences for all stores, including those that appeared or disappeared.


'''

In [None]:
# Reindexing with Columns:
########################################


'''
Certainly! Reindexing isn't just for rows; it's equally powerful for aligning columns in DataFrames.
 This is particularly useful when you have different sets of columns in various DataFrames and 
 want to combine or compare them consistently.

Let's imagine you have product data where some products have 'Price' and 'Stock' information,
 and another DataFrame (perhaps from a different source) has similar data but with a different 
 set of columns, or the columns are in a different order.

Here's an example demonstrating reindex() with columns:

Reindexing with Columns
Imagine you have two DataFrames representing product data. df_current_month has 'Price' 
and 'Stock' for various products, but df_last_month only has 'Price' and introduces a 'Discount'
column that isn't in the current month's data. Our goal is to align the columns of df_last_month 
to match df_current_month, filling in missing 'Stock' values with 0.

'''

import pandas as pd

# --- 1. Define Product DataFrames ---

# Current month's product data
data_current = {
    'ProductID': ['P101', 'P102', 'P103', 'P104'],
    'Price': [25.50, 12.00, 75.00, 5.25],
    'Stock': [100, 50, 20, 200]
}
df_current_month = pd.DataFrame(data_current).set_index('ProductID')

print("--- df_current_month (Current Month's Product Data) ---")
print(df_current_month)
print("-" * 50)

# Last month's product data (missing 'Stock', has 'Discount')
data_last = {
    'ProductID': ['P101', 'P102', 'P103', 'P105'], # P105 is only here, P104 is missing
    'Price': [24.99, 11.50, 72.00, 4.99],
    'Discount': [0.05, 0.02, 0.10, 0.0]
}
df_last_month = pd.DataFrame(data_last).set_index('ProductID')

print("\n--- df_last_month (Last Month's Product Data - Different Columns) ---")
print(df_last_month)
print("-" * 50)

# --- 2. Define the Desired Common Columns ---
# We want df_last_month to have the same columns as df_current_month,
# in the same order.
desired_columns = ['Price', 'Stock']

# --- 3. Reindex df_last_month's Columns ---
# We'll reindex the columns of df_last_month to match `desired_columns`.
# For any column in `desired_columns` that is NOT in df_last_month (like 'Stock'),
# we'll fill its values with 0, as there was no stock recorded last month in this view.
# Notice the `columns=` keyword: it makes reindex() act on the column labels
# instead of the (default) row labels.
df_last_month_reindexed_cols = df_last_month.reindex(columns=desired_columns, fill_value=0)

print(f"\n--- df_last_month after reindexing columns to '{desired_columns}' ---")
print(df_last_month_reindexed_cols)
print("-" * 50)

# --- 4. Align Rows and Then Calculate Difference (Example) ---
# Now that columns are compatible, you might want to align rows (ProductID) too
# before doing calculations. Let's get a union of all ProductIDs.
all_product_ids = df_current_month.index.union(df_last_month.index)

# A product that is absent from a month gets Stock 0, but its Price is left as
# NaN: an unknown price is not a price of 0, and filling it with 0 would report
# a bogus "price change" equal to the whole price. Stock is cast back to int64
# because the temporary NaN turns the column into floats.
stock_when_absent = {'Stock': 0}
df_current_aligned = (
    df_current_month
    .reindex(all_product_ids)
    .fillna(stock_when_absent)
    .astype({'Stock': 'int64'})
)
df_last_aligned = (
    df_last_month_reindexed_cols  # Use the col-reindexed one
    .reindex(all_product_ids)
    .fillna(stock_when_absent)
    .astype({'Stock': 'int64'})
)

print("\n--- df_current_month aligned by all ProductIDs ---")
print(df_current_aligned)

print("\n--- df_last_month (after both column and row reindexing) ---")
print(df_last_aligned)
print("-" * 50)

# Now we can calculate the changes. Price change is NaN for products that do
# not appear in both months, because there is nothing to compare against.
price_change = df_current_aligned['Price'] - df_last_aligned['Price']
stock_change = df_current_aligned['Stock'] - df_last_aligned['Stock']

print("\n--- Price Change (Current Month - Last Month) ---")
print(price_change)

print("\n--- Stock Change (Current Month - Last Month) ---")
print(stock_change)

'''

Explanation:


Initial DataFrames:
df_current_month has Price and Stock columns.
df_last_month has Price and Discount columns (no Stock).
desired_columns: We define the precise list of column names (and their order) 
that we want our target DataFrame (df_last_month) to have.

df_last_month.reindex(columns=desired_columns, fill_value=0):
columns=desired_columns: This is the key part that tells reindex to operate on the columns
 (instead of the default rows).

fill_value=0: For any column in desired_columns that was not present in df_last_month
 (i.e., 'Stock'), Pandas creates that column and fills all its values with 0.
Any columns in df_last_month that are not in desired_columns (i.e., 'Discount') 
are dropped by default.

Full Alignment for Calculation: After reindexing the columns, I added a step to also
 reindex the rows (ProductID) of both DataFrames to a common set of IDs. This ensures 
 that when you perform an arithmetic operation like subtraction, Pandas can perfectly 
 align both the rows and columns, giving you accurate changes.
This example clearly shows how reindex() with the columns parameter provides precise 
control over DataFrame column structure, making disparate datasets compatible for analysis.


'''

In [None]:
# iterrows
#################


'''

The df.iterrows() method in Pandas is a generator that iterates over the rows of a DataFrame as (index, Series) pairs. For each row, it returns the row's index and a Pandas Series containing the row's data.

When to use iterrows()
iterrows() is useful when you need to process each row of a DataFrame individually and access both its index and its data.

Important Considerations (and why it's often not recommended)
While iterrows() seems intuitive, it's generally not the most performant or "Pandas-idiomatic" way to iterate through rows, especially for large DataFrames. Here's why:

Returns a Series for each row: Creating a Series for each row can be computationally expensive compared to other methods.
Type conversion: Each row is returned as a single Series, so the original column dtypes are not preserved across the row. Pandas upcasts the row's values to one common dtype, which can lead to unexpected type changes (e.g., an integer value becoming a float when the row also contains float columns, or every value becoming object when the row mixes numbers and strings).
Performance: For large DataFrames, explicit Python loops with iterrows() are significantly slower than vectorized Pandas operations.
Rule of Thumb: If you find yourself using iterrows() to perform calculations or manipulations across rows, first consider if there's a vectorized Pandas operation (e.g., column arithmetic, apply(), groupby(), agg(), transform()) that can achieve the same result more efficiently.

Example Usage
Let's use the Titanic dataset to demonstrate iterrows().

'''

import pandas as pd

# Load the Titanic dataset.
# On failure the friendly hint is printed and the exception is re-raised, so the
# cell stops here with the real cause instead of continuing and failing later
# with a NameError (or silently reusing a df_titanic left over in the kernel).
titanic_dataset_github_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv"
try:
    df_titanic = pd.read_csv(titanic_dataset_github_url)
except Exception as e:
    print(f"Error loading CSV file: {e}")
    print("Please check the URL or your internet connection.")
    raise
else:
    # Only reached when the download and parse succeeded.
    print("Titanic dataset loaded successfully!")
    print("\n--- First 5 rows of Titanic DataFrame ---")
    print(df_titanic.head())
    print("-" * 50)


print("\n--- Iterating through the first 5 rows with df.iterrows() ---")
# .head() keeps the demonstration to the first five rows.
# iterrows() yields (index label, row-as-Series) pairs.
preview_rows = df_titanic.head()
for row_label, passenger in preview_rows.iterrows():
    name = passenger['Name']
    age = passenger['Age']
    survived = passenger['Survived']
    print(f"Index: {row_label}")
    print(f"  Name: {name}")
    print(f"  Age: {age}")
    print(f"  Survived: {survived}")
    # Only the first few entries of the row Series are shown
    print(f"  Row (Series):\n{passenger.head()}")
    print("-" * 20)

print("\n--- Example: Using iterrows() to create a new column (for demonstration ONLY, not recommended for large datasets) ---")

# Goal: a simple 'Age_Group' label per passenger, built row by row.
# Slow compared with column-wise operations, but it shows how iterrows() works.
# Work on an independent copy of the first 10 rows so df_titanic is not modified.
df_temp = df_titanic.head(10).copy()

age_groups = []
for _, passenger in df_temp.iterrows():
    age = passenger['Age']
    if pd.isna(age):
        label = "Unknown"
    elif age < 18:
        label = "Child"
    elif age < 60:  # already known to be >= 18 here
        label = "Adult"
    else:
        label = "Senior"
    age_groups.append(label)

# The list has one label per row of df_temp, in row order
df_temp['Age_Group_Iterrows'] = age_groups
print(df_temp[['Name', 'Age', 'Age_Group_Iterrows']])

print("\n--- Preferred (Vectorized) Way to Create 'Age_Group' ---")
# Same labelling rule as the loop version, expressed as a function of one age
def assign_age_group(age):
    """Return the age-group label for a single age value.

    Missing ages (anything pd.isna() reports as missing) map to "Unknown",
    ages below 18 to "Child", ages from 18 up to (not including) 60 to
    "Adult", and everything else to "Senior".
    """
    if pd.isna(age):
        return "Unknown"
    if age < 18:
        return "Child"
    return "Adult" if age < 60 else "Senior"

# Truly vectorized version: np.select evaluates each condition on the whole
# 'Age' column at once and, per row, takes the label of the FIRST condition
# that is True (falling back to `default`). `.apply(assign_age_group)` gives
# the same labels but still calls a Python function once per element.
passenger_age = df_titanic['Age']
df_titanic['Age_Group_Vectorized'] = np.select(
    [passenger_age.isna(), passenger_age < 18, passenger_age < 60],
    ["Unknown", "Child", "Adult"],
    default="Senior",
)
print(df_titanic[['Name', 'Age', 'Age_Group_Vectorized']].head(10)) # Display first 10 rows

'''
When iterrows() might be acceptable:
Small DataFrames: If your DataFrame has only a few hundred or a few thousand rows, the performance penalty might be negligible, and the clarity of a simple loop can be fine.
Complex Row-wise Logic: When the logic for each row is genuinely complex and doesn't map easily to vectorized operations or apply() (e.g., depending on multiple values from previous rows in a non-trivial way, or involving external API calls per row).
Debugging: It can be useful for debugging or inspecting specific rows individually.
For most common data manipulation tasks, especially on larger datasets, explore vectorized operations (df['col'] * 2), .apply() (when a Python function needs to be applied element-wise or row-wise), or groupby() before resorting to iterrows().


'''

In [None]:
# itertuples
##############


'''

The df.itertuples() method in Pandas is another way to iterate over the rows of a DataFrame, similar to iterrows(), but it is generally much faster and more efficient, especially for large DataFrames.

How itertuples() Works
itertuples() iterates over the rows of a DataFrame and returns them as named tuples. Each named tuple behaves much like a regular Python tuple but allows you to access elements by attribute name (column name) as well as by numerical position.

Advantages of itertuples() over iterrows()
Performance: itertuples() is significantly faster than iterrows() because it doesn't create a Pandas Series object for each row, which is a relatively expensive operation.
Type Preservation: Named tuples generally preserve the data types of the original DataFrame columns better than the Series objects returned by iterrows().
Attribute Access: You can access column values using dot notation (e.g., row.Age) which can make your code more readable than dictionary-style access (row['Age']).
Syntax
df.itertuples(index=True, name='Pandas')

index (bool, default True): If True, the first element of the tuple will be the index of the row. If False, the index is omitted.
name (str, default 'Pandas'): The name of the named tuple. You can change this to something more descriptive for your specific data.
Example Usage
Let's use the Titanic dataset to demonstrate itertuples().

'''

import pandas as pd

# Load the Titanic dataset
titanic_dataset_github_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv"
try:
    df_titanic = pd.read_csv(titanic_dataset_github_url)
except Exception as e:
    print(f"Error loading CSV file: {e}")
    print("Please check the URL or your internet connection.")
    # Re-raise after printing the hint: every statement below needs df_titanic,
    # and carrying on would either end in a NameError or silently reuse a stale
    # DataFrame left in the kernel by an earlier cell.
    raise
else:
    # Only reached when the download and parse succeeded.
    print("Titanic dataset loaded successfully!")
    print("\n--- First 5 rows of Titanic DataFrame ---")
    print(df_titanic.head())
    print("-" * 50)


print("\n--- Iterating through the first 5 rows with df.itertuples() ---")
# Walk only the head of the frame so the demo output stays short.
first_rows = df_titanic.head()
row_separator = "-" * 20
for row in first_rows.itertuples():
    # With the default index=True the row's index label is exposed as `Index`;
    # each column used below is read as an attribute of the row tuple.
    print(
        f"Row Tuple: {row}",
        f"  Index: {row.Index}",
        f"  Name: {row.Name}",
        f"  Age: {row.Age}",
        f"  Survived: {row.Survived}",
        row_separator,
        sep="\n",
    )
    # Positional access works too (row[1] is PassengerId, because position 0
    # holds the index), but attribute access usually reads better.


print("\n--- Example: Using itertuples() to calculate a new column (better performance than iterrows()) ---")

# Simplified 'Fare_Per_Person': the fare divided by the passenger plus the
# relatives travelling with them (SibSp = siblings/spouses, Parch = parents/children).
# A vectorized expression is the better tool for this; the explicit loop is kept
# only to show itertuples() in action.
fare_per_person = []
for row in df_titanic.itertuples(index=False):  # index=False: the DataFrame index is not needed here
    if pd.isna(row.Fare) or pd.isna(row.SibSp) or pd.isna(row.Parch):
        # Any missing input leaves the ratio undefined for this passenger.
        fare_per_person.append(np.nan)
        continue
    num_people_in_group = row.SibSp + row.Parch + 1
    # The group size should always be at least 1; the 0 fallback only guards
    # against a division by a non-positive size.
    fare_per_person.append(row.Fare / num_people_in_group if num_people_in_group > 0 else 0)

# Exactly one value was appended per row, so the list has the same length as df_titanic.
df_titanic['Fare_Per_Person_Itertuples'] = fare_per_person
print(df_titanic[['Name', 'Fare', 'SibSp', 'Parch', 'Fare_Per_Person_Itertuples']].head(10))

print("\n--- Preferred (Vectorized) Way to Calculate 'Fare_Per_Person' ---")
# The same calculation expressed as whole-column arithmetic: no Python-level loop.
df_titanic['Num_People_In_Group'] = df_titanic['SibSp'].add(df_titanic['Parch']).add(1)
df_titanic['Fare_Per_Person_Vectorized'] = df_titanic['Fare'].div(df_titanic['Num_People_In_Group'])

vectorized_preview_columns = ['Name', 'Fare', 'SibSp', 'Parch', 'Fare_Per_Person_Vectorized']
print(df_titanic[vectorized_preview_columns].head(10))



'''

When to Use itertuples():
Looping is unavoidable: If your row-wise logic is too complex for vectorized operations or apply() (e.g., depends on state from previous rows, external lookups per row, highly conditional logic across many columns).
Performance is crucial: When you need to iterate, and iterrows() is too slow.
Attribute access is preferred: When you like accessing columns by row.column_name.
Preserving dtypes: When type preservation is important in your loop.
Best Practice Reminder: Always try to use vectorized Pandas operations first, as they are almost always the fastest. If you must loop, itertuples() is generally a better choice than iterrows() for performance and type integrity.


'''

In [None]:
# Do we have itercolumns in pandas?

'''

Pandas does not have a direct df.itercolumns() method in the same way it has df.iterrows() or df.itertuples().

However, there are several very straightforward and idiomatic ways to iterate through columns (or access columns) in a Pandas DataFrame:

1. Iterating through Column Names (Most Common)
The simplest way to iterate through columns is to iterate directly over the DataFrame's columns attribute, which is an Index object containing all column names. You can then use the column name to access the corresponding Series.

Python

import pandas as pd

data = {'col1': [1, 2, 3], 'col2': [4, 5, 6], 'col3': [7, 8, 9]}
df = pd.DataFrame(data)

print("--- Iterating through column names ---")
for col_name in df.columns:
    print(f"Column Name: {col_name}")
    print(f"Column Data (Series):\n{df[col_name].head()}") # Access the column Series
    print("-" * 20)
2. Iterating through (Column Name, Series) Pairs (df.items() / df.iteritems())
This method is the direct equivalent of df.iterrows() but for columns. It yields (column_name, Series) pairs. Use df.items(): the older alias df.iteritems() was deprecated in Pandas 1.5.0 and removed in Pandas 2.0.0, so it raises an AttributeError on current versions.

Python

import pandas as pd

data = {'col1': [1, 2, 3], 'col2': [4, 5, 6], 'col3': [7, 8, 9]}
df = pd.DataFrame(data)

print("\n--- Iterating through (column name, Series) pairs with df.items() ---")
for col_name, col_data in df.items():
    print(f"Column Name: {col_name}")
    print(f"Column Data (Series):\n{col_data.head()}")
    print("-" * 20)
3. Using .apply() with axis=1 (for row-wise operations) or axis=0 (for column-wise operations)
While not direct iteration in the sense of a Python for loop, .apply() is a powerful way to perform operations on columns (or rows).

axis=0 (default): Applies a function column-wise (i.e., the function receives each column as a Series).
axis=1: Applies a function row-wise (i.e., the function receives each row as a Series).
Python

import pandas as pd
import numpy as np

data = {'A': [10, 20, 30], 'B': [1, 2, 3], 'C': [5, 15, 25]}
df = pd.DataFrame(data)

print("\n--- Using .apply(axis=0) to perform column-wise operations ---")

# Example: Calculate the mean of each column
column_means = df.apply(np.mean, axis=0)
print("Mean of each column:\n", column_means)

# Example: Find the max value in each column
column_maxes = df.apply(lambda x: x.max(), axis=0)
print("\nMax value in each column:\n", column_maxes)
Why no itercolumns()?
The reason Pandas doesn't have a dedicated itercolumns() method is likely because iterating through column names (df.columns) and then accessing the columns (df[col_name]) is already very explicit, clear, and efficient. Also, df.items() directly provides the (column_name, Series) pairs, covering the common need for such iteration.

As with rows, it's generally best practice to use vectorized operations (operations directly on entire Pandas Series or DataFrames) whenever possible for performance, rather than explicit Python loops, even when iterating through columns.


'''

In [None]:
# Feature creation with iterrows - append function and also with apply - lambda functions. 

'''

Let's create a sample dataset and then demonstrate feature creation using 
both the iterrows() (less efficient but illustrative) and the apply() with lambda (more efficient and idiomatic) methods.

'''
# 1. Create the Initial Dataset


import pandas as pd

# Small sample frame (five people with their ages) used by the feature-creation demos
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    Age=[25, 32, 19, 45, 28],
)
df = pd.DataFrame(data)

print("--- Original DataFrame ---", df, "-" * 30, sep="\n")


# 2. Feature Creation using iterrows() and append() (Less Efficient)

'''
This method involves explicit looping through rows and is generally discouraged 
for large datasets due to performance overhead, as it operates row by row in Python.

'''

# Collects one category label per row, in row order
categories_iterrows = []

# iterrows() yields (index label, row Series) pairs; only the row is needed here
for index, row in df.iterrows():
    label = 'Senior' if row['Age'] > 30 else 'Junior'
    categories_iterrows.append(label)

# Store the collected labels as the new 'Category_Iterrows' column
df['Category_Iterrows'] = categories_iterrows

print("\n--- DataFrame with 'Category_Iterrows' (using iterrows) ---", df, "-" * 30, sep="\n")


'''
Explanation:

We create an empty list categories_iterrows.
df.iterrows() yields (index, row_Series) pairs.
For each row (which is a Pandas Series), we access its 'Age' value.
Based on the condition (row['Age'] > 30), we append the appropriate category ('Senior' or 'Junior') to our list.
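Finally, we assign the populated list to a new column 'Category_Iterrows'. A plain Python list carries no index, so Pandas assigns its values by position (first value to the first row, and so on); the list must therefore contain exactly one entry per row, in row order.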
3. Feature Creation using lambda and apply() (More Efficient and Idiomatic)
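This is the preferred Pandas way for element-wise or row-wise transformations when no built-in vectorized operation fits. Note that Series.apply() still calls the Python function once per element, so it is not truly vectorized: it is usually much faster than iterrows() because it avoids building a Series for every row, but it is slower than native vectorized operations.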

'''

# classify_age maps one age to its label: above 30 is 'Senior', anything else is 'Junior'.
# The lambda's single parameter `age` receives each element of the 'Age' Series in turn.
classify_age = lambda age: 'Senior' if age > 30 else 'Junior'

# Series.apply calls classify_age on every value of df['Age'] and collects the
# results into a new Series, which becomes the 'Category_Apply' column.
age_categories = df['Age'].apply(classify_age)
df['Category_Apply'] = age_categories

print("\n--- DataFrame with 'Category_Apply' (using lambda and apply) ---", df, "-" * 30, sep="\n")

'''
Explanation:

We define classify_age as a lambda function. A lambda function is a small, anonymous function. Here, it takes one argument (age) and returns 'Senior' if age is greater than 30, otherwise 'Junior'.
df['Age'].apply(classify_age) tells Pandas to take the 'Age' Series, pass each individual value from that Series to our classify_age function, and collect all the returned results into a new Series.
This new Series is then directly assigned to the new column 'Category_Apply'.


Comparison:
iterrows(): More readable for complex, multi-column row-by-row logic, but slow. Avoid for simple transformations.
apply() with lambda: Excellent balance of readability and performance for row-wise or element-wise operations. It's generally preferred over iterrows() for feature creation.
Vectorized Operations (Even Better!): For very simple numeric operations, direct vectorized operations are the fastest. For this specific 'Senior'/'Junior' classification, you could even do it like this (most efficient):


# Most efficient way for this specific logic
df['Category_Vectorized'] = 'Junior' # Default all to Junior
df.loc[df['Age'] > 30, 'Category_Vectorized'] = 'Senior' # Overwrite where condition is met
print("\n--- DataFrame with 'Category_Vectorized' (most efficient) ---")
print(df)
In summary, always aim for vectorized solutions first, then apply() with lambda for more complex row-wise logic, and only resort to iterrows() as a last resort or for very small datasets or debugging.

'''

In [15]:
# df.sort_values() method


'''
This is a very common and useful operation for exploring data.

df.sort_values(by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last', ignore_index=False, key=None)
Key Parameters:

by (required): The name or list of names of the columns to sort by. This is the primary parameter.
ascending (bool or list of bool, default True): Sort ascending vs. descending.
If True, sorts in ascending order.
If False, sorts in descending order.
If a list of booleans, it must match the length of by, specifying the order for each column.
inplace (bool, default False): If True, the operation is performed in place (modifies the original DataFrame) and returns None. If False, it returns a new sorted DataFrame.
na_position (str, default 'last'): Controls the position of NaN (missing) values.
'first': Puts NaNs at the beginning.
'last': Puts NaNs at the end.

'''

import pandas as pd

# Define the URL for the Titanic dataset
titanic_dataset_github_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv"

# Read the CSV file into a Pandas DataFrame
try:
    df_titanic = pd.read_csv(titanic_dataset_github_url)
except Exception as e:
    print(f"Error loading CSV file: {e}")
    print("Please check the URL or your internet connection.")
    # Re-raise after printing the hint: every sort below needs df_titanic, and
    # carrying on would either end in a NameError or silently reuse a stale
    # DataFrame left in the kernel by an earlier cell.
    raise
else:
    # Only reached when the download and parse succeeded.
    print("Titanic dataset loaded successfully!")
    print("\n--- Original DataFrame Head ---")
    print(df_titanic[['Name', 'Age', 'Fare', 'Pclass']].head(10)) # Show relevant columns
    print("-" * 60)

# Shared by the four demos below: the columns to display and the divider line.
preview_columns = ['Name', 'Age', 'Fare', 'Pclass']
section_divider = "-" * 60

# --- 1. Sorting by a Single Column (Age, ascending) ---
# sort_values returns a new DataFrame; ascending order is the default.
df_sorted_by_age = df_titanic.sort_values('Age')

print("\n--- Sorted by Age (Ascending) ---")
print(df_sorted_by_age[preview_columns].head(10))  # first 10 rows only
print(section_divider)

# --- 2. Sorting by a Single Column (Fare, descending) ---
df_sorted_by_fare_desc = df_titanic.sort_values('Fare', ascending=False)

print("\n--- Sorted by Fare (Descending) ---")
print(df_sorted_by_fare_desc[preview_columns].head(10))
print(section_divider)

# --- 3. Sorting by Multiple Columns (Pclass ascending, then Age descending) ---
# Column order matters: rows are ordered by the first key, and ties on that key
# are broken by the next one. The `ascending` list pairs up with the keys.
df_sorted_multi = df_titanic.sort_values(['Pclass', 'Age'], ascending=[True, False])

print("\n--- Sorted by Pclass (Asc) then Age (Desc) ---")
print(df_sorted_multi[preview_columns].head(10))
print(section_divider)

# --- 4. Handling Missing Values ('na_position') ---
# 'Age' contains NaN values. They are placed last by default;
# na_position='first' moves them to the top instead.
df_sorted_age_nan_first = df_titanic.sort_values('Age', na_position='first')

print("\n--- Sorted by Age (NaNs First) ---")
print(df_sorted_age_nan_first[preview_columns].head(10))  # NaN ages appear at the top
print(section_divider)

# --- 5. Sorting In-place (Modifying the original DataFrame) ---
# inplace=True reorders df_titanic itself and returns None, so use it with caution.
print("\n--- Sorting In-place (df_titanic modified directly) ---")
print("df_titanic head BEFORE in-place sort:")
print(df_titanic[['Name', 'Age']].head())

# The freshly loaded frame is already in PassengerId order, so sorting by
# PassengerId alone would show no change. Sort by 'Age' first to make the
# in-place modification visible.
df_titanic.sort_values(by='Age', inplace=True)

print("\ndf_titanic head AFTER in-place sort by Age:")
print(df_titanic[['Name', 'Age']].head())

# Sort by the original ID to put the rows back in their loaded order.
df_titanic.sort_values(by='PassengerId', inplace=True)

print("\ndf_titanic head AFTER in-place sort by PassengerId:")
print(df_titanic[['Name', 'Age']].head())
print("-" * 60)



Titanic dataset loaded successfully!

--- Original DataFrame Head ---
                                                Name   Age     Fare  Pclass
0                            Braund, Mr. Owen Harris  22.0   7.2500       3
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  38.0  71.2833       1
2                             Heikkinen, Miss. Laina  26.0   7.9250       3
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  35.0  53.1000       1
4                           Allen, Mr. William Henry  35.0   8.0500       3
5                                   Moran, Mr. James   NaN   8.4583       3
6                            McCarthy, Mr. Timothy J  54.0  51.8625       1
7                     Palsson, Master. Gosta Leonard   2.0  21.0750       3
8  Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)  27.0  11.1333       3
9                Nasser, Mrs. Nicholas (Adele Achem)  14.0  30.0708       2
------------------------------------------------------------

--- Sorted by Age (Ascending) --

In [16]:
'''
Filling NaN values before sorting is a common step in data cleaning and preparation, as NaN values are treated specially by sorting algorithms (typically moved to the beginning or end).

Let's fill the NaN values in the 'Age' column of df_titanic with 0 and then sort it.

'''

import pandas as pd
import numpy as np # Often useful for NaN checks

# Define the URL for the Titanic dataset
titanic_dataset_github_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv"

# Read the CSV file into a Pandas DataFrame
try:
    df_titanic = pd.read_csv(titanic_dataset_github_url)
except Exception as e:
    print(f"Error loading CSV file: {e}")
    print("Please check the URL or your internet connection.")
    # Re-raise after printing the hint: the fill-and-sort steps below need
    # df_titanic, and carrying on would either end in a NameError or silently
    # reuse a stale DataFrame left in the kernel by an earlier cell.
    raise
else:
    # Only reached when the download and parse succeeded.
    print("Titanic dataset loaded successfully!")

# --- 1. Inspect 'Age' column before filling NaNs ---
age_before_fill = df_titanic['Age']
print("\n--- 'Age' Column Info Before Filling NaNs ---")
print(age_before_fill.head(10))
print(f"Number of NaN values in 'Age' before fill: {df_titanic['Age'].isnull().sum()}")
print("-" * 60)

# --- 2. Fill NaN values in 'Age' with 0 ---
# assign() returns a new DataFrame whose 'Age' column is replaced by the filled
# Series, so df_titanic keeps its NaN values for any later use.
df_filled_age = df_titanic.assign(Age=df_titanic['Age'].fillna(0))

age_after_fill = df_filled_age['Age']
print("\n--- 'Age' Column Info After Filling NaNs with 0 ---")
print(age_after_fill.head(10))
print(f"Number of NaN values in 'Age' after fill: {df_filled_age['Age'].isnull().sum()}")
print("-" * 60)

# --- 3. Sort the DataFrame by the 'Age' column (ascending) ---
# Ascending is the default. The former NaN rows now hold 0 and therefore sort
# to the top rather than to the end.
df_sorted_age_filled = df_filled_age.sort_values('Age')

print("\n--- DataFrame Sorted by 'Age' (Ascending, NaNs filled with 0) ---")
# The first rows are the ones carrying the 0 placeholder
print(df_sorted_age_filled[['Name', 'Age', 'Fare', 'Pclass']].head(10))
print("-" * 60)

# --- 4. Verify some rows with 0 age (which were previously NaN) ---
# Select the rows whose Age equals the 0 placeholder, limited to the display columns.
has_zero_age = df_sorted_age_filled['Age'] == 0
print("\n--- Some Rows with Age = 0 (previously NaN) ---")
print(df_sorted_age_filled.loc[has_zero_age, ['Name', 'Age', 'Fare', 'Pclass']].head(10))
print("-" * 60)

'''
Explanation:
Original 'Age' Column: We first show the head() of the original 'Age' column and count its NaN values using isnull().sum(). You'll see NaN entries.
df_filled_age['Age'].fillna(0): This is the core operation.
df_filled_age['Age'] selects the 'Age' Series.
.fillna(0) replaces all NaN values within that Series with the integer 0.
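We assign the result back to df_filled_age['Age'] to update the column. Avoid calling fillna with inplace=True on the selected column (df_filled_age['Age'].fillna(0, inplace=True)): it operates on an intermediate object, triggers a FutureWarning in recent Pandas 2.x releases, and will not update the DataFrame once Copy-on-Write is enabled.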
Sorted DataFrame: After filling, when we sort df_filled_age.sort_values(by='Age'), all the rows where 'Age' was previously NaN (and now 0) will appear at the very top of the sorted DataFrame because 0 is the smallest numerical value.
This demonstrates how fillna() is crucial for preparing data when numerical operations or sorting are affected by missing values.

'''

Titanic dataset loaded successfully!

--- 'Age' Column Info Before Filling NaNs ---
0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
5     NaN
6    54.0
7     2.0
8    27.0
9    14.0
Name: Age, dtype: float64
Number of NaN values in 'Age' before fill: 177
------------------------------------------------------------

--- 'Age' Column Info After Filling NaNs with 0 ---
0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
5     0.0
6    54.0
7     2.0
8    27.0
9    14.0
Name: Age, dtype: float64
Number of NaN values in 'Age' after fill: 0
------------------------------------------------------------

--- DataFrame Sorted by 'Age' (Ascending, NaNs filled with 0) ---
                                  Name  Age     Fare  Pclass
28       O'Dwyer, Miss. Ellen "Nellie"  0.0   7.8792       3
593                 Bourke, Miss. Mary  0.0   7.7500       3
457  Kenyon, Mrs. Frederick R (Marion)  0.0  51.8625       1
454                Peduzzi, Mr. Joseph  0.0   8.0500       3
451    Hagland, Mr. Ingval

"\nExplanation:\nOriginal 'Age' Column: We first show the head() of the original 'Age' column and count its NaN values using isnull().sum(). You'll see NaN entries.\ndf_filled_age['Age'].fillna(0): This is the core operation.\ndf_filled_age['Age'] selects the 'Age' Series.\n.fillna(0) replaces all NaN values within that Series with the integer 0.\nWe assign the result back to df_filled_age['Age'] (or use inplace=True) to update the column.\nSorted DataFrame: After filling, when we sort df_filled_age.sort_values(by='Age'), all the rows where 'Age' was previously NaN (and now 0) will appear at the very top of the sorted DataFrame because 0 is the smallest numerical value.\nThis demonstrates how fillna() is crucial for preparing data when numerical operations or sorting are affected by missing values.\n\n"

In [17]:
# sort_values(), where `by` accepts a list of columns (features)


'''

This is a very common and powerful use case for sort_values() in Pandas. You can absolutely specify a list of columns for by and then a corresponding list of boolean values for ascending to control the sort order for each column individually.

Let's use the Titanic dataset to demonstrate this. We'll sort by 'Pclass' in ascending order (1st class first) and then, for passengers within the same 'Pclass', we'll sort by 'Fare' in descending order (most expensive fare first within that class).

Sorting by Multiple Columns with Mixed Order
'''

import pandas as pd

# Define the URL for the Titanic dataset
titanic_dataset_github_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv"

# Read the CSV file into a Pandas DataFrame
try:
    df_titanic = pd.read_csv(titanic_dataset_github_url)
except Exception as e:
    print(f"Error loading CSV file: {e}")
    print("Please check the URL or your internet connection.")
    # Re-raise after printing the hint: the sorting below needs df_titanic, and
    # carrying on would either end in a NameError or silently reuse a stale
    # DataFrame left in the kernel by an earlier cell.
    raise
else:
    # Only reached when the download and parse succeeded.
    print("Titanic dataset loaded successfully!")
    print("\n--- Original DataFrame Head (Relevant Columns) ---")
    print(df_titanic[['Name', 'Pclass', 'Fare', 'Age']].head(10)) # Show relevant columns
    print("-" * 60)

# Sort keys and their directions, paired position by position:
# 'Pclass' ascending first, then 'Fare' descending within each class.
mixed_sort_keys = ['Pclass', 'Fare']
mixed_sort_ascending = [True, False]
df_sorted_mixed = df_titanic.sort_values(by=mixed_sort_keys, ascending=mixed_sort_ascending)

mixed_preview_columns = ['Name', 'Pclass', 'Fare', 'Age']
print("\n--- Sorted by Pclass (Ascending) then Fare (Descending) ---")
print(df_sorted_mixed[mixed_preview_columns].head(15))  # 15 rows, to make the ordering easier to see
print("-" * 60)

print("\n--- Let's look at a specific Pclass (e.g., Pclass 1) to confirm Fare sorting ---")
# Keep only the Pclass 1 rows; their Fare values should run from high to low
is_first_class = df_sorted_mixed['Pclass'] == 1
pclass_1_sorted = df_sorted_mixed.loc[is_first_class]
print(pclass_1_sorted[mixed_preview_columns].head(10))
print("-" * 60)


'''
Explanation:
by=['Pclass', 'Fare']: We provide a list of column names. Pandas will first sort the entire DataFrame by the Pclass column.
ascending=[True, False]: This list directly corresponds to the by list.
True for Pclass means it will sort in ascending order (1st class passengers appear before 2nd class, which appear before 3rd class).
False for Fare means that within each Pclass group, passengers will be sorted by their Fare in descending order (most expensive fares first).
As you can see in the output, the DataFrame is first grouped by Pclass (1, then 2, then 3), and within each of those groups, the Fare values are arranged from highest to lowest. This level of control is incredibly useful for organizing your data exactly how you need it for analysis.

'''

Titanic dataset loaded successfully!

--- Original DataFrame Head (Relevant Columns) ---
                                                Name  Pclass     Fare   Age
0                            Braund, Mr. Owen Harris       3   7.2500  22.0
1  Cumings, Mrs. John Bradley (Florence Briggs Th...       1  71.2833  38.0
2                             Heikkinen, Miss. Laina       3   7.9250  26.0
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)       1  53.1000  35.0
4                           Allen, Mr. William Henry       3   8.0500  35.0
5                                   Moran, Mr. James       3   8.4583   NaN
6                            McCarthy, Mr. Timothy J       1  51.8625  54.0
7                     Palsson, Master. Gosta Leonard       3  21.0750   2.0
8  Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)       3  11.1333  27.0
9                Nasser, Mrs. Nicholas (Adele Achem)       2  30.0708  14.0
------------------------------------------------------------

--- Sorted by

"'\nExplanation:\nby=['Pclass', 'Fare']: We provide a list of column names. Pandas will first sort the entire DataFrame by the Pclass column.\nascending=[True, False]: This list directly corresponds to the by list.\nTrue for Pclass means it will sort in ascending order (1st class passengers appear before 2nd class, which appear before 3rd class).\nFalse for Fare means that within each Pclass group, passengers will be sorted by their Fare in descending order (most expensive fares first).\nAs you can see in the output, the DataFrame is first grouped by Pclass (1, then 2, then 3), and within each of those groups, the Fare values are arranged from highest to lowest. This level of control is incredibly useful for organizing your data exactly how you need it for analysis.\n\n"

In [None]:


#df.sort_index()
########################

'''df.sort_index() is a powerful Pandas method used to sort a DataFrame or Series by its index labels (row labels) or, for a DataFrame, by its column labels.

Unlike sort_values(), which sorts based on the data within columns, sort_index() specifically operates on the labels of the axes.

df.sort_index(axis=0, level=None, ascending=True, inplace=False, kind='quicksort', na_position='last', sort_remaining=True, ignore_index=False, key=None)
Key Parameters:

axis (int or str, default 0):
0 or 'index': Sorts by the row labels (index).
1 or 'columns': Sorts by the column labels (column names).
level (int or label or list, default None): For MultiIndex (hierarchical index), specifies which level(s) to sort by. If None, sorts by all levels.
ascending (bool or list of bool, default True): Sort ascending or descending.
inplace (bool, default False): If True, modifies the original DataFrame and returns None.
na_position (str, default 'last'): How to handle NaN values in the index (less common but possible).
Let's demonstrate with the Titanic dataset. First, we'll set the 'Name' column as the index so we have meaningful row labels to sort.

'''

import pandas as pd

# Define the URL for the Titanic dataset
titanic_dataset_github_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv"

# Read the CSV file into a Pandas DataFrame
try:
    df_titanic = pd.read_csv(titanic_dataset_github_url)
except Exception as e:
    print(f"Error loading CSV file: {e}")
    print("Please check the URL or your internet connection.")
    # Re-raise after printing the hint: the index-sorting demos below need
    # df_titanic, and carrying on would either end in a NameError or silently
    # reuse a stale DataFrame left in the kernel by an earlier cell.
    raise
else:
    # Only reached when the download and parse succeeded.
    print("Titanic dataset loaded successfully!")

# Use 'Name' as the row index so there are meaningful labels to sort by
df_by_name = df_titanic.set_index('Name')

index_demo_columns = ['Age', 'Sex', 'Fare', 'Pclass']
index_demo_divider = "-" * 60

print("--- Original df_by_name Head (Index not sorted alphabetically) ---")
print(df_by_name[index_demo_columns].head())
print(index_demo_divider)

# --- 1. Sorting by Row Index (default behavior) ---
# With no arguments sort_index orders the rows by their index labels, ascending;
# here that means alphabetical order of the 'Name' index.
df_sorted_by_index_asc = df_by_name.sort_index()

print("\n--- Sorted by Row Index (Name), Ascending ---")
print(df_sorted_by_index_asc[index_demo_columns].head())
print(index_demo_divider)

# --- 2. Sorting by Row Index (descending) ---
df_sorted_by_index_desc = df_by_name.sort_index(ascending=False)

print("\n--- Sorted by Row Index (Name), Descending ---")
print(df_sorted_by_index_desc[index_demo_columns].head())
print(index_demo_divider)

# --- 3. Sorting by Column Index (Column Names) ---
# Independent copy with the columns deliberately out of alphabetical order
column_demo_order = ['Name', 'Fare', 'Age', 'Pclass', 'Sex']
df_unsorted_cols = df_titanic.loc[:, column_demo_order].copy()

print("\n--- DataFrame with Unsorted Columns ---")
print(df_unsorted_cols.head())
print("Original column order:", df_unsorted_cols.columns.tolist())
print("-" * 60)

# axis='columns' (the same as axis=1) makes sort_index order the column labels
# instead of the row labels
df_sorted_cols = df_unsorted_cols.sort_index(axis='columns')

print("\n--- DataFrame Sorted by Column Index (Column Names), Ascending ---")
print(df_sorted_cols.head())
print("New column order:", df_sorted_cols.columns.tolist())
print("-" * 60)

# --- 4. Sorting by Column Index (Descending) ---
df_sorted_cols_desc = df_unsorted_cols.sort_index(axis='columns', ascending=False)

print("\n--- DataFrame Sorted by Column Index (Column Names), Descending ---")
print(df_sorted_cols_desc.head())
print("New column order:", df_sorted_cols_desc.columns.tolist())
print("-" * 60)

'''
When to use sort_index():
Organizing by Identifier: When your DataFrame's index is a meaningful identifier 
(like a unique ID, name, or date), sort_index() helps you arrange your data by that identifier.
Consistency: To ensure that DataFrames or Series are consistently ordered by their labels 
before performing operations like concatenation or merging.

Preparing for MultiIndex Operations: Crucial for efficient operations on MultiIndex 
DataFrames (e.g., groupby operations often perform better on sorted MultiIndexes).

Standardizing Column Order: To arrange your DataFrame's columns alphabetically or
 in a specific desired order for presentation or further processing.

'''

In [None]:
#sort_index() with axis=1:  (sorting df by column names)

'''

When you use sort_index() with axis=1, you are telling Pandas to sort the DataFrame's columns themselves based on their labels (the column names), rather than sorting the rows based on column values.

Here's a clear example:

Python

import pandas as pd

# Create a DataFrame with columns in an arbitrary order
data = {
    'Z_Data': [10, 20, 30],
    'A_Value': [1, 2, 3],
    'C_Item': [100, 200, 300],
    'B_Name': ['Alice', 'Bob', 'Charlie']
}
df_unsorted_cols = pd.DataFrame(data)

print("--- Original DataFrame (Columns Not Sorted) ---")
print(df_unsorted_cols)
print("\nOriginal Column Order:", df_unsorted_cols.columns.tolist())
print("-" * 50)

# --- 1. Sort Columns by their Names in Ascending Order ---
# axis=1 tells sort_index to operate on the columns
df_sorted_cols_asc = df_unsorted_cols.sort_index(axis=1, ascending=True)

print("\n--- DataFrame Sorted by Column Names (Ascending) ---")
print(df_sorted_cols_asc)
print("\nNew Column Order (Ascending):", df_sorted_cols_asc.columns.tolist())
print("-" * 50)

# --- 2. Sort Columns by their Names in Descending Order ---
df_sorted_cols_desc = df_unsorted_cols.sort_index(axis=1, ascending=False)

print("\n--- DataFrame Sorted by Column Names (Descending) ---")
print(df_sorted_cols_desc)
print("\nNew Column Order (Descending):", df_sorted_cols_desc.columns.tolist())
print("-" * 50)

# --- Contrast with sort_values(axis=1) for clarity ---
print("\n--- IMPORTANT CONTRAST: sort_values(axis=1) ---")
print("sort_values(by=<row label>, axis=1) reorders the *columns* using the values found in the given row(s).")
print("It does NOT sort each row independently; to do that, apply sorted() row by row as shown below.")

df_values_sorted_row_wise = df_unsorted_cols[['A_Value', 'C_Item', 'Z_Data']].apply(lambda row: sorted(row), axis=1, result_type='expand')
# Note: apply(sorted, axis=1) on its own would return a Series of lists.
# Passing result_type='expand' spreads each sorted list across columns, producing a new DataFrame.
# This per-row sort is NOT what sort_values(axis=1) does: sort_values(by=<row label>, axis=1)
# reorders whole columns according to the values in the chosen row. Mixed types cannot be compared,
# which is why only the numeric columns are selected here.

# For row-wise statistics such as min/max no sorting is needed; use min()/max() with axis=1:
df_min_max_row_wise = df_unsorted_cols[['A_Value', 'C_Item', 'Z_Data']].copy()
df_min_max_row_wise['Min_Value_in_Row'] = df_min_max_row_wise.min(axis=1)
df_min_max_row_wise['Max_Value_in_Row'] = df_min_max_row_wise.max(axis=1)
print("\nDataFrame with Min/Max values per row (using min/max with axis=1):")
print(df_min_max_row_wise)
print("-" * 50)
Key Differences:
df.sort_index(axis=1): This method reorders the actual columns of your DataFrame based on their names (labels). It changes the column order in the DataFrame itself.
df.sort_values(by=<row label>, axis=1): This method also reorders the columns, but based on the values stored in the specified row(s) rather than on the column names. It does not sort each row independently; for row-wise statistics (like min/max) use df.min(axis=1) / df.max(axis=1), and for an independent per-row sort use apply(sorted, axis=1).
So, when you want to arrange your columns alphabetically or in a specific order by their names, df.sort_index(axis=1) is the method you need.


'''

In [19]:
#sorted(df.columns)

###########################


'''
common Pythonic way to get a sorted list of your DataFrame's column names: sorted(df.columns).

Here's what it does and how it differs from df.sort_index(axis=1):

What sorted(df.columns) Does
df.columns returns a Pandas Index object containing all the column labels of your DataFrame.
sorted() is a built-in Python function that takes an iterable (like df.columns) and returns a new Python list containing all items from the iterable in sorted order.
Key point: sorted(df.columns) produces a Python list of sorted column names; it does NOT modify the order of columns in your DataFrame.

Example
'''

import pandas as pd

# Build a small frame whose columns are deliberately out of alphabetical order.
data = dict(
    Z_Score=[10, 20, 30],
    A_Name=['Alice', 'Bob', 'Charlie'],
    C_Value=[100, 200, 300],
    B_Type=['X', 'Y', 'Z'],
)
df = pd.DataFrame(data)
divider = "-" * 50

print("--- Original DataFrame ---")
print(df)
print("\nOriginal DataFrame Column Order (df.columns):", list(df.columns))
print(divider)

# sorted() consumes the column Index and hands back a plain Python list.
sorted_column_names = sorted(df.columns)

print("\n--- Result of sorted(df.columns) ---")
print("Type of result:", type(sorted_column_names))
print("Sorted column names (Python list):", sorted_column_names)
print(divider)

# The frame itself was never touched, so its column order is still the original one.
print("\n--- Original DataFrame (still unchanged) ---")
print(df)
print("Original DataFrame Column Order (df.columns):", list(df.columns))
print(divider)

# Selecting with the sorted list yields a new frame whose columns follow that order.
df_reordered_cols = df.loc[:, sorted_column_names]
print("\n--- DataFrame with Columns Reordered using sorted(df.columns) ---")
print(df_reordered_cols)
print("New DataFrame Column Order:", list(df_reordered_cols.columns))
print(divider)



'''
Contrast with df.sort_index(axis=1)
sorted(df.columns): Returns a Python list of sorted column names. It does not alter the DataFrame. You then use this list to create a new DataFrame with the desired column order, like df[sorted_column_names].
df.sort_index(axis=1): This is a Pandas DataFrame method that returns a new Pandas DataFrame (or modifies in place if inplace=True) with its columns sorted by their labels. It directly operates on and modifies the DataFrame's structure.
Choose sorted(df.columns) when you just need the column names as a sorted list for, say, printing, looping, or for manually reindexing later. Choose df.sort_index(axis=1) when you want the DataFrame itself to have its columns structurally sorted by their names.


'''

--- Original DataFrame ---
   Z_Score   A_Name  C_Value B_Type
0       10    Alice      100      X
1       20      Bob      200      Y
2       30  Charlie      300      Z

Original DataFrame Column Order (df.columns): ['Z_Score', 'A_Name', 'C_Value', 'B_Type']
--------------------------------------------------

--- Result of sorted(df.columns) ---
Type of result: <class 'list'>
Sorted column names (Python list): ['A_Name', 'B_Type', 'C_Value', 'Z_Score']
--------------------------------------------------

--- Original DataFrame (still unchanged) ---
   Z_Score   A_Name  C_Value B_Type
0       10    Alice      100      X
1       20      Bob      200      Y
2       30  Charlie      300      Z
Original DataFrame Column Order (df.columns): ['Z_Score', 'A_Name', 'C_Value', 'B_Type']
--------------------------------------------------

--- DataFrame with Columns Reordered using sorted(df.columns) ---
    A_Name B_Type  C_Value  Z_Score
0    Alice      X      100       10
1      Bob      Y    

"\nContrast with df.sort_index(axis=1)\nsorted(df.columns): Returns a Python list of sorted column names. It does not alter the DataFrame. You then use this list to create a new DataFrame with the desired column order, like df[sorted_column_names].\ndf.sort_index(axis=1): This is a Pandas DataFrame method that returns a new Pandas DataFrame (or modifies in place if inplace=True) with its columns sorted by their labels. It directly operates on and modifies the DataFrame's structure.\nChoose sorted(df.columns) when you just need the column names as a sorted list for, say, printing, looping, or for manually reindexing later. Choose df.sort_index(axis=1) when you want the DataFrame itself to have its columns structurally sorted by their names.\n\n\n"

In [21]:
# str.capitalize() and str.len()
###############################

''' 

Pandas offers powerful string methods via the .str accessor, which allows you to apply string operations to entire Series (columns) containing text data.

Let's use df_titanic['Name'].str.len() to get the length of each name and df_titanic['Name'].str.capitalize() to capitalize the first letter of each name.

'''

import pandas as pd

# Define the URL for the Titanic dataset
titanic_dataset_github_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv"

# Read the CSV file into a Pandas DataFrame.
try:
    df_titanic = pd.read_csv(titanic_dataset_github_url)
except Exception as e:
    print(f"Error loading CSV file: {e}")
    print("Please check the URL or your internet connection.")
    # Re-raise after the hint: everything below needs df_titanic, so carrying on
    # would only fail later with a NameError (fresh kernel) or silently reuse a
    # stale frame left over from an earlier cell.
    raise
else:
    # Only reached when the download and parse succeeded.
    print("Titanic dataset loaded successfully!")
    print("\n--- Original 'Name' Column Head ---")
    print(df_titanic['Name'].head())
    print("-" * 60)

# --- 1. .str.len(): number of characters in every name ---
# The accessor is applied element-wise and yields one length per row of 'Name'.
name_column = df_titanic['Name']
name_lengths = name_column.str.len()

# Keep the lengths next to the names so both can be shown side by side.
df_titanic['Name_Length'] = name_lengths
length_preview = df_titanic[['Name', 'Name_Length']].head()
mean_length = df_titanic['Name_Length'].mean()

print("\n--- 'Name' Column with Length (using .str.len()) ---")
print(length_preview)
print(f"\nAverage name length: {mean_length:.2f}")
print("-" * 60)


# --- 2. .str.capitalize(): upper-case first character, lower-case the rest ---
capitalized_names = name_column.str.capitalize()

# Stored under a new column name, so the original 'Name' values stay available.
df_titanic['Name_Capitalized'] = capitalized_names
capitalized_preview = df_titanic[['Name', 'Name_Capitalized']].head()

print("\n--- 'Name' Column Capitalized (using .str.capitalize()) ---")
print(capitalized_preview)
print("-" * 60)

# --- capitalize vs. title on a tiny hand-made Series ---
# capitalize touches only the first letter of the whole string;
# title upper-cases the first letter of every word.
print("\n--- Examples of .str.capitalize() vs .str.title() ---")
sample_names = pd.Series(["john doe", "mr. smith", "dr. sarah jane"])
capitalized_samples = sample_names.str.capitalize()
titled_samples = sample_names.str.title()
print(f"Original:\n{sample_names}")
print(f"\n.str.capitalize():\n{capitalized_samples}")
print(f"\n.str.title():\n{titled_samples}")
print("-" * 60)

'''
Explanation:
.str Accessor: When you see Series.str, it means you're accessing a set of string methods that can be applied to each element of the Series. These methods are vectorized, meaning they are very efficient for large datasets.
.str.len(): Calculates the length of each string. For a name like "Braund, Mr. Owen Harris", it counts all characters including spaces and punctuation.
.str.capitalize(): Converts the first character of each string to uppercase and all remaining characters to lowercase. This is important to note: it doesn't capitalize the first letter of each word, only the very first letter of the entire string. If you wanted to capitalize the first letter of each word (like in titles), you'd typically use .str.title().


'''

Titanic dataset loaded successfully!

--- Original 'Name' Column Head ---
0                              Braund, Mr. Owen Harris
1    Cumings, Mrs. John Bradley (Florence Briggs Th...
2                               Heikkinen, Miss. Laina
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                             Allen, Mr. William Henry
Name: Name, dtype: object
------------------------------------------------------------

--- 'Name' Column with Length (using .str.len()) ---
                                                Name  Name_Length
0                            Braund, Mr. Owen Harris           23
1  Cumings, Mrs. John Bradley (Florence Briggs Th...           51
2                             Heikkinen, Miss. Laina           22
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)           44
4                           Allen, Mr. William Henry           24

Average name length: 26.97
------------------------------------------------------------

--- 'Name' Column Capita

'\nExplanation:\n.str Accessor: When you see Series.str, it means you\'re accessing a set of string methods that can be applied to each element of the Series. These methods are vectorized, meaning they are very efficient for large datasets.\n.str.len(): Calculates the length of each string. For a name like "Braund, Mr. Owen Harris", it counts all characters including spaces and punctuation.\n.str.capitalize(): Converts the first character of each string to uppercase and all remaining characters to lowercase. This is important to note: it doesn\'t capitalize the first letter of each word, only the very first letter of the entire string. If you wanted to capitalize the first letter of each word (like in titles), you\'d typically use .str.title().\n\n\n'

In [None]:
# Regex
################

'''

Regular Expressions (often shortened to "regex" or "regexp") are sequences of characters that define a search pattern.
They are incredibly powerful for finding, matching, and manipulating text based on complex rules.
Think of them as a highly advanced "find and replace" tool.

Basic Regex Components
Literals: Most characters match themselves (e.g., a matches 'a').
Metacharacters: Special characters with specific meanings:
.: Matches any single character (except newline).
*: Matches 0 or more occurrences of the preceding character/group.
+: Matches 1 or more occurrences of the preceding character/group.
?: Matches 0 or 1 occurrence of the preceding character/group.
[]: Matches any one of the characters inside the brackets (e.g., [abc] matches 'a', 'b', or 'c').
[^]: Matches any character not inside the brackets (e.g., [^0-9] matches any non-digit).
(): Used for grouping and capturing substrings.
|: OR operator (e.g., cat|dog matches 'cat' or 'dog').
\: Escape character (e.g., \. matches a literal dot, \d matches a digit).
Anchors:
^: Matches the beginning of the string.
$: Matches the end of the string.
Quantifiers:
{n}: Matches exactly n occurrences.
{n,}: Matches n or more occurrences.
{n,m}: Matches between n and m occurrences.
Simple Example: Email ID Validation (Python's re module)
Let's create a very basic (and not exhaustive for real-world scenarios) regex for an email address.

Regex Pattern: ^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$

^: Start of the string.
[a-zA-Z0-9._%+-]+: One or more letters, numbers, dots, underscores, percents, plus, or hyphens (the "username" part).
@: Matches the literal '@' symbol.
[a-zA-Z0-9.-]+: One or more letters, numbers, dots, or hyphens (the "domain name" part).
\.: Matches a literal dot (escaped because . is a metacharacter).
[a-zA-Z]{2,}: Two or more letters (the "top-level domain" like .com, .org, .net).
$: End of the string.
Python

import re

email_pattern = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
# The 'r' before the string means it's a "raw string", which prevents backslashes
# from being interpreted as escape sequences by Python itself. This is best practice for regex.

email1 = "test.user@example.com"
email2 = "invalid-email"
email3 = "another.one@sub.domain.co.uk"

print("--- Email ID Validation Example ---")

# re.match() checks for a match only at the beginning of the string
if re.match(email_pattern, email1):
    print(f"'{email1}' is a VALID email (using re.match)")
else:
    print(f"'{email1}' is an INVALID email (using re.match)")

if re.match(email_pattern, email2):
    print(f"'{email2}' is a VALID email (using re.match)")
else:
    print(f"'{email2}' is an INVALID email (using re.match)")

print("\n(Using re.search to find patterns anywhere in the string)")
# re.search() finds a match anywhere in the string
email_in_text = "My email is test@domain.org, please contact me."
match = re.search(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", email_in_text)
if match:
    print(f"Found email: '{match.group(0)}' in the text.")
else:
    print("No email found in the text.")
print("-" * 50)
Example Using Titanic Dataset: Extracting Titles
In the Titanic dataset, the 'Name' column often contains titles like "Mr.", "Mrs.", "Miss.", "Dr.", "Rev.", "Master.", etc. We can use regex to extract these titles.

Regex Pattern: ([A-Za-z]+)\.

( ): This creates a capturing group. Whatever matches inside these parentheses will be extracted.
[A-Za-z]+: Matches one or more uppercase or lowercase English letters.
\.: Matches a literal dot (the period after the title).
Python

import pandas as pd

# Load the Titanic dataset
titanic_dataset_github_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv"
try:
    df_titanic = pd.read_csv(titanic_dataset_github_url)
    print("Titanic dataset loaded successfully!")
    print("\n--- Original 'Name' Column Head ---")
    print(df_titanic['Name'].head(10))
    print("-" * 60)
except Exception as e:
    print(f"Error loading CSV file: {e}")
    print("Please check the URL or your internet connection.")

# Regex to extract titles from the Name column
title_pattern = r'([A-Za-z]+)\.' # Captures one or more letters followed by a dot

# Using .str.extract() to extract the captured group
# .str.extract() returns a DataFrame with one column for each capturing group.
df_titanic['Title'] = df_titanic['Name'].str.extract(title_pattern, expand=False)
# expand=False makes it return a Series if there's only one capturing group.

print("\n--- DataFrame with Extracted Titles using .str.extract() ---")
print(df_titanic[['Name', 'Title']].head(10))
print("-" * 60)

print("\n--- Value Counts of Extracted Titles ---")
print(df_titanic['Title'].value_counts())
print("-" * 60)

# Another common .str method for regex: .str.contains()
# Let's find all names containing 'Mrs.'
mrs_names = df_titanic[df_titanic['Name'].str.contains(r'Mrs\.', na=False)]
print("\n--- Passengers with 'Mrs.' in their Name (using .str.contains()) ---")
print(mrs_names[['Name', 'Title', 'Sex', 'Age']].head())
print(f"\nTotal passengers with 'Mrs.' in name: {len(mrs_names)}")
print("-" * 60)
Key Points for Regex in Pandas:
.str Accessor: All string methods in Pandas that work with regex patterns are accessed through the .str accessor (e.g., Series.str.contains(), Series.str.extract(), Series.str.replace(), Series.str.match()).
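regex=True (or implied): Check the default for each method. .str.contains() defaults to regex=True, and .str.match() and .str.extract() always treat the pattern as a regular expression, so nothing needs to be set for them. .str.replace() is the exception: since pandas 2.0 its default is regex=False (literal replacement), so pass regex=True explicitly when the pattern is a regular expression.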
Raw Strings (r"..."): Always use raw strings (r"your_pattern") for regex patterns in Python. This prevents backslashes from being interpreted as Python escape sequences, which is crucial for regex (e.g., \d means digit in regex, but \d would be an invalid escape sequence in a normal string).
Regex is a vast topic, but understanding these basics allows you to perform powerful text manipulation in Pandas and Python.

'''



In [None]:
'''

Extract parts of the name from the Titanic dataset's 'Name' column using regular expressions, 
specifically demonstrating the effect of expand=False versus expand=True on the output's structure and dtype.

The 'Name' column in the Titanic dataset is typically formatted as "Surname, Title. GivenName MiddleName". 
We'll write a regex to extract the Surname and the First Name (the first given name after the title).

Regex Pattern: r'([^,]+),\s*[A-Za-z]+\.\s*([A-Za-z]+)'

Let's break down this regex:

( ): These create capturing groups. Whatever matches inside them will be extracted into the output.
([^,]+):
[^,]: Matches any character that is not a comma.
+: Matches the preceding character set one or more times.
This first capturing group ([^,]+) will capture the Surname (e.g., "Braund", "Cumings").
,\s*: Matches a literal comma , followed by zero or more whitespace characters (\s*). This part is matched but not captured.
[A-Za-z]+\.\s*: Matches the Title part (e.g., "Mr.", "Mrs.", "Miss.").
[A-Za-z]+: One or more letters (for "Mr", "Mrs", etc.).
\.: A literal dot (escaped with \).
\s*: Zero or more whitespace characters after the title. This part is matched but not captured.
([A-Za-z]+):
[A-Za-z]+: Matches one or more letters.
This second capturing group ([A-Za-z]+) will capture the First Name (e.g., "Owen", "Elizabeth").
Now, let's see expand=False vs expand=True:

Python

import pandas as pd

# Define the URL for the Titanic dataset
titanic_dataset_github_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv"

# Read the CSV file into a Pandas DataFrame
try:
    df_titanic = pd.read_csv(titanic_dataset_github_url)
    print("Titanic dataset loaded successfully!")
    print("\n--- Original 'Name' Column Head (for reference) ---")
    print(df_titanic['Name'].head(7))
    print("-" * 70)
except Exception as e:
    print(f"Error loading CSV file: {e}")
    print("Please check the URL or your internet connection.")

# Define the regex pattern
name_pattern = r'([^,]+),\s*[A-Za-z]+\.\s*([A-Za-z]+)'

# --- 1. Using expand=False ---
# When expand=False and there are multiple capturing groups,
# it still returns a DataFrame with one column per captured group (a Series is only returned for a single group).
extracted_names_false = df_titanic['Name'].str.extract(name_pattern, expand=False)

print("\n--- Result with expand=False ---")
print("Type of result:", type(extracted_names_false))
print("Head of extracted data:\n", extracted_names_false.head(7))
print("\nDataFrame Info (to see dtype):")
extracted_names_false.info()
print("-" * 70)

# --- 2. Using expand=True ---
# When expand=True and there are multiple capturing groups,
# it returns a DataFrame with a column for each captured group.
extracted_names_true = df_titanic['Name'].str.extract(name_pattern, expand=True)

# Rename columns for clarity
extracted_names_true.columns = ['Surname', 'FirstName']

print("\n--- Result with expand=True ---")
print("Type of result:", type(extracted_names_true))
print("Head of extracted data:\n", extracted_names_true.head(7))
print("\nDataFrame Info (to see dtypes):")
extracted_names_true.info()
print("-" * 70)

# --- Add to original DataFrame for demonstration ---
df_titanic[['Surname', 'FirstName']] = df_titanic['Name'].str.extract(name_pattern, expand=True)
print("\n--- df_titanic with new 'Surname' and 'FirstName' columns ---")
print(df_titanic[['Name', 'Surname', 'FirstName']].head(7))
print("-" * 70)
Explanation of expand
expand=False:

Output Type: Depends on the number of capturing groups in the regex.
Content: If your regex has:
One capturing group: Returns a Pandas Series whose elements are the strings matched by that group.
Multiple capturing groups (as in our example): Returns a DataFrame with one column per capturing group,
 exactly as expand=True does; it never returns a Series of tuples. If the pattern doesn't match
   for a specific row, every column in that row will be NaN.
dtype: The Series (or each DataFrame column) will typically be object, as it's storing Python strings.
expand=True:

Output Type: Returns a Pandas DataFrame.
Content: Each capturing group in your regex becomes a separate column in the resulting DataFrame.
dtype: Each column in the DataFrame will have a dtype inferred from the data it contains (e.g., object for strings, or float64 if NaNs are present and it was numeric). This is generally more useful for direct integration into your DataFrame.
In most scenarios where you're extracting multiple pieces of information from a string using regex capturing groups, expand=True is the more convenient option as it directly provides a structured DataFrame ready for further use.

'''

In [None]:
'''

explore str.split(), str.lstrip(), and str.rstrip() using the df_titanic['Name'] column.
These are incredibly useful for cleaning and parsing string data.

'''

import pandas as pd

# Define the URL for the Titanic dataset
titanic_dataset_github_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv"

# Read the CSV file into a Pandas DataFrame.
try:
    df_titanic = pd.read_csv(titanic_dataset_github_url)
except Exception as e:
    print(f"Error loading CSV file: {e}")
    print("Please check the URL or your internet connection.")
    # Re-raise after the hint: the string examples below all read df_titanic, so
    # continuing would only fail later with a NameError (fresh kernel) or silently
    # reuse a stale frame left over from an earlier cell.
    raise
else:
    # Only reached when the download and parse succeeded.
    print("Titanic dataset loaded successfully!")
    print("\n--- Original 'Name' Column Head ---")
    print(df_titanic['Name'].head(7))
    print("-" * 70)


# --- 1. str.split() ---
# Splits strings in the Series by a specified delimiter.
# If expand=True, it returns a DataFrame with new columns for each part.
# If expand=False (default), it returns a Series of lists.

# Split the 'Name' column at the FIRST comma only.
# n=1 caps the result at two pieces, so the two-name column assignment below
# cannot fail with a length mismatch if some name contains a second comma.
name_parts = df_titanic['Name'].str.split(',', n=1, expand=True)

# Rename the new columns for clarity
name_parts.columns = ['LastName', 'OtherNameInfo']

print("\n--- Result of df_titanic['Name'].str.split(',', expand=True) ---")
print(name_parts.head(7))
print("\nNotice the leading space in 'OtherNameInfo' (e.g., ' Mr. Owen Harris')")
print("-" * 70)


# --- 2. str.lstrip() ---
# Strips characters from the left-hand (leading) edge of every string.
# With no argument it strips whitespace; pass a string of characters to strip those instead.

# The comma split left a leading space in 'OtherNameInfo'; strip it here.
other_name_info = name_parts['OtherNameInfo']
cleaned_other_name_info = other_name_info.str.lstrip()

# Show the first seven values before and after, side by side.
lstrip_comparison = pd.DataFrame({
    'Original_OtherNameInfo': other_name_info.head(7),
    'Lstripped_OtherNameInfo': cleaned_other_name_info.head(7),
})
print("\n--- Result of name_parts['OtherNameInfo'].str.lstrip() ---")
print(lstrip_comparison)
print("\nNotice the leading space is gone from 'Lstripped_OtherNameInfo'")
print("-" * 70)

# Stripping a chosen character: only the leading '#' run is removed,
# a '#' at the end of the string is left alone.
sample_text_lstrip = pd.Series(["###Hello World", "##Another#"])
hash_lstripped = sample_text_lstrip.str.lstrip('#')
print("\n--- str.lstrip() with specific characters ---")
print(f"Original:\n{sample_text_lstrip}")
print(f"lstrip('#'):\n{hash_lstripped}")
print("-" * 70)


# --- 3. str.rstrip() ---
# Removes trailing characters (from the right side of the string).
# By default, removes trailing whitespace. You can also specify characters to remove.

# Goal: remove the trailing period from the title part of the name (e.g., "Mr." -> "Mr").
# The dot sits INSIDE the capturing group so the extracted value really ends with
# a period; with the dot outside the group the extracted title has no dot and
# rstrip('.') would have nothing to remove.
df_titanic['Title_with_Dot'] = df_titanic['Name'].str.extract(r'([A-Za-z]+\.)', expand=False)

# Now, rstrip the dot
title_stripped = df_titanic['Title_with_Dot'].str.rstrip('.')

print("\n--- Result of df_titanic['Title_with_Dot'].str.rstrip('.') ---")
print(pd.DataFrame({
    'Original_Title': df_titanic['Title_with_Dot'].head(7),
    'Rstripped_Title': title_stripped.head(7)
}))
print("\nNotice the trailing dot is gone from 'Rstripped_Title'")
print("-" * 70)

# Example: Removing specific characters from the right.
# Only the trailing '#' run is removed; a leading '#' is left untouched.
sample_text_rstrip = pd.Series(["Hello World###", "#Another##"])
print("\n--- str.rstrip() with specific characters ---")
print(f"Original:\n{sample_text_rstrip}")
print(f"rstrip('#'):\n{sample_text_rstrip.str.rstrip('#')}")
print("-" * 70)

# Example: chaining .str methods for a common cleaning task.
# Split each name on commas, keep the first piece (the last name),
# then strip whitespace from both ends of that piece.
name_pieces = df_titanic['Name'].str.split(',')
df_titanic['Processed_Name'] = name_pieces.str.get(0).str.strip()
processed_preview = df_titanic[['Name', 'Processed_Name']].head(7)
print("\n--- Combined Example: Extracting Cleaned Last Name ---")
print(processed_preview)
print("Here, we split by comma, take the first part, and then strip any leading/trailing whitespace.")
print("-" * 70)

'''
Key Points:
.str Accessor: All these methods are part of the .str accessor, which is necessary to apply string operations to entire Pandas Series.
str.split(delimiter, expand=True/False):
Breaks a string into a list of substrings based on a delimiter.
expand=True is incredibly useful for parsing structured strings into new columns directly.
expand=False (default) returns a Series where each element is a Python list of the split parts. You'd then typically use .str[index] to access specific parts.
str.lstrip(chars=None): Removes characters from the left (leading) side of the string.
If chars is None (default), it removes all leading whitespace characters (spaces, tabs, newlines).
If chars is specified (e.g., '#', '., '), it removes any combination of those characters from the left until a character not in chars is encountered.
str.rstrip(chars=None): Removes characters from the right (trailing) side of the string.
Behaves identically to lstrip(), but operates on the right side.
str.strip(chars=None): Removes characters from both the left and right sides. This is often used for general whitespace cleaning.
These methods are fundamental for text preprocessing and feature engineering in Pandas.


'''