In [13]:
import pandas as pd
import numpy as  np

!pip install matplotlib
import matplotlib.pyplot as plt

Collecting matplotlib
  Downloading matplotlib-3.10.3-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.3.2-cp312-cp312-win_amd64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib)
  Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.58.4-cp312-cp312-win_amd64.whl.metadata (108 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Downloading kiwisolver-1.4.8-cp312-cp312-win_amd64.whl.metadata (6.3 kB)
Collecting pillow>=8 (from matplotlib)
  Downloading pillow-11.2.1-cp312-cp312-win_amd64.whl.metadata (9.1 kB)
Collecting pyparsing>=2.3.1 (from matplotlib)
  Downloading pyparsing-3.2.3-py3-none-any.whl.metadata (5.0 kB)
Downloading matplotlib-3.10.3-cp312-cp312-win_amd64.whl (8.1 MB)
   ---------------------------------------- 0.0/8.1 MB ? eta -:--:--
   ---------- ----------------------------- 2.1/8.1 MB 13.0 MB/s eta 0:00:01
 

In [2]:
# Count all passengers with 'miss' in name
######################################################

# Define the URL for the Titanic dataset
titanic_dataset_github_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv"

# Read the CSV file into a Pandas DataFrame
try:
    df_titanic = pd.read_csv(titanic_dataset_github_url)
    print("Titanic dataset loaded successfully!")
except Exception as e:
    print(f"Error loading CSV file: {e}")
    print("Please check the URL or your internet connection.")

# --- Count passengers with "Miss" in their name ---

# Create a boolean Series where True indicates the name contains "Miss"
# case=False makes the search case-insensitive ('miss', 'Miss', 'MISS' etc.)
# na=False ensures that NaN values in the 'Name' column don't cause an error
# and are treated as not containing the string (resulting in False for that row).
contains_miss = df_titanic['Name'].str.contains('Miss', case=False, na=False)

# Count the number of True values in the boolean Series
# Summing a boolean Series treats True as 1 and False as 0.
num_passengers_with_miss = contains_miss.sum()

print(f"\nNumber of passengers with 'Miss' in their name: {num_passengers_with_miss}")

print("\n--- Examples of names containing 'Miss' ---")
# Display the first few names that contain "Miss"
print(df_titanic[contains_miss]['Name'].head(10))
print("-" * 60)

Titanic dataset loaded successfully!

Number of passengers with 'Miss' in their name: 182

--- Examples of names containing 'Miss' ---
2                   Heikkinen, Miss. Laina
10         Sandstrom, Miss. Marguerite Rut
11                Bonnell, Miss. Elizabeth
14    Vestrom, Miss. Hulda Amanda Adolfina
22             McGowan, Miss. Anna "Annie"
24           Palsson, Miss. Torborg Danira
28           O'Dwyer, Miss. Ellen "Nellie"
32                Glynn, Miss. Mary Agatha
38      Vander Planke, Miss. Augusta Maria
39             Nicola-Yarred, Miss. Jamila
Name: Name, dtype: object
------------------------------------------------------------


In [3]:
import pandas as pd

# Define the data as a dictionary where keys are column names
data = {
    'Name': ['Monal Kumar', 'Bhuvika', 'Riyan', 'Niranjan'],
    'Email': ['monal.kumar@example.com', 'bhuvika.dev@email.com', 'riyan.m@test.org', 'niranjan_k@domain.net']
}

# Create the Pandas DataFrame
df_contacts = pd.DataFrame(data)

print("--- Contacts DataFrame ---")
print(df_contacts)

--- Contacts DataFrame ---
          Name                    Email
0  Monal Kumar  monal.kumar@example.com
1      Bhuvika    bhuvika.dev@email.com
2        Riyan         riyan.m@test.org
3     Niranjan    niranjan_k@domain.net


Let's use the str.replace() method to replace "k" with "kumar" in the 'Name' column of your df_contacts DataFrame. When regex=False, the method treats the pattern as a literal string.

In [4]:
import pandas as pd

# Create the DataFrame (as generated previously)
data = {
    'Name': ['Monal Kumar', 'Bhuvika', 'Riyan', 'Niranjan'],
    'Email': ['monal.kumar@example.com', 'bhuvika.dev@email.com', 'riyan.m@test.org', 'niranjan_k@domain.net']
}
df_contacts = pd.DataFrame(data)

print("--- Original Contacts DataFrame ---")
print(df_contacts)
print("—" * 50)

# Apply str.replace('k', 'kumar', regex=False) to the 'Name' column
# This will replace all *lowercase* 'k's with 'kumar'.
# Since regex=False, it won't interpret 'k' as a special regex character.
df_contacts['Name_Replaced_k'] = df_contacts['Name'].str.replace('k', 'kumar', regex=False)

print("\n--- DataFrame after replacing 'k' with 'kumar' (regex=False) ---")
print(df_contacts[['Name', 'Name_Replaced_k']])
print("—" * 50)

# Let's try replacing 'K' with 'Kumar' (case-sensitive by default)
df_contacts['Name_Replaced_K'] = df_contacts['Name'].str.replace('K', 'Kumar', regex=False)

print("\n--- DataFrame after replacing 'K' with 'Kumar' (regex=False) ---")
print(df_contacts[['Name', 'Name_Replaced_K']])
print("—" * 50)

# If you want a case-insensitive replace without regex, you usually need two steps
# or to convert to a consistent case first.
# Using regex=True would allow case-insensitive replacement with a flag.

--- Original Contacts DataFrame ---
          Name                    Email
0  Monal Kumar  monal.kumar@example.com
1      Bhuvika    bhuvika.dev@email.com
2        Riyan         riyan.m@test.org
3     Niranjan    niranjan_k@domain.net
——————————————————————————————————————————————————

--- DataFrame after replacing 'k' with 'kumar' (regex=False) ---
          Name Name_Replaced_k
0  Monal Kumar     Monal Kumar
1      Bhuvika     Bhuvikumara
2        Riyan           Riyan
3     Niranjan        Niranjan
——————————————————————————————————————————————————

--- DataFrame after replacing 'K' with 'Kumar' (regex=False) ---
          Name  Name_Replaced_K
0  Monal Kumar  Monal Kumarumar
1      Bhuvika          Bhuvika
2        Riyan            Riyan
3     Niranjan         Niranjan
——————————————————————————————————————————————————


In [None]:
'''
When you set regex=False, Pandas treats the first argument ('k' in this case) as a literal string.
 This means it will only # replace exact matches of "k" and won't use any regular expression patterns.
 Also, by default, str.replace() is case-sensitive when regex=False. That's why "Monal Kumar" remains unchanged 
 when replacing lowercase "k" but changes when replacing uppercase "K".
'''

In [None]:
# Pandas Display Maximum Rows and MAXIMUM Columns

'''Controlling how Pandas DataFrames are displayed in your console or environment, specifically the maximum number of rows and columns shown before truncation (when Pandas inserts ...). This is done using pd.set_option().

pd.set_option() for Display Control
Pandas has a comprehensive options system that allows you to customize its behavior. Display options are prefixed with display..

display.max_rows: Controls the maximum number of rows to display. When a DataFrame has more rows than this setting, Pandas will show only the first and last few rows, with ... in between.
display.max_columns: Controls the maximum number of columns to display. When a DataFrame has more columns than this setting, Pandas will show only the first and last few columns, with ... in between.
Let's demonstrate using the Titanic dataset, which has a moderate number of rows and columns.

'''

In [6]:
import pandas as pd

# Load the Titanic dataset
titanic_dataset_github_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv"
try:
    df_titanic = pd.read_csv(titanic_dataset_github_url)
    print("Titanic dataset loaded successfully!")
    print(f"DataFrame shape: {df_titanic.shape}")
    print(f"Number of columns: {len(df_titanic.columns)}")
except Exception as e:
    print(f"Error loading CSV file: {e}")
    print("Please check the URL or your internet connection.")

print("\n" + "="*80 + "\n")

# --- Default Display Settings ---
print("--- Default Display (may vary based on environment) ---")
print(df_titanic) # Pandas will auto-truncate if the DataFrame is large
print("\n" + "="*80 + "\n")

# --- 1. Set Max Rows ---
# Let's set max_rows to a small number (e.g., 10) to force truncation
pd.set_option('display.max_rows', 10)

print("--- Display with max_rows = 10 ---")
print(df_titanic)
print("\nNotice the '...' in the middle of the rows.")
print("\n" + "="*80 + "\n")

# --- 2. Set Max Columns ---
# Let's make a dummy DataFrame with many columns to demonstrate max_columns
# First, reset max_rows so it doesn't interfere as much
pd.set_option('display.max_rows', 60) # A bit more than default to see more of columns

dummy_data = {f'col_{i}': range(5) for i in range(20)} # 20 columns
df_wide = pd.DataFrame(dummy_data)

print("--- DataFrame with Many Columns (Default/Current max_columns) ---")
print(df_wide) # May be truncated depending on your default
print(f"Number of columns in df_wide: {len(df_wide.columns)}")
print("\n" + "="*80 + "\n")

# Now set max_columns to a smaller number (e.g., 5) to force truncation
pd.set_option('display.max_columns', 5)

print("--- Display with max_columns = 5 ---")
print(df_wide)
print("\nNotice the '...' between the columns.")
print("\n" + "="*80 + "\n")

# --- How to Display All Rows/Columns (No Truncation) ---
# Set to None to disable truncation for rows or columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

print("--- Display with max_rows = None and max_columns = None (Display All) ---")
# This might print a very long output depending on your actual data
print(df_titanic) # This will now print all rows of Titanic dataset
print("\nThis printed all rows of the Titanic dataset.")
print("\n" + "="*80 + "\n")

print("--- Displaying the wide DataFrame with max_columns = None ---")
print(df_wide) # This will now print all 20 columns
print("\nThis printed all columns of the wide DataFrame.")
print("\n" + "="*80 + "\n")


# --- Resetting to Default Settings ---
# You can use pd.reset_option() or set them explicitly to their known defaults
pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')

print("--- Display options reset to default ---")
print(df_titanic)
print("Default display behavior is now restored.")

Titanic dataset loaded successfully!
DataFrame shape: (891, 12)
Number of columns: 12


--- Default Display (may vary based on environment) ---
     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4   

In [None]:
'''
You're asking about controlling how Pandas DataFrames are displayed in your console or environment, specifically the maximum number of rows and columns shown before truncation (when Pandas inserts ...). This is done using pd.set_option().

pd.set_option() for Display Control
Pandas has a comprehensive options system that allows you to customize its behavior. Display options are prefixed with display..

display.max_rows: Controls the maximum number of rows to display. When a DataFrame has more rows than this setting, Pandas will show only the first and last few rows, with ... in between.
display.max_columns: Controls the maximum number of columns to display. When a DataFrame has more columns than this setting, Pandas will show only the first and last few columns, with ... in between.
Let's demonstrate using the Titanic dataset, which has a moderate number of rows and columns.

Python

import pandas as pd

# Load the Titanic dataset
titanic_dataset_github_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv"
try:
    df_titanic = pd.read_csv(titanic_dataset_github_url)
    print("Titanic dataset loaded successfully!")
    print(f"DataFrame shape: {df_titanic.shape}")
    print(f"Number of columns: {len(df_titanic.columns)}")
except Exception as e:
    print(f"Error loading CSV file: {e}")
    print("Please check the URL or your internet connection.")

print("\n" + "="*80 + "\n")

# --- Default Display Settings ---
print("--- Default Display (may vary based on environment) ---")
print(df_titanic) # Pandas will auto-truncate if the DataFrame is large
print("\n" + "="*80 + "\n")

# --- 1. Set Max Rows ---
# Let's set max_rows to a small number (e.g., 10) to force truncation
pd.set_option('display.max_rows', 10)

print("--- Display with max_rows = 10 ---")
print(df_titanic)
print("\nNotice the '...' in the middle of the rows.")
print("\n" + "="*80 + "\n")

# --- 2. Set Max Columns ---
# Let's make a dummy DataFrame with many columns to demonstrate max_columns
# First, reset max_rows so it doesn't interfere as much
pd.set_option('display.max_rows', 60) # A bit more than default to see more of columns

dummy_data = {f'col_{i}': range(5) for i in range(20)} # 20 columns
df_wide = pd.DataFrame(dummy_data)

print("--- DataFrame with Many Columns (Default/Current max_columns) ---")
print(df_wide) # May be truncated depending on your default
print(f"Number of columns in df_wide: {len(df_wide.columns)}")
print("\n" + "="*80 + "\n")

# Now set max_columns to a smaller number (e.g., 5) to force truncation
pd.set_option('display.max_columns', 5)

print("--- Display with max_columns = 5 ---")
print(df_wide)
print("\nNotice the '...' between the columns.")
print("\n" + "="*80 + "\n")

# --- How to Display All Rows/Columns (No Truncation) ---
# Set to None to disable truncation for rows or columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

print("--- Display with max_rows = None and max_columns = None (Display All) ---")
# This might print a very long output depending on your actual data
print(df_titanic) # This will now print all rows of Titanic dataset
print("\nThis printed all rows of the Titanic dataset.")
print("\n" + "="*80 + "\n")

print("--- Displaying the wide DataFrame with max_columns = None ---")
print(df_wide) # This will now print all 20 columns
print("\nThis printed all columns of the wide DataFrame.")
print("\n" + "="*80 + "\n")


# --- Resetting to Default Settings ---
# You can use pd.reset_option() or set them explicitly to their known defaults
pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')

print("--- Display options reset to default ---")
print(df_titanic)
print("Default display behavior is now restored.")
Important Notes:

Global Setting: pd.set_option() changes the display behavior globally for your current Python session. All subsequent DataFrames you print will adhere to these settings until you change them again or reset them.
Context: These settings only affect the display of the DataFrame. They do not change the actual data or its underlying structure. The DataFrame still holds all its rows and columns in memory.
Performance: Setting max_rows or max_columns to None for very large DataFrames can result in a massive amount of output to your console, which might be slow to render or consume a lot of memory in your terminal. Use it judiciously.
Other Display Options: There are many other display options, such as display.width, display.precision, display.float_format, etc. You can explore them in the Pandas documentation.
'''

In [None]:
'''
pd.option_context(), which is a very powerful and elegant way to temporarily set Pandas options for a specific block of code.

What is pd.option_context()?
pd.option_context() is a context manager in Pandas. It allows you to:

Change one or more Pandas options for a specific code block.
Ensure that these options are automatically reverted to their previous values once the code block (the with statement) is exited, regardless of how the block is exited (normal completion, error, etc.).
This is incredibly useful because it prevents you from having to manually save the original option values, change them, and then remember to restore them later, which can be error-prone.

Why is it useful?
Cleanliness: No need for manual option saving and restoring.
Safety: Guarantees that options are reset, even if an error occurs within the with block.
Readability: Clearly defines the scope where a specific option setting applies.
Avoids Side Effects: Ensures that temporary option changes don't unintentionally affect other parts of your code or subsequent operations.
How to use pd.option_context()
You use it with a with statement, passing the option name(s) and their desired temporary value(s) as arguments.

'''

In [None]:
# syntax
'''

with pd.option_context('option.name1', value1, 'option.name2', value2, ...):
    # Code where the options are temporarily active
    # ...
# Options revert to their original values here'
''
'''

In [None]:
''''

Let's demonstrate with the Titanic dataset, temporarily changing display.max_rows to 10 
to print a truncated DataFrame, and then showing that it reverts to the default (or previously set) value outside the with block.

''''

In [8]:
import pandas as pd

# Load the Titanic dataset
titanic_dataset_github_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv"
try:
    df_titanic = pd.read_csv(titanic_dataset_github_url)
    print("Titanic dataset loaded successfully!")
    print(f"DataFrame shape: {df_titanic.shape}")
except Exception as e:
    print(f"Error loading CSV file: {e}")
    print("Please check the URL or your internet connection.")

print("\n" + "="*80 + "\n")

# --- Get current default max_rows setting ---
initial_max_rows = pd.get_option('display.max_rows')
print(f"Initial display.max_rows setting: {initial_max_rows}")
print("\n--- Printing df_titanic with initial setting (may be truncated or full) ---")
print(df_titanic)
print("\n" + "="*80 + "\n")


# --- Use pd.option_context to temporarily change max_rows ---
print("--- INSIDE pd.option_context(display.max_rows = 10) ---")
with pd.option_context('display.max_rows', 10):
    print(f"display.max_rows inside context: {pd.get_option('display.max_rows')}")
    print("Printing df_titanic (should be truncated to 10 rows):")
    print(df_titanic)
    print("\nNotice the '...' in the middle of the rows, as max_rows is temporarily 10.")
    # You can do any operations here where you need this temporary setting
    # e.g., generate a specific report view
    print("Doing some operation inside the context...")

print("\n" + "="*80 + "\n")

# --- Outside pd.option_context ---
print("--- OUTSIDE pd.option_context ---")
print(f"display.max_rows outside context: {pd.get_option('display.max_rows')}")
print("Printing df_titanic again (should revert to initial setting):")
print(df_titanic)
print("\nNotice the display.max_rows has automatically reverted to its initial setting.")
print("\n" + "="*80 + "\n")

Titanic dataset loaded successfully!
DataFrame shape: (891, 12)


Initial display.max_rows setting: 60

--- Printing df_titanic with initial setting (may be truncated or full) ---
     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily Ma

In [None]:
'''
pd.reset_option("all"), is a handy command in Pandas to revert all configurable options to their factory defaults.

What does pd.reset_option("all") do?
It's a convenient way to clear any custom settings you've applied using pd.set_option().
It restores every Pandas option to the state it was in when Pandas was first imported.
This is useful for debugging, ensuring consistency, or simply getting back to a known good state after experimenting with various options.
Demonstration
Let's first set a few display options to non-default values and then use pd.reset_option("all") to revert them.
'''

In [14]:
import pandas as pd
import numpy as np

# --- 1. Show current default options (e.g., max_rows, max_columns, precision) ---
print("--- Initial Default Options ---")
print(f"display.max_rows: {pd.get_option('display.max_rows')}")
print(f"display.max_columns: {pd.get_option('display.max_columns')}")
print(f"display.precision: {pd.get_option('display.precision')}")
print("-" * 50)

# --- 2. Set some options to custom values ---
pd.set_option('display.max_rows', 10)       # Set a small number of rows
pd.set_option('display.max_columns', 5)      # Set a small number of columns
pd.set_option('display.precision', 2)       # Set float precision to 2 decimal places

print("\n--- Options after Custom Settings ---")
print(f"display.max_rows: {pd.get_option('display.max_rows')}")
print(f"display.max_columns: {pd.get_option('display.max_columns')}")
print(f"display.precision: {pd.get_option('display.precision')}")
print("-" * 50)

# Create a sample DataFrame to observe the effect
data = {f'col_{i}': np.random.rand(15) * 100 for i in range(10)}
df = pd.DataFrame(data)

print("\n--- DataFrame printed with Custom Settings (truncated and lower precision) ---")
print(df)
print("-" * 50)


# --- 3. Use pd.reset_option("all") ---
print("\n--- Applying pd.reset_option('all') ---")
pd.reset_option("all")
print("All Pandas options have been reset to their defaults.")
print("-" * 50)


# --- 4. Verify that options are back to defaults ---
print("\n--- Options after Resetting to All Defaults ---")
print(f"display.max_rows: {pd.get_option('display.max_rows')}")
print(f"display.max_columns: {pd.get_option('display.max_columns')}")
print(f"display.precision: {pd.get_option('display.precision')}")
print("-" * 50)

print("\n--- DataFrame printed with Default Settings (should be less truncated/higher precision) ---")
print(df)
print("-" * 50)

--- Initial Default Options ---
display.max_rows: 60
display.max_columns: 20
display.precision: 6
--------------------------------------------------

--- Options after Custom Settings ---
display.max_rows: 10
display.max_columns: 5
display.precision: 2
--------------------------------------------------

--- DataFrame printed with Custom Settings (truncated and lower precision) ---
    col_0  col_1  ...  col_8  col_9
0   59.87  57.70  ...  44.67   5.26
1   81.53  10.74  ...  89.99  69.28
2    8.49  36.88  ...  10.66  89.07
3   22.57   0.54  ...  70.93  13.34
4   63.87  53.26  ...  96.67  96.89
..    ...    ...  ...    ...    ...
10  89.31  90.95  ...  66.32  14.68
11  69.51  13.42  ...  97.35  31.33
12  51.27  16.67  ...  29.59  86.76
13  41.61  85.94  ...   5.77  37.36
14  49.86  55.97  ...  31.75  48.65

[15 rows x 10 columns]
--------------------------------------------------

--- Applying pd.reset_option('all') ---
All Pandas options have been reset to their defaults.
--------------

  pd.reset_option("all")
  pd.reset_option("all")


In [None]:
''' 

let's address warnings first, especially FutureWarning, and then discuss pd.set_option().

Suppressing Warnings
In Python, warnings are messages issued to the user about conditions that are not errors, but might indicate something that could be an issue in the future, or best practices that are not being followed. FutureWarning specifically indicates that a certain feature or behavior might change or be deprecated in future versions of a library.

While it's generally good practice to pay attention to warnings, sometimes you might want to suppress them, especially FutureWarning if you know about the impending change and it doesn't affect your current code.

You can ignore specific warnings using the warnings module in Python:

Python

import warnings
import pandas as pd

# Define the URL for the Titanic dataset
titanic_dataset_github_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv"

# --- Suppress FutureWarning ---
warnings.simplefilter(action='ignore', category=FutureWarning)

# Now, any code that might trigger a FutureWarning (e.g., some older Pandas operations
# that are now warning about future changes) will not display that warning.

try:
    df_titanic = pd.read_csv(titanic_dataset_github_url)
    print("Titanic dataset loaded successfully (FutureWarnings suppressed).")
except Exception as e:
    print(f"Error loading CSV file: {e}")
    print("Please check the URL or your internet connection.")

# Example of a potential FutureWarning (depending on pandas version and operation):
# This specific line might not generate a FutureWarning in recent pandas versions,
# but demonstrates where you'd place code that *might* generate one.
# df_titanic['Age_Rounded'] = df_titanic['Age'].round()
# print(df_titanic[['Age', 'Age_Rounded']].head())

# To reset warnings to their default behavior:
# warnings.simplefilter(action='default', category=FutureWarning)
Explanation:

import warnings: Imports Python's built-in warnings module.
warnings.simplefilter(action='ignore', category=FutureWarning): This line tells the warnings module to ignore any FutureWarning messages.
action='ignore': Specifies that the warning should be suppressed.
category=FutureWarning: Targets only warnings of the FutureWarning type. You could also specify DeprecationWarning, UserWarning, etc., or omit category to ignore all warnings.
pd.set_option()
As we discussed before, pd.set_option() is Pandas' way of controlling global display and behavior settings. It allows you to customize how DataFrames are printed, how floats are displayed, how many columns are shown, and much more.

You use it by passing the option name (as a string) and the desired value.

Common pd.set_option() Uses:
Displaying More Rows:

Python

pd.set_option('display.max_rows', 100) # Show up to 100 rows
# Or to show all rows:
# pd.set_option('display.max_rows', None)
Displaying More Columns:

Python

pd.set_option('display.max_columns', 50) # Show up to 50 columns
# Or to show all columns:
# pd.set_option('display.max_columns', None)
Controlling Floating Point Precision:

Python

pd.set_option('display.precision', 4) # Show floats with 4 decimal places
Setting Float Format (e.g., for currency):

Python

# For general float formatting (e.g., 2 decimal places)
pd.set_option('display.float_format', '{:.2f}'.format)
# To reset this:
# pd.set_option('display.float_format', None)
Controlling Column Width:

Python

pd.set_option('display.width', 1000) # Sets the display width in characters
Example: Setting Multiple Options
Python

import pandas as pd
import numpy as np # For random data

# --- Set multiple options ---
pd.set_option('display.max_rows', 12)          # Max 12 rows
pd.set_option('display.max_columns', 8)        # Max 8 columns
pd.set_option('display.precision', 2)         # 2 decimal places for floats
pd.set_option('display.float_format', '{:,.2f}'.format) # Comma as thousands separator, 2 decimal places

print("--- Pandas Display Options Set ---")
print(f"Max Rows: {pd.get_option('display.max_rows')}")
print(f"Max Columns: {pd.get_option('display.max_columns')}")
print(f"Precision: {pd.get_option('display.precision')}")
print(f"Float Format: {pd.get_option('display.float_format')}")
print("-" * 50)

# Create a sample DataFrame to observe the effect
data = {f'col_{i}': np.random.rand(20) * 1000 for i in range(15)}
df_example = pd.DataFrame(data)

print("\n--- DataFrame with Custom Display Options ---")
print(df_example)
print("-" * 50)

# --- Reset all options to default ---
pd.reset_option('all')
print("\n--- All Pandas Options Reset to Default ---")
print("You can verify by printing df_example again, it will revert to default display.")
# print(df_example) # Uncomment to see the default display again.
Remember that pd.set_option() changes behavior globally for your current Python session. If you only need options changed for a specific block of code, pd.option_context() is a safer and cleaner approach, as it automatically reverts the settings.


'''

# CATEGORICAL Data Types

In [None]:
'''

In Pandas, a categorical data type is used for columns that contain a limited number of distinct values (categories), even if those values are strings. Think of things like gender, education level, city names, or yes/no flags.

Why use category dtype?
Memory Efficiency: For columns with many repeated string values, storing them as category can significantly reduce memory usage. Instead of storing the full string for each row, Pandas stores integers (codes) pointing to a unique list of categories.
Performance: Certain operations (like groupby(), value_counts(), and some plotting functions) can be much faster on categorical data.
Semantic Meaning: It explicitly tells Pandas (and anyone reading your code) that a column's values should be treated as discrete categories, not as arbitrary text or numbers. This is important for statistical modeling and proper data analysis.
Ordering: Categorical data can be ordered (e.g., 'Low' < 'Medium' < 'High'), which is useful for sorting and comparisons.
Example with Titanic Dataset
Let's use the 'Sex', 'Embarked', and 'Pclass' columns from the Titanic dataset to demonstrate categorical data types.

'''

In [15]:
import pandas as pd

# Load the Titanic dataset
titanic_dataset_github_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv"
try:
    df_titanic = pd.read_csv(titanic_dataset_github_url)
    print("Titanic dataset loaded successfully!")
except Exception as e:
    print(f"Error loading CSV file: {e}")
    print("Please check the URL or your internet connection.")

print("\n--- Original DataFrame Info ---")
# Check the dtypes of relevant columns
df_titanic[['Sex', 'Embarked', 'Pclass']].info()
print("-" * 60)

# --- 1. Convert 'Sex' to 'category' dtype ---
# It's currently 'object' (string)
df_titanic['Sex'] = df_titanic['Sex'].astype('category')

print("\n--- After converting 'Sex' to 'category' ---")
print(f"dtype of 'Sex': {df_titanic['Sex'].dtype}")
print(f"Categories of 'Sex': {df_titanic['Sex'].cat.categories.tolist()}") # Access .cat accessor
print(f"Codes of 'Sex' (internal representation):\n{df_titanic['Sex'].cat.codes.head()}")
print("-" * 60)

# --- 2. Convert 'Embarked' to 'category' dtype ---
# It's also currently 'object'
df_titanic['Embarked'] = df_titanic['Embarked'].astype('category')

print("\n--- After converting 'Embarked' to 'category' ---")
print(f"dtype of 'Embarked': {df_titanic['Embarked'].dtype}")
print(f"Categories of 'Embarked': {df_titanic['Embarked'].cat.categories.tolist()}")
print("-" * 60)

# --- 3. Convert 'Pclass' to 'category' dtype ---
# 'Pclass' is currently int64, but represents categories (1st, 2nd, 3rd class)
df_titanic['Pclass'] = df_titanic['Pclass'].astype('category')

print("\n--- After converting 'Pclass' to 'category' ---")
print(f"dtype of 'Pclass': {df_titanic['Pclass'].dtype}")
print(f"Categories of 'Pclass': {df_titanic['Pclass'].cat.categories.tolist()}")
print("-" * 60)

print("\n--- DataFrame Info After Conversions ---")
df_titanic[['Sex', 'Embarked', 'Pclass']].info()
print("\nNotice the significant memory usage reduction for 'Sex' and 'Embarked'.")
print("For 'Pclass', it's about semantic meaning more than memory for small numbers.")
print("-" * 60)

# --- Common Operations with Categorical Data ---

# Faster value_counts()
print("\n--- Value Counts on Categorical 'Sex' ---")
print(df_titanic['Sex'].value_counts())
print("-" * 60)

# Accessing specific categorical attributes
print("\n--- Unique categories for 'Embarked' ---")
print(df_titanic['Embarked'].cat.categories) # Provides the unique categories
print("-" * 60)

# Ordering categories (optional, but powerful for plotting/sorting)
# Let's say we want Pclass to be ordered explicitly
df_titanic['Pclass_Ordered'] = df_titanic['Pclass'].cat.reorder_categories([1, 2, 3], ordered=True)
print("\n--- Pclass_Ordered (categorical with explicit order) ---")
print(df_titanic['Pclass_Ordered'].head())
print(f"Is Pclass_Ordered ordered? {df_titanic['Pclass_Ordered'].cat.ordered}")
print("-" * 60)

Titanic dataset loaded successfully!

--- Original DataFrame Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Sex       891 non-null    object
 1   Embarked  889 non-null    object
 2   Pclass    891 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 21.0+ KB
------------------------------------------------------------

--- After converting 'Sex' to 'category' ---
dtype of 'Sex': category
Categories of 'Sex': ['female', 'male']
Codes of 'Sex' (internal representation):
0    1
1    0
2    0
3    0
4    1
dtype: int8
------------------------------------------------------------

--- After converting 'Embarked' to 'category' ---
dtype of 'Embarked': category
Categories of 'Embarked': ['C', 'Q', 'S']
------------------------------------------------------------

--- After converting 'Pclass' to 'category' ---
dtype of 'Pclass': category
Ca

In [None]:
'''
When to Convert to category dtype:
When a column has a fixed and limited number of unique values.
When a column's values have a logical order (e.g., 'Small', 'Medium', 'Large').
To save memory for large datasets with many repeated strings.
To improve performance for groupby or value_counts operations.
To ensure proper statistical treatment or plotting for qualitative data.
By converting relevant columns to the category dtype, you make your DataFrame more efficient and semantically richer.

'''

In [None]:
'''
transform the 'Sex' column in your df_titanic DataFrame using a lambda function with apply(). This is a common way to encode categorical variables numerically, which is often required for machine learning models.

Specifically, you're mapping 'male' to 0 and anything else (which will be 'female' in this dataset) to 1.

'''

In [16]:
import pandas as pd

# Define the URL for the Titanic dataset
titanic_dataset_github_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv"

# Read the CSV file into a Pandas DataFrame
try:
    df_titanic = pd.read_csv(titanic_dataset_github_url)
    print("Titanic dataset loaded successfully!")
    print("\n--- Original 'Sex' Column Head ---")
    print(df_titanic['Sex'].head(7))
    print("\nOriginal 'Sex' Value Counts:")
    print(df_titanic['Sex'].value_counts())
    print("-" * 60)
except Exception as e:
    print(f"Error loading CSV file: {e}")
    print("Please check the URL or your internet connection.")

# Apply the lambda function to create a new 'Sex_Encoded' column
# lambda x: 0 if x == "male" else 1
# - For each value 'x' in the 'Sex' column:
# - If 'x' is exactly "male", return 0.
# - Otherwise (if 'x' is "female"), return 1.
df_titanic['Sex_Encoded'] = df_titanic['Sex'].apply(lambda x: 0 if x == "male" else 1)

print("\n--- DataFrame with 'Sex_Encoded' Column ---")
print(df_titanic[['Sex', 'Sex_Encoded']].head(7))

print("\n'Sex_Encoded' Value Counts:")
print(df_titanic['Sex_Encoded'].value_counts())
print("-" * 60)

# Verify the mapping for a couple of rows
print("\n--- Verifying a few specific rows ---")
print(df_titanic[df_titanic['Name'].isin(['Braund, Mr. Owen Harris', 'Cumings, Mrs. John Bradley (Florence Briggs Thayer)'])][['Name', 'Sex', 'Sex_Encoded']])

Titanic dataset loaded successfully!

--- Original 'Sex' Column Head ---
0      male
1    female
2    female
3    female
4      male
5      male
6      male
Name: Sex, dtype: object

Original 'Sex' Value Counts:
Sex
male      577
female    314
Name: count, dtype: int64
------------------------------------------------------------

--- DataFrame with 'Sex_Encoded' Column ---
      Sex  Sex_Encoded
0    male            0
1  female            1
2  female            1
3  female            1
4    male            0
5    male            0
6    male            0

'Sex_Encoded' Value Counts:
Sex_Encoded
0    577
1    314
Name: count, dtype: int64
------------------------------------------------------------

--- Verifying a few specific rows ---
                                                Name     Sex  Sex_Encoded
0                            Braund, Mr. Owen Harris    male            0
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female            1


In [None]:
'''

Explanation
df_titanic['Sex']: We select the 'Sex' column from your DataFrame.
.apply(...): This Pandas method applies a function (in this case, our lambda function) to each element of the Series.
lambda x: 0 if x == "male" else 1: This is an anonymous function:
x represents each individual value in the 'Sex' column ('male' or 'female').
0 if x == "male" else 1: This is a conditional expression. It checks if x is equal to the string "male".
If True (the value is "male"), it returns 0.
If False (the value is not "male", so it's "female"), it returns 1.
df_titanic['Sex_Encoded'] = ...: The results of applying the lambda function to each element are collected into a new Series, 
which is then assigned to a new column named 'Sex_Encoded' in your DataFrame.
This technique is super handy for converting categorical text data into numerical 
representations that many machine learning algorithms can understand and process.

'''

In [17]:
import pandas as pd

# Define the URL for the Titanic dataset
titanic_dataset_github_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv"

# Read the CSV file into a Pandas DataFrame
try:
    df_titanic = pd.read_csv(titanic_dataset_github_url)
    print("Titanic dataset loaded successfully!")
except Exception as e:
    print(f"Error loading CSV file: {e}")
    print("Please check the URL or your internet connection.")

# Get the unique values from the 'Pclass' column
pclass_unique_values = df_titanic['Pclass'].unique()

print("\n--- Unique values in the 'Pclass' column ---")
print(pclass_unique_values)

# You can also check the type of the result
print(f"\nType of the result: {type(pclass_unique_values)}")
print("-" * 60)

# For comparison, you can also use .value_counts() to get unique values
# along with their frequencies.
print("\n--- Value Counts for 'Pclass' (includes frequencies) ---")
print(df_titanic['Pclass'].value_counts())
print("-" * 60)

Titanic dataset loaded successfully!

--- Unique values in the 'Pclass' column ---
[3 1 2]

Type of the result: <class 'numpy.ndarray'>
------------------------------------------------------------

--- Value Counts for 'Pclass' (includes frequencies) ---
Pclass
3    491
1    216
2    184
Name: count, dtype: int64
------------------------------------------------------------


# Date Functionality in Pandas

In [None]:
'''
Pandas offers incredibly robust and versatile functionality for working with dates and times, making it a cornerstone for time series analysis. It primarily uses NumPy's datetime64 dtype, which Pandas wraps into its own Timestamp objects for individual points in time, and Timedelta objects for durations.

Here's a breakdown of key date functionalities:

1. The datetime dtype and Timestamp Objects
Pandas stores date and time information as datetime64 (often displayed as datetime64[ns], indicating nanosecond precision).
Each individual date/time point is a Timestamp object.
'''

In [18]:
'''
2. Converting to Datetime (pd.to_datetime())

This is the most crucial function. Pandas is very good at inferring formats, but you can also explicitly provide them for speed and robustness.

'''

import pandas as pd
import numpy as np

# Sample DataFrame with various date formats
data = {
    'Date_Str1': ['2023-01-15', '2023-02-28', '2023-03-10', '2023-04-05', '2023-05-20'],
    'Date_Str2': ['01/15/2023', '02/28/2023', '03/10/2023', '04/05/2023', '05/20/2023'],
    'Date_Str3': ['15-Jan-2023', '28-Feb-2023', '10-Mar-2023', '05-Apr-2023', '20-May-2023'],
    'Date_Time_Str': ['2023-01-15 10:30:00', '2023-02-28 14:00:00', '2023-03-10 09:15:00', '2023-04-05 11:45:00', '2023-05-20 18:20:00'],
    'Not_A_Date': ['apple', '2023-13-01', 'banana', '2023-02-30', 'orange'], # Invalid dates
    'Value': [10, 20, 15, 25, 30]
}
df = pd.DataFrame(data)

print("--- Original DataFrame Info ---")
df.info()
print("-" * 60)

# Convert columns to datetime
df['Date1'] = pd.to_datetime(df['Date_Str1'])
df['Date2'] = pd.to_datetime(df['Date_Str2'])
df['Date3'] = pd.to_datetime(df['Date_Str3'])
df['DateTime'] = pd.to_datetime(df['Date_Time_Str'])

# Handling errors:
# errors='coerce': Invalid parsing will be set as NaT (Not a Time).
# errors='ignore': Invalid parsing will return the input unchanged.
df['Date_Coerced'] = pd.to_datetime(df['Not_A_Date'], errors='coerce')
df['Date_Ignored'] = pd.to_datetime(df['Not_A_Date'], errors='ignore')

print("\n--- DataFrame Info After pd.to_datetime() ---")
df.info()
print("\n--- Converted Dates and Error Handling ---")
print(df[['Date_Str1', 'Date1', 'Date_Str2', 'Date2', 'Date_Str3', 'Date3', 'DateTime', 'Date_Coerced', 'Date_Ignored']].head())
print("-" * 60)

# Specifying format for faster parsing (if you know it)
# %Y: 4-digit year, %m: 2-digit month, %d: 2-digit day
# %H: hour (24-hour), %M: minute, %S: second
df['Date1_Formatted'] = pd.to_datetime(df['Date_Str1'], format='%Y-%m-%d')
print("\n--- Dates parsed with explicit format ---")
print(df[['Date_Str1', 'Date1_Formatted']].head())
print("-" * 60)

--- Original DataFrame Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Date_Str1      5 non-null      object
 1   Date_Str2      5 non-null      object
 2   Date_Str3      5 non-null      object
 3   Date_Time_Str  5 non-null      object
 4   Not_A_Date     5 non-null      object
 5   Value          5 non-null      int64 
dtypes: int64(1), object(5)
memory usage: 372.0+ bytes
------------------------------------------------------------

--- DataFrame Info After pd.to_datetime() ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Date_Str1      5 non-null      object        
 1   Date_Str2      5 non-null      object        
 2   Date_Str3      5 non-null      object        
 3   

  df['Date_Coerced'] = pd.to_datetime(df['Not_A_Date'], errors='coerce')
  df['Date_Ignored'] = pd.to_datetime(df['Not_A_Date'], errors='ignore')
  df['Date_Ignored'] = pd.to_datetime(df['Not_A_Date'], errors='ignore')


In [19]:
''' 
Creating Date Ranges (pd.date_range())
Useful for generating sequences of dates for time series data.

'''

# Daily frequency for 7 days starting from '2023-01-01'
date_range_daily = pd.date_range(start='2023-01-01', periods=7, freq='D')
print("\n--- Daily Date Range ---")
print(date_range_daily)

# Monthly frequency
date_range_monthly = pd.date_range(start='2023-01-01', end='2023-06-30', freq='MS') # MS for Month Start
print("\n--- Monthly Date Range (Month Start) ---")
print(date_range_monthly)

# Hourly frequency
date_range_hourly = pd.date_range(start='2023-01-01 09:00', periods=5, freq='H')
print("\n--- Hourly Date Range ---")
print(date_range_hourly)
print("-" * 60)


--- Daily Date Range ---
DatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04',
               '2023-01-05', '2023-01-06', '2023-01-07'],
              dtype='datetime64[ns]', freq='D')

--- Monthly Date Range (Month Start) ---
DatetimeIndex(['2023-01-01', '2023-02-01', '2023-03-01', '2023-04-01',
               '2023-05-01', '2023-06-01'],
              dtype='datetime64[ns]', freq='MS')

--- Hourly Date Range ---
DatetimeIndex(['2023-01-01 09:00:00', '2023-01-01 10:00:00',
               '2023-01-01 11:00:00', '2023-01-01 12:00:00',
               '2023-01-01 13:00:00'],
              dtype='datetime64[ns]', freq='h')
------------------------------------------------------------


  date_range_hourly = pd.date_range(start='2023-01-01 09:00', periods=5, freq='H')


In [21]:
'''
4. Accessing Date/Time Components (.dt accessor)
Once a Series is datetime dtype, you can use the .dt accessor to extract various components like year, month, day, day of week, hour, etc.
'''

# Using the 'DateTime' column we created earlier
print("\n--- Accessing Date/Time Components (.dt accessor) ---")
print(f"Year: {df['DateTime'].dt.year.head()}")
print(f"Month: {df['DateTime'].dt.month.head()}")
print(f"Day: {df['DateTime'].dt.day.head()}")
print(f"Hour: {df['DateTime'].dt.hour.head()}")
print(f"Day of Week (Monday=0): {df['DateTime'].dt.dayofweek.head()}")
print(f"Day Name: {df['DateTime'].dt.day_name().head()}")
print(f"Month Name: {df['DateTime'].dt.month_name().head()}")
print(f"Quarter: {df['DateTime'].dt.quarter.head()}")
print("-" * 60)


--- Accessing Date/Time Components (.dt accessor) ---
Year: 0    2023
1    2023
2    2023
3    2023
4    2023
Name: DateTime, dtype: int32
Month: 0    1
1    2
2    3
3    4
4    5
Name: DateTime, dtype: int32
Day: 0    15
1    28
2    10
3     5
4    20
Name: DateTime, dtype: int32
Hour: 0    10
1    14
2     9
3    11
4    18
Name: DateTime, dtype: int32
Day of Week (Monday=0): 0    6
1    1
2    4
3    2
4    5
Name: DateTime, dtype: int32
Day Name: 0       Sunday
1      Tuesday
2       Friday
3    Wednesday
4     Saturday
Name: DateTime, dtype: object
Month Name: 0     January
1    February
2       March
3       April
4         May
Name: DateTime, dtype: object
Quarter: 0    1
1    1
2    1
3    2
4    2
Name: DateTime, dtype: int32
------------------------------------------------------------


In [None]:
'''
5. Time Series Indexing
Setting a datetime column as the DataFrame's index enables powerful time-based indexing and slicing.
'''

In [22]:
df_time_indexed = df.set_index('DateTime')

print("\n--- DataFrame with DateTime Index ---")
print(df_time_indexed.head())

# Slicing by date
print("\n--- Slicing by Year (2023) ---")
print(df_time_indexed['2023']) # Selects all data for 2023

print("\n--- Slicing by Month (March 2023) ---")
print(df_time_indexed['2023-03'])

print("\n--- Slicing a Date Range ---")
print(df_time_indexed['2023-02-28':'2023-04-05'])
print("-" * 60)


--- DataFrame with DateTime Index ---
                      Date_Str1   Date_Str2    Date_Str3        Date_Time_Str  \
DateTime                                                                        
2023-01-15 10:30:00  2023-01-15  01/15/2023  15-Jan-2023  2023-01-15 10:30:00   
2023-02-28 14:00:00  2023-02-28  02/28/2023  28-Feb-2023  2023-02-28 14:00:00   
2023-03-10 09:15:00  2023-03-10  03/10/2023  10-Mar-2023  2023-03-10 09:15:00   
2023-04-05 11:45:00  2023-04-05  04/05/2023  05-Apr-2023  2023-04-05 11:45:00   
2023-05-20 18:20:00  2023-05-20  05/20/2023  20-May-2023  2023-05-20 18:20:00   

                     Not_A_Date  Value      Date1      Date2      Date3  \
DateTime                                                                  
2023-01-15 10:30:00       apple     10 2023-01-15 2023-01-15 2023-01-15   
2023-02-28 14:00:00  2023-13-01     20 2023-02-28 2023-02-28 2023-02-28   
2023-03-10 09:15:00      banana     15 2023-03-10 2023-03-10 2023-03-10   
2023-04-05 11:45:0

KeyError: '2023'

In [23]:
'''

6. Time Deltas (pd.Timedelta)
Represents a duration or difference between two datetime objects.

'''

# Calculate duration between two datetime columns
df['Duration'] = df['DateTime'] - df['Date1']
print("\n--- Time Duration (Timedelta) ---")
print(df[['DateTime', 'Date1', 'Duration']].head())
print(f"Type of Duration column: {df['Duration'].dtype}")

# Create a specific Timedelta
one_day = pd.Timedelta(days=1)
print(f"\nOne day Timedelta: {one_day}")

# Add Timedelta to a datetime
df['DateTime_Plus_1Day'] = df['DateTime'] + one_day
print("\n--- DateTime + 1 Day ---")
print(df[['DateTime', 'DateTime_Plus_1Day']].head())
print("-" * 60)


--- Time Duration (Timedelta) ---
             DateTime      Date1        Duration
0 2023-01-15 10:30:00 2023-01-15 0 days 10:30:00
1 2023-02-28 14:00:00 2023-02-28 0 days 14:00:00
2 2023-03-10 09:15:00 2023-03-10 0 days 09:15:00
3 2023-04-05 11:45:00 2023-04-05 0 days 11:45:00
4 2023-05-20 18:20:00 2023-05-20 0 days 18:20:00
Type of Duration column: timedelta64[ns]

One day Timedelta: 1 days 00:00:00

--- DateTime + 1 Day ---
             DateTime  DateTime_Plus_1Day
0 2023-01-15 10:30:00 2023-01-16 10:30:00
1 2023-02-28 14:00:00 2023-03-01 14:00:00
2 2023-03-10 09:15:00 2023-03-11 09:15:00
3 2023-04-05 11:45:00 2023-04-06 11:45:00
4 2023-05-20 18:20:00 2023-05-21 18:20:00
------------------------------------------------------------


In [24]:
'''
7. Resampling (Brief Mention)
For time series with a datetime index, resample() allows you to change the frequency of your data (e.g., from daily to monthly average, or hourly to daily sum).

'''
# Assuming df_time_indexed from above
# Resample to monthly frequency, taking the mean of 'Value'
if not df_time_indexed.empty: # Check if it's not empty after slicing
    df_resampled_monthly = df_time_indexed['Value'].resample('M').mean()
    print("\n--- Resampled Monthly Mean ('Value') ---")
    print(df_resampled_monthly)
    print("-" * 60)
else:
    print("\nDataFrame for resampling is empty.")
    print("-" * 60)


--- Resampled Monthly Mean ('Value') ---
DateTime
2023-01-31    10.0
2023-02-28    20.0
2023-03-31    15.0
2023-04-30    25.0
2023-05-31    30.0
Freq: ME, Name: Value, dtype: float64
------------------------------------------------------------


  df_resampled_monthly = df_time_indexed['Value'].resample('M').mean()


# groupby in pandas


In [None]:
'''

groupby() is one of the most powerful and frequently used operations in Pandas. It allows you to group rows of data together based on common values in one or more columns and then perform aggregate calculations or transformations on those groups.

It follows a "split-apply-combine" strategy:

Split: The DataFrame is divided into groups based on the values in the specified column(s).
Apply: A function (e.g., sum, mean, count, custom function) is applied to each individual group.
Combine: The results from each group are combined into a new DataFrame or Series.
Basic Syntax
You typically use df.groupby('column_name') or df.groupby(['column1', 'column2']) followed by an aggregation function.

Key Operations with groupby()
Once you've grouped your data, you can apply various operations:

Aggregation: Apply an aggregation function to each group, reducing each group to a single value per column (e.g., sum(), mean(), count(), min(), max(), median(), std()).
Transformation: Apply a function to each group that returns an object of the same size as the group. This is useful for normalizing data within groups or filling missing values based on group statistics.
Filtration: Filter out groups based on some criterion.
Example with Titanic Dataset
Let's use the Titanic dataset to understand how groupby() works.

'''

'groupby() is one of the most powerful and frequently used operations in Pandas. It allows you to group rows of data together based on common values in one or more columns and then perform aggregate calculations or transformations on those groups.\n\nIt follows a "split-apply-combine" strategy:\n\nSplit: The DataFrame is divided into groups based on the values in the specified column(s).\nApply: A function (e.g., sum, mean, count, custom function) is applied to each individual group.\nCombine: The results from each group are combined into a new DataFrame or Series.\nBasic Syntax\nYou typically use df.groupby(\'column_name\') or df.groupby([\'column1\', \'column2\']) followed by an aggregation function.\n\nKey Operations with groupby()\nOnce you\'ve grouped your data, you can apply various operations:\n\nAggregation: Apply an aggregation function to each group, reducing each group to a single value per column (e.g., sum(), mean(), count(), min(), max(), median(), std()).\nTransformation

In [26]:
import pandas as pd

# Load the Titanic dataset
titanic_dataset_github_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv"
try:
    df_titanic = pd.read_csv(titanic_dataset_github_url)
    print("Titanic dataset loaded successfully!")
    print("\n--- Original DataFrame Head ---")
    print(df_titanic[['Sex', 'Pclass', 'Age', 'Fare', 'Survived']].head())
    print("-" * 70)
except Exception as e:
    print(f"Error loading CSV file: {e}")
    print("Please check the URL or your internet connection.")

# --- 1. Group by a single column ('Sex') and calculate the mean of numerical columns ---
# This will calculate the mean for 'Age', 'Fare', 'Survived', etc., for males and females separately.
grouped_by_sex = df_titanic.groupby('Sex').mean(numeric_only=True) # numeric_only=True to avoid warning for non-numeric cols

print("\n--- Mean values grouped by 'Sex' ---")
print(grouped_by_sex[['Age', 'Fare', 'Survived']])
print("\nInterpretation: On average, females survived more, and paid higher fares.")
print("-" * 70)

# --- 2. Group by multiple columns ('Sex' and 'Pclass') and calculate the mean ---
# This creates subgroups for each combination of Sex and Pclass (e.g., male 1st class, female 2nd class).
grouped_by_sex_pclass = df_titanic.groupby(['Sex', 'Pclass']).mean(numeric_only=True)

print("\n--- Mean values grouped by 'Sex' and 'Pclass' ---")
print(grouped_by_sex_pclass[['Age', 'Fare', 'Survived']])
print("\nInterpretation: Survival rates vary significantly by both sex and class.")
print("-" * 70)

# --- 3. Applying specific aggregations to specific columns (.agg()) ---
# Calculate mean age and total fare for each sex
agg_results = df_titanic.groupby('Sex').agg(
    Avg_Age=('Age', 'mean'),
    Total_Fare=('Fare', 'sum'),
    Survival_Rate=('Survived', 'mean') # Mean of 0s and 1s gives proportion
)

print("\n--- Specific aggregations using .agg() ---")
print(agg_results)
print("-" * 70)

# --- 4. Using .size() to count items in each group ---
group_sizes = df_titanic.groupby(['Sex', 'Pclass']).size()
print("\n--- Number of passengers in each Sex/Pclass group (.size()) ---")
print(group_sizes)
print("-" * 70)

# --- 5. Transformation (brief example: filling missing 'Age' with group mean) ---
# Fill missing 'Age' values with the mean age of their respective Pclass
# Note: This modifies the 'Age' column based on group means
print("\n--- Age (original vs. filled by Pclass mean) ---")
# Count NaNs before transformation
original_age_nan_count = df_titanic['Age'].isnull().sum()
print(f"NaNs in Age before transformation: {original_age_nan_count}")

df_titanic['Age_Filled_by_Pclass_Mean'] = df_titanic.groupby('Pclass')['Age'].transform(lambda x: x.fillna(x.mean()))

# Count NaNs after transformation
filled_age_nan_count = df_titanic['Age_Filled_by_Pclass_Mean'].isnull().sum()
print(f"NaNs in Age after transformation: {filled_age_nan_count}")

# Show some rows where Age was NaN and is now filled
print("\nExamples of Age filled by Pclass mean (Original Age vs. New Age):")
# Filter for rows where original Age was NaN and show both columns
print(df_titanic[df_titanic['Age'].isnull()][['Age', 'Age_Filled_by_Pclass_Mean', 'Pclass']].head())
print("-" * 70)

Titanic dataset loaded successfully!

--- Original DataFrame Head ---
      Sex  Pclass   Age     Fare  Survived
0    male       3  22.0   7.2500         0
1  female       1  38.0  71.2833         1
2  female       3  26.0   7.9250         1
3  female       1  35.0  53.1000         1
4    male       3  35.0   8.0500         0
----------------------------------------------------------------------

--- Mean values grouped by 'Sex' ---
              Age       Fare  Survived
Sex                                   
female  27.915709  44.479818  0.742038
male    30.726645  25.523893  0.188908

Interpretation: On average, females survived more, and paid higher fares.
----------------------------------------------------------------------

--- Mean values grouped by 'Sex' and 'Pclass' ---
                     Age        Fare  Survived
Sex    Pclass                                 
female 1       34.611765  106.125798  0.968085
       2       28.722973   21.970121  0.921053
       3       21.7500

# Pandas Joins

In [None]:
'''
Pandas joins, primarily performed using the pd.merge() function, are used to combine two or more DataFrames based on common columns or indices (keys). This is analogous to JOIN operations in SQL databases.

The goal is to bring together related information that is spread across different tables (DataFrames) into a single, more comprehensive table.

The Core Function: pd.merge()
pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None, suffixes=('_x', '_y'), ...)

Key Parameters:

left, right: The two DataFrames you want to join.
how: Specifies the type of join. This is the most crucial parameter, defining which rows are kept from the joined DataFrames.
'inner' (default): Keeps only the rows where the join key(s) exist in both DataFrames. This is the most restrictive join.
'left': Keeps all rows from the left DataFrame and matching rows from the right. If there's no match in the right DataFrame, NaNs are filled for the right DataFrame's columns.
'right': Keeps all rows from the right DataFrame and matching rows from the left. If there's no match in the left DataFrame, NaNs are filled for the left DataFrame's columns.
'outer': Keeps all rows from both DataFrames. If a key is present in one DataFrame but not the other, NaNs are filled for the missing DataFrame's columns.
on: Column or list of column names to join on. These columns must exist in both DataFrames.
left_on, right_on: Column or list of column names in the left and right DataFrames respectively to join on, when column names are different in the two DataFrames.
suffixes: A tuple of strings to append to overlapping column names (not join keys) from the left and right DataFrames, respectively.
Simple Example
Let's create two small DataFrames: one for employee names and IDs, and another for employee IDs and their departments.

'''

In [27]:
import pandas as pd

# DataFrame 1: Employee Names
df_employees = pd.DataFrame({
    'EmpID': [101, 102, 103, 104, 105],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve']
})

# DataFrame 2: Employee Departments and Salaries
df_departments = pd.DataFrame({
    'Employee_ID': [102, 103, 105, 106, 107], # Note: Different Emp IDs here
    'Department': ['HR', 'Finance', 'IT', 'Marketing', 'Sales'],
    'Salary': [60000, 75000, 80000, 50000, 65000]
})

print("--- df_employees ---")
print(df_employees)
print("\n--- df_departments ---")
print(df_departments)
print("-" * 50)

# --- 1. Inner Join ---
# Only employees present in BOTH DataFrames will be included.
# Joining on 'EmpID' from df_employees and 'Employee_ID' from df_departments
inner_join_df = pd.merge(df_employees, df_departments, left_on='EmpID', right_on='Employee_ID', how='inner')

print("\n--- Inner Join (matching IDs in both) ---")
print(inner_join_df)
print("\nExplanation: Only EmpIDs 102, 103, 105 exist in both DataFrames.")
print("-" * 50)

# --- 2. Left Join ---
# All employees from df_employees (left), plus matching department info.
# Non-matching will have NaN for department columns.
left_join_df = pd.merge(df_employees, df_departments, left_on='EmpID', right_on='Employee_ID', how='left')

print("\n--- Left Join (all from left, matching from right) ---")
print(left_join_df)
print("\nExplanation: All employees from df_employees are kept. David (104) has NaN for department/salary.")
print("-" * 50)

# --- 3. Right Join ---
# All departments from df_departments (right), plus matching employee names.
# Non-matching will have NaN for name columns.
right_join_df = pd.merge(df_employees, df_departments, left_on='EmpID', right_on='Employee_ID', how='right')

print("\n--- Right Join (all from right, matching from left) ---")
print(right_join_df)
print("\nExplanation: All departments from df_departments are kept. Marketing (106) and Sales (107) have NaN for name.")
print("-" * 50)

# --- 4. Outer Join ---
# All rows from both DataFrames, with NaNs where no match exists.
outer_join_df = pd.merge(df_employees, df_departments, left_on='EmpID', right_on='Employee_ID', how='outer')

print("\n--- Outer Join (all from both) ---")
print(outer_join_df)
print("\nExplanation: All unique IDs from both DataFrames are included, with NaNs where there's no match.")
print("-" * 50)

--- df_employees ---
   EmpID     Name
0    101    Alice
1    102      Bob
2    103  Charlie
3    104    David
4    105      Eve

--- df_departments ---
   Employee_ID Department  Salary
0          102         HR   60000
1          103    Finance   75000
2          105         IT   80000
3          106  Marketing   50000
4          107      Sales   65000
--------------------------------------------------

--- Inner Join (matching IDs in both) ---
   EmpID     Name  Employee_ID Department  Salary
0    102      Bob          102         HR   60000
1    103  Charlie          103    Finance   75000
2    105      Eve          105         IT   80000

Explanation: Only EmpIDs 102, 103, 105 exist in both DataFrames.
--------------------------------------------------

--- Left Join (all from left, matching from right) ---
   EmpID     Name  Employee_ID Department   Salary
0    101    Alice          NaN        NaN      NaN
1    102      Bob        102.0         HR  60000.0
2    103  Charlie      

In [None]:
# In brief, pd.merge() is your go-to function for combining DataFrames, 
# and the how parameter is key to controlling which data is retained during the merge operation.