In [None]:
# 1. pd.read_csv()
import pandas as pd

# Frequently used arguments:
# filepath_or_buffer (str): Location (path or URL) of the CSV file to read.
# sep (str, optional): Character that separates fields; ',' unless overridden.
# header (int or list of int, optional): Row number(s) that hold the column names.
# index_col (int, str, sequence of int / str, or False, optional): Column(s) to use as the row index.

# Read the CSV file into a DataFrame (the file 'data.csv' must exist in the working directory)
df = pd.read_csv(filepath_or_buffer='data.csv')

# Preview the top of the table (5 rows)
print(df.head(n=5))
# Output: the first 5 rows of the CSV data together with the column headers.

In [None]:
# 2. pd.DataFrame()
# Frequently used arguments:
# data (array-like, dict, or DataFrame): Values to store in the DataFrame.
# index (array-like, optional): Row labels.
# columns (array-like, optional): Column labels.

# Build a DataFrame from a mapping of column name -> list of values
data = dict(
    Name=['Alice', 'Bob', 'Charlie'],
    Age=[24, 27, 22],
    City=['New York', 'Los Angeles', 'Chicago'],
)
df = pd.DataFrame(data=data)

# Show the whole (small) table
print(df)
# Output: 3 rows with the columns Name, Age and City.

In [None]:
# 3. .head()
# Arguments:
# n (int): How many rows to return, counted from the top. Defaults to 5.

# Show the leading rows of the DataFrame
print(df.head(n=3))  # n=3 -> the first 3 rows
# Output: the first 3 rows of the DataFrame.

In [None]:
# 4. .tail()
# Arguments:
# n (int): How many rows to return, counted from the bottom. Defaults to 5.

# Show the trailing rows of the DataFrame
print(df.tail(n=2))  # n=2 -> the last 2 rows
# Output: the last 2 rows of the DataFrame.

In [None]:
# 5. .info()
# Arguments: none are needed for the default summary used here.

# Print a compact overview of the DataFrame.
# Note: .info() writes its report directly, so no print() wrapper is used.
df.info()
# Output: index range, column names, non-null counts, dtypes and memory usage.

In [None]:
# 6. .describe()
# Arguments:
# percentiles (list-like of numbers, optional): Which percentiles to report.
# include (str or list-like, optional): Data types to include (e.g., 'all').
# exclude (str or list-like, optional): Data types to leave out.

# Summary statistics; with default arguments only numerical columns are described
summary_stats = df.describe()
print(summary_stats)
# Output: count, mean, std, min, 25%, 50%, 75% and max for each numerical column.

In [None]:
# 7. .shape
# Arguments: none (.shape is an attribute, not a method)

# .shape is a (rows, columns) tuple
n_rows, n_cols = df.shape
print((n_rows, n_cols))  # e.g. (3, 3) -> 3 rows and 3 columns
# Output: tuple with the DataFrame dimensions, e.g. (3, 3).

In [None]:
# 8. .columns
# Arguments: none (.columns is an attribute, not a method)

# The column labels of the DataFrame
column_labels = df.columns
print(column_labels)
# Output: an Index of column names (e.g., Index(['Name', 'Age', 'City'], dtype='object')).

In [None]:
# 9. .dtypes
# Arguments: none (.dtypes is an attribute, not a method)

# The data type of every column
column_types = df.dtypes
print(column_types)
# Output: a Series mapping column name -> dtype (e.g., Name: object, Age: int64, City: object).

In [None]:
# 10. .isnull()
# Arguments: none

# Flag missing values cell by cell
null_mask = df.isnull()
print(null_mask)
# Output: boolean DataFrame of the same shape (True where a value is missing, False otherwise).

In [None]:
# 11. .fillna()
# Arguments:
# value (scalar, dict, Series, or DataFrame): Replacement for missing values.
# method (str, optional): Fill strategy ('backfill', 'bfill', 'pad', 'ffill').
#   Deprecated since pandas 2.1 - prefer DataFrame.ffill() / DataFrame.bfill().
# axis (int or str, optional): Axis to fill along (0 for rows, 1 for columns).
# inplace (bool, optional): Modify the DataFrame itself instead of returning a new one (default False).

# Replace every missing value with a constant; the original `df` is left untouched
df_filled = df.fillna(value=0)
print(df_filled)
# Output: DataFrame in which missing values (if any) are replaced by 0.

In [None]:
# 12. .dropna()
# Arguments:
# axis (int or str, optional): 0 drops rows, 1 drops columns.
# how (str, optional): 'any' (default) drops when at least one value is missing; 'all' only when every value is missing.
# inplace (bool, optional): Modify the DataFrame itself instead of returning a new one (default False).
# subset (array-like, optional): Labels to inspect when looking for missing values.

# Remove every row that has at least one missing value (defaults spelled out)
df_dropped = df.dropna(axis=0, how='any')
print(df_dropped)
# Output: DataFrame without the rows that contained null values.

In [None]:
# 13. .groupby()
# Parameters:
# by (str or list of str): Column(s) to group by.
# as_index (bool, optional): If True (default), the group labels become the index;
#   otherwise they are kept as regular columns.
# (The `axis` argument is deprecated in recent pandas releases and is not used here.)

# Group by 'City' and average the numeric columns of each group.
# numeric_only=True is needed on pandas >= 2.0: the default became False there, so
# averaging the text column 'Name' would raise a TypeError.
grouped = df.groupby('City').mean(numeric_only=True)
print(grouped)
# Output: DataFrame indexed by city with the mean of every numeric column (here: Age).

In [None]:
# 14. .apply()
# Arguments:
# func (function): Function to apply. On a Series (as below) it receives one element at a time.
# axis (int, optional): DataFrame.apply only - 0 passes each column to func, 1 passes each row.
# result_type (str, optional): DataFrame.apply only - 'expand', 'reduce', 'broadcast' (default None).


def add_ten(age):
    """Return the given age increased by 10."""
    return age + 10


# Apply the function to every element of the 'Age' column and store the result
# as a new column. Note: this adds 'Age_plus_10' to `df` itself.
df['Age_plus_10'] = df['Age'].apply(add_ten)
print(df)
# Output: DataFrame with an extra 'Age_plus_10' column holding Age + 10.

In [None]:
# 15. .loc[]
# Indexers:
# row_label(s): Label(s) of the rows to select.
# column_label(s): Label(s) of the columns to select.
# Slices and boolean masks are accepted as well.

# Label-based selection. A .loc slice includes its end label, so ':1' covers
# the rows labelled 0 and 1 of the default integer index.
wanted_columns = ['Name', 'City']
print(df.loc[:1, wanted_columns])
# Output: the first two rows, restricted to the Name and City columns.

In [None]:
# 16. .iloc[]
# Indexers:
# row_position(s): Integer position(s) of the rows to select.
# column_position(s): Integer position(s) of the columns to select.
# Integer slices are accepted (end position excluded, as in plain Python).

# Position-based selection: rows 0-1 and columns 0-1
print(df.iloc[:2, :2])
# Output: the first two rows and the first two columns.

In [None]:
# 17. .merge()
# Arguments:
# right (DataFrame): The other DataFrame to join with.
# how (str, optional): Join type ('left', 'right', 'outer', 'inner'); 'inner' by default.
# on (label or list, optional): Key column(s) present in both frames. If omitted, the shared columns are used.
# left_on/right_on (label or list, optional): Key column(s) when they are named differently on each side.

# Two small tables that share the key column 'ID'
df1 = pd.DataFrame(dict(ID=[1, 2, 3], Name=['Alice', 'Bob', 'Charlie']))
df2 = pd.DataFrame(dict(ID=[1, 2, 3], Age=[24, 27, 22]))

# Join them on the shared key (method form of pd.merge)
merged_df = df1.merge(df2, on='ID')
print(merged_df)
# Output: one row per ID with the columns ID, Name and Age.

In [None]:
# 18. pd.concat()
# Parameters:
# objs (list of DataFrames): List of DataFrames to concatenate.
# axis (int, optional): Axis along which to concatenate (0 for rows, 1 for columns). Default is 0.
# join (str, optional): 'inner' for intersection or 'outer' for union. Default is 'outer'.
# ignore_index (bool, optional): If True, discard the original row labels and number the
#   result 0..n-1. Default is False.

# Concatenate two DataFrames along rows
df1 = pd.DataFrame({'Name': ['Alice', 'Bob']})
df2 = pd.DataFrame({'Name': ['Charlie', 'David']})
# ignore_index=True gives a fresh 0..3 index; without it the labels would repeat (0, 1, 0, 1).
concat_df = pd.concat([df1, df2], axis=0, ignore_index=True)
print(concat_df)
# Output: DataFrame combining df1 and df2 vertically with reset index.

In [None]:
# 19. .sort_values()
# Arguments:
# by (str or list of str): Column(s) that define the ordering.
# axis (int, optional): Axis to sort along (0 for rows, 1 for columns); 0 by default.
# ascending (bool or list of bool, optional): True (default) for ascending order.
# inplace (bool, optional): Sort the DataFrame itself instead of returning a new one; False by default.

# Order the rows from the highest to the lowest age
sorted_df = df.sort_values('Age', ascending=False)
print(sorted_df)
# Output: DataFrame ordered by the Age column, largest value first.

In [None]:
# 20. .pivot_table()
# Arguments:
# values (str or list, optional): Column(s) whose values are aggregated.
# index (str or list): Column(s) whose values become the new row index.
# columns (str or list, optional): Column(s) whose values become new columns.
# aggfunc (function or list, default 'mean'): How to aggregate each group.

# Average age per city
pivot_df = df.pivot_table(
    index='City',
    values='Age',
    aggfunc='mean',
)
print(pivot_df)
# Output: DataFrame with the mean Age for every distinct City.

In [None]:
# 21. .rename()
# Arguments:
# mapper (dict or function): Old label -> new label mapping, or a function producing the new label.
# axis (int or str, optional): Which labels the mapper targets: columns (1 or 'columns') or index (0 or 'index').
# inplace (bool, optional): Rename on the DataFrame itself instead of returning a new one; False by default.

# Give two columns new labels (mapper + axis form of columns={...})
df_renamed = df.rename({'Name': 'Full Name', 'Age': 'Years'}, axis='columns')
print(df_renamed)
# Output: DataFrame where 'Name' is now 'Full Name' and 'Age' is now 'Years'.

In [None]:
# 22. .value_counts()
# Arguments:
# normalize (bool, optional): Return relative frequencies instead of counts when True.
# sort (bool, optional): Order the result by frequency; True by default.
# ascending (bool, optional): Order from least to most frequent when True.

# How often does each city occur?
counts = df['City'].value_counts(sort=True)
print(counts)
# Output: Series with the number of occurrences of every distinct City.

In [None]:
# 23. .sample()
# Parameters:
# n (int, optional): Number of items to sample.
# frac (float, optional): Fraction of items to sample.
# replace (bool, optional): Whether to sample with replacement.
# random_state (int, optional): Seed for the random number generator.

# Sample 2 random rows from the DataFrame.
# A fixed random_state makes the cell reproducible: re-running the notebook
# selects the same rows every time.
sampled_df = df.sample(n=2, random_state=42)
print(sampled_df)
# Output: DataFrame containing 2 randomly selected rows.

In [None]:
# 24. .astype()
# Arguments:
# dtype (str, dtype, or dict): Target type for the values.
# errors (str, optional): 'raise' to fail on an impossible cast; 'ignore' to leave the data as is.

# Cast the 'Age' column to floating point.
# Note: this overwrites the column in `df`, so later cells see float ages.
df['Age'] = df['Age'].astype('float64')
print(df.dtypes)
# Output: Series of column dtypes in which Age is now float64.

In [None]:
# 25. .drop()
# Arguments:
# labels (str or list): Row or column label(s) to remove.
# axis (int or str, optional): 0 to drop rows, 1 to drop columns.
# inplace (bool, optional): Drop on the DataFrame itself instead of returning a new one; False by default.

# Remove the 'City' column (labels + axis form of columns=[...])
df_dropped = df.drop('City', axis=1)
print(df_dropped)
# Output: DataFrame without the City column.

In [None]:
# 26. .duplicated()
# Arguments:
# subset (str or list-like, optional): Column(s) to compare when looking for duplicates.
# keep (str, optional): Which occurrence is NOT flagged: 'first' (default), 'last', or False to flag all.
# The result is a boolean Series with one entry per row.

# Flag rows whose 'Name' already appeared in an earlier row
duplicates = df.duplicated(subset=['Name'], keep='first')
print(duplicates)
# Output: boolean Series, True for rows repeating an earlier Name.

In [None]:
# 27. .corr()
# Parameters:
# method (str, optional): Correlation method ('pearson', 'kendall', 'spearman').
# min_periods (int, optional): Minimum number of observations required.

# Calculate the correlation matrix for numerical columns.
# Text columns ('Name', 'City') are filtered out first: on pandas >= 2.0,
# calling .corr() on a frame that still contains them raises a ValueError.
correlation_matrix = df.select_dtypes(include='number').corr()
print(correlation_matrix)
# Output: DataFrame showing correlation coefficients between numerical columns.

In [None]:
# 28. .reset_index()
# Arguments:
# drop (bool, optional): When True the old index is discarded rather than kept as a column.
# inplace (bool, optional): Reset on the DataFrame itself instead of returning a new one; False by default.

# Replace the current index with a fresh 0..n-1 index, discarding the old labels
df_reset = df.reset_index(drop=True, inplace=False)
print(df_reset)
# Output: DataFrame with a sequential integer index.

In [None]:
# 29. .nlargest()
# Arguments:
# n (int): How many rows to return.
# columns (str or list): Column(s) that determine the ordering.

# The 2 rows with the highest 'Age'
top_ages = df.nlargest(n=2, columns='Age')
print(top_ages)
# Output: DataFrame holding the 2 rows with the largest Age values.

In [None]:
# 30. .nsmallest()
# Arguments:
# n (int): How many rows to return.
# columns (str or list): Column(s) that determine the ordering.

# The 2 rows with the lowest 'Age'
smallest_ages = df.nsmallest(n=2, columns='Age')
print(smallest_ages)
# Output: DataFrame holding the 2 rows with the smallest Age values.

In [None]:
# 31. .set_index()
# Arguments:
# keys (str or array-like): Column(s) that become the index.
# drop (bool, optional): Remove the key column(s) from the data once they are the index; True by default.
# inplace (bool, optional): Change the DataFrame itself instead of returning a new one; False by default.

# Use the 'Name' column as row labels
df_indexed = df.set_index(keys='Name')
print(df_indexed)
# Output: DataFrame indexed by 'Name'.

In [None]:
# 32. .sort_index()
# Arguments:
# axis (int, optional): Which axis' labels to sort (0 for rows, 1 for columns).
# ascending (bool, optional): True (default) for ascending order.
# inplace (bool, optional): Sort the DataFrame itself instead of returning a new one; False by default.

# Order the rows of the Name-indexed frame by their index labels
sorted_by_index = df_indexed.sort_index(ascending=True)
print(sorted_by_index)
# Output: DataFrame ordered by its index (alphabetical 'Name' order for a Name index).

In [None]:
# 33. .cumsum()
# Arguments:
# axis (int, optional): Direction of accumulation (0 = down the rows, 1 = across the columns).
# skipna (bool, optional): Ignore NA/null values; True by default.

# Running total of 'Age' (double brackets keep the result a DataFrame)
cumsum_df = df[['Age']].cumsum(axis=0)
print(cumsum_df)
# Output: DataFrame with the cumulative sum of the 'Age' column.

In [None]:
# 34. .cumprod()
# Arguments:
# axis (int, optional): Direction of accumulation (0 = down the rows, 1 = across the columns).
# skipna (bool, optional): Ignore NA/null values; True by default.

# Running product of 'Age' (double brackets keep the result a DataFrame)
cumprod_df = df[['Age']].cumprod(axis=0)
print(cumprod_df)
# Output: DataFrame with the cumulative product of the 'Age' column.

In [None]:
# 35. .diff()
# Arguments:
# periods (int, optional): Distance (in rows) between the two values that are subtracted; 1 by default.
# axis (int, optional): Axis along which to take the difference; 0 by default.

# Row-to-row change of 'Age'; the first row has no predecessor and becomes NaN
diff_df = df[['Age']].diff(periods=1)
print(diff_df)
# Output: DataFrame with each 'Age' minus the 'Age' of the previous row.

In [None]:
# 36. .rank()
# Arguments:
# axis (int, optional): Axis to rank along; 0 by default.
# method (str, optional): Tie handling ('average', 'min', 'max', 'first', 'dense'); 'average' by default.
# ascending (bool, optional): Smallest value gets rank 1 when True (default).

# Rank the ages (defaults spelled out)
rank_df = df[['Age']].rank(method='average', ascending=True)
print(rank_df)
# Output: DataFrame with the rank of every 'Age' value.

In [None]:
# 37. .explode()
# Arguments:
# column (str): Column whose list-like entries are expanded into separate rows.
# ignore_index (bool, optional): Renumber the result 0..n-1 when True; False by default.

# A frame whose 'Hobbies' column stores a list per person
df_explode = pd.DataFrame(
    dict(
        Name=['Alice', 'Bob'],
        Hobbies=[['Reading', 'Cooking'], ['Sports']],
    )
)
# One output row per list element; the original row label is repeated
exploded_df = df_explode.explode(column='Hobbies')
print(exploded_df)
# Output: one row per hobby, with the matching 'Name' repeated.

In [None]:
# 38. .transpose()
# Arguments: none are needed here

# Flip the table: row labels become column labels and vice versa
# (`df.T` is the shorthand attribute for the same operation)
transposed_df = df.transpose()
print(transposed_df)
# Output: DataFrame with rows and columns exchanged.

In [None]:
# 39. .memory_usage()
# Arguments:
# index (bool, optional): Also report the memory used by the index; True by default.
# deep (bool, optional): Inspect object-dtype columns element by element for a more accurate figure.

# Memory footprint per column, including the index and a deep inspection of object columns
memory_usage = df.memory_usage(index=True, deep=True)
print(memory_usage)
# Output: Series with the number of bytes used by the index and by each column.

In [None]:
# 40. .query()
# Arguments:
# expr (str): Boolean expression, written as a string, that selects the rows to keep.
# inplace (bool, optional): Filter the DataFrame itself instead of returning a new one; False by default.

# Keep only the rows that satisfy the condition
query_df = df.query(expr='Age > 25')
print(query_df)
# Output: the rows whose 'Age' exceeds 25.

In [None]:
# 41. .pivot()
# Arguments:
# index (str or list): Column(s) whose values become the row labels.
# columns (str or list): Column(s) whose values become the new column labels.
# values (str, optional): Column that supplies the cell values.

# Reshape: one row per name, one column per city, ages in the cells
pivoted_df = df.pivot(
    index='Name',
    columns='City',
    values='Age',
)
print(pivoted_df)
# Output: DataFrame indexed by 'Name' with one column per distinct 'City', filled with 'Age'
# (NaN where a name/city combination does not occur).

In [None]:
# 42. .melt()
# Arguments:
# id_vars (str or list): Column(s) kept as identifiers on every output row.
# value_vars (str or list, optional): Column(s) to unpivot into rows.
# var_name (str, optional): Name of the column that stores the former column names.
# value_name (str, optional): Name of the column that stores the values.

# Wide -> long: one row per (Name, attribute) pair
melted_df = df.melt(
    id_vars=['Name'],
    value_vars=['Age', 'City'],
    var_name='Attribute',
    value_name='Value',
)
print(melted_df)
# Output: long-format DataFrame with the columns Name, Attribute ('Age' or 'City') and Value.

In [None]:
# 43. .isin()
# Arguments:
# values (list, Series, dict, or DataFrame): Collection to test membership against.

# Keep the rows whose 'City' is one of the listed values
filtered_df = df.loc[df['City'].isin(['New York', 'Chicago'])]
print(filtered_df)
# Output: the rows where 'City' is 'New York' or 'Chicago'.

In [None]:
# 44. .nunique()
# Parameters:
# axis (int, optional): 0 (default) counts the distinct values within each column;
#   1 counts the distinct values within each row.
# dropna (bool, optional): If True (default), NaN is NOT counted as a value;
#   pass False to count NaN as one additional distinct value.

# Get the number of unique values in each column
unique_counts = df.nunique()
print(unique_counts)
# Output: Series showing the count of unique values in each column.

In [None]:
# 45. .select_dtypes()
# Arguments:
# include (str, list-like, or None): Data types to keep.
# exclude (str, list-like, or None): Data types to leave out.

# Keep only the numeric columns
numeric_df = df.select_dtypes(include=['number'])
print(numeric_df)
# Output: DataFrame restricted to its numeric columns.

In [None]:
# 46. .agg()
# Arguments:
# func (function, str, list, or dict): Aggregation(s) to compute.
# axis (int, optional): Axis to aggregate along; 0 by default.

# Several aggregations of 'Age' at once; each one becomes a row of the result
agg_df = df[['Age']].agg(func=['mean', 'min', 'max'])
print(agg_df)
# Output: DataFrame with the rows 'mean', 'min' and 'max' for the 'Age' column.

In [None]:
# 47. .applymap() (renamed to .map() in pandas 2.1)
# Parameters:
# func (function): Function to apply to each element of the DataFrame.

# Apply a function to each element of the DataFrame.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map,
# so use .map when this pandas version provides it and fall back to .applymap otherwise.
age_frame = df[['Age']]
apply_elementwise = age_frame.map if hasattr(age_frame, 'map') else age_frame.applymap
formatted_df = apply_elementwise(lambda x: f"{x} years")
print(formatted_df)
# Output: DataFrame where each value in 'Age' column is formatted as a string
# (e.g., "24 years"; "24.0 years" once 'Age' has been cast to float as in section 24).

In [None]:
# 48. .idxmax()
# Arguments:
# axis (int, optional): Axis to search along; 0 by default.
# skipna (bool, optional): Ignore NA/null values; True by default.

# Index label of the largest 'Age' (called on a single column, so one label is returned)
age_column = df['Age']
max_index = age_column.idxmax()
print(max_index)
# Output: index label of the row holding the highest 'Age'.

In [None]:
# 49. .idxmin()
# Arguments:
# axis (int, optional): Axis to search along; 0 by default.
# skipna (bool, optional): Ignore NA/null values; True by default.

# Index label of the smallest 'Age' (called on a single column, so one label is returned)
age_column = df['Age']
min_index = age_column.idxmin()
print(min_index)
# Output: index label of the row holding the lowest 'Age'.

In [None]:
# 50. .where()
# Arguments:
# cond (boolean Series/DataFrame): Values are kept where this is True.
# other (scalar, Series, or DataFrame, optional): Replacement used where cond is False (NaN if omitted).
# inplace (bool, optional): Change the object itself instead of returning a new one; False by default.

# Keep ages of 25 and above; everything else becomes NaN
filtered_age_df = df['Age'].where(df['Age'].ge(25))
print(filtered_age_df)
# Output: Series in which 'Age' values below 25 are replaced by NaN.

In [None]:
# 51. .applymap() (renamed to .map() in pandas 2.1)
# Parameters:
# func (function): A function to apply to each element of the DataFrame.


def add_units(x):
    """Return numeric scalars formatted as '<x> units'; return any other value unchanged."""
    return f"{x} units" if isinstance(x, (int, float)) else x


# Apply a function to each element of the DataFrame (e.g., format each value as a string).
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map,
# so use .map when this pandas version provides it and fall back to .applymap otherwise.
apply_elementwise = df.map if hasattr(df, 'map') else df.applymap
formatted_df = apply_elementwise(add_units)
print(formatted_df)
# Output: DataFrame with each numeric element formatted as a string (e.g., "24 units");
# non-numeric cells such as names are left as they are.

In [None]:
# 52. .at[]
# Indexers:
# label (scalar): Label of the row.
# column (scalar): Label of the column.

# Read a single cell by its row label and column label
row_label, column_label = 0, 'Age'
value = df.at[row_label, column_label]
print(value)
# Output: the 'Age' stored in the row labelled 0.

In [None]:
# 53. .iat[]
# Indexers:
# row_position (int): Integer position of the row.
# column_position (int): Integer position of the column.

# Read a single cell by integer position: row 0 (first), column 1 (second)
row_position, column_position = 0, 1
value = df.iat[row_position, column_position]
print(value)
# Output: the value stored at that row/column position.

In [None]:
# 54. .any()
# Arguments:
# axis (int, optional): Axis that is reduced; 0 (default) gives one result per column.
# skipna (bool, optional): Ignore NA/null values; True by default.

# Per column: is at least one value truthy (e.g. non-zero)?
any_true = df.any(axis=0)
print(any_true)
# Output: Series with True for every column containing at least one truthy value.

In [None]:
# 55. .all()
# Arguments:
# axis (int, optional): Axis that is reduced; 0 (default) gives one result per column.
# skipna (bool, optional): Ignore NA/null values; True by default.

# Per column: is every value truthy (e.g. non-zero)?
all_true = df.all(axis=0)
print(all_true)
# Output: Series with True for every column whose values are all truthy.

In [None]:
# 56. .combine_first()
# Arguments:
# other (DataFrame): Frame that supplies values wherever the calling frame has none.

# `df1` has gaps; `df2` provides the fallback values
df1 = pd.DataFrame(dict(A=[1, None, 3], B=[4, 5, None]))
df2 = pd.DataFrame(dict(A=[7, 8, 9], B=[10, 11, 12]))

# Values of df1 win; only its missing cells are taken from df2
combined_df = df1.combine_first(other=df2)
print(combined_df)
# Output: `df1` with its missing values filled in from `df2`.

In [None]:
# 57. .duplicated()
# Arguments:
# subset (str or list-like, optional): Column(s) to compare when looking for duplicates.
# keep (str, optional): Which occurrence is NOT flagged: 'first', 'last', or False to flag all.

# Flag every row whose 'Name' occurs more than once (keep=False marks all occurrences)
duplicates = df.duplicated(subset=['Name'], keep=False)
print(duplicates)
# Output: boolean Series, True for each row sharing its 'Name' with another row.

In [None]:
# 58. .equals()
# Arguments:
# other (DataFrame): Frame to compare against.

# Two frames built from identical data
df1 = pd.DataFrame(dict(A=[1, 2, 3], B=[4, 5, 6]))
df2 = pd.DataFrame(dict(A=[1, 2, 3], B=[4, 5, 6]))

# Compare them as a whole
are_equal = df1.equals(other=df2)
print(are_equal)
# Output: True when `df1` and `df2` are exactly the same, False otherwise.

In [None]:
# 59. .set_axis()
# Parameters:
# labels (array-like): The new labels to set; one label per existing row/column is required.
# axis (int, optional): The axis to set labels for (0 for rows, 1 for columns).
# (The `inplace` argument was removed in pandas 2.0; set_axis always returns a new object.)

# Set new column names.
# The labels are generated from the current number of columns because `df` may have
# gained columns in earlier cells (e.g. 'Age_plus_10' in section 14); a fixed list of
# three names would then fail with a length mismatch.
new_column_names = [f'Col{position}' for position in range(1, df.shape[1] + 1)]
df_set_axis = df.set_axis(new_column_names, axis=1)
print(df_set_axis)
# Output: DataFrame with updated column names (Col1, Col2, ...).

In [None]:
# 60. .squeeze()
# Arguments:
# axis (None or int, optional): Restrict squeezing to one axis; by default every length-1 axis is squeezed.

# A DataFrame with a single column ...
single_col_df = pd.DataFrame(dict(A=[1, 2, 3]))
# ... collapses to a Series
squeezed_series = single_col_df.squeeze(axis=None)
print(squeezed_series)
# Output: a Series, because the DataFrame has only one column.

In [None]:
# 61. .copy()
# Parameters:
# deep (bool, optional): If True (the default), the data and index are copied, so changes
#   to the copy do not affect the original. If False, the new object shares the
#   original's data.

# Create an independent (deep) copy of the DataFrame
df_copy = df.copy(deep=True)
print(df_copy)
# Output: A deep copy of the original DataFrame.

## Data Cleaning

In [None]:
# 1. Forward Filling (`.ffill()`)
# Parameters:
# axis (int, optional): Axis along which to fill (0 for index/rows, 1 for columns). Default is 0.
# inplace (bool, optional): If True, modifies the DataFrame in place. Default is False.
# limit (int, optional): Maximum number of consecutive missing values to fill.
# Note: the older spelling `.fillna(method='ffill')` is deprecated since pandas 2.1.

# Sample DataFrame with missing values
df = pd.DataFrame({'A': [1, None, None, 4, 5]})

# Forward fill missing values: each gap takes the last value seen above it
df_ffill = df.ffill()
print(df_ffill)
# Output: DataFrame with missing values filled using the last known value.
#     A
# 0  1.0
# 1  1.0
# 2  1.0
# 3  4.0
# 4  5.0

In [None]:
# 2. Backward Filling (`.bfill()`)
# Parameters:
# axis (int, optional): Axis along which to fill (0 for index/rows, 1 for columns). Default is 0.
# inplace (bool, optional): If True, modifies the DataFrame in place. Default is False.
# limit (int, optional): Maximum number of consecutive missing values to fill.
# Note: the older spelling `.fillna(method='bfill')` is deprecated since pandas 2.1.

# Backward fill missing values: each gap takes the next value found below it
df_bfill = df.bfill()
print(df_bfill)
# Output: DataFrame with missing values filled using the next known value.
#     A
# 0  1.0
# 1  4.0
# 2  4.0
# 3  4.0
# 4  5.0

In [None]:
# 3. Clipping Values (`.clip()`)
# Arguments:
# lower (float or int, optional): Floor - every value below it is raised to `lower`.
# upper (float or int, optional): Ceiling - every value above it is lowered to `upper`.
# axis (int, optional): Axis used to align `lower`/`upper` when they are array-like. Default is None.

# Limit the values of the frame to the range [2, 4]; missing values stay missing
df_clipped = df.clip(2, 4)
print(df_clipped)
# Output: DataFrame whose values are confined to the given range.
#     A
# 0  2.0
# 1  NaN
# 2  NaN
# 3  4.0
# 4  4.0

In [None]:
# 4. Winsorizing (Using `scipy.stats.mstats.winsorize`)
# Parameters:
# limits (tuple of floats): Limits on both sides, as a proportion (e.g., (0.1, 0.1) limits the bottom 10% and top 10%).
# inclusive (tuple of bools, optional): Whether the number of values clipped on each side is
#   truncated (True) or rounded (False). Default is (True, True).

from scipy.stats.mstats import winsorize

# Dedicated sample data with one obvious outlier and no missing values.
# The number of values replaced on each side is int(limit * n); with only a handful
# of rows and 5% limits that number is 0 and nothing would change, so the example
# uses 10 rows and 10% limits (one value replaced on each side).
df_outliers = pd.DataFrame({'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 100]})

# Winsorize column 'A' into a new DataFrame; the input frame is left untouched.
# winsorize returns a masked array, so `.data` extracts the plain ndarray.
df_winsorized = df_outliers.assign(
    A=winsorize(df_outliers['A'].to_numpy(), limits=[0.1, 0.1]).data
)
print(df_winsorized)
# Output: DataFrame with extreme values at the tails limited to the specified percentiles
# (the lowest value 1 becomes 2 and the highest value 100 becomes 9).

In [None]:
# 5. Converting a String Date into a Datetime Object (`pd.to_datetime()`)
# Parameters:
# arg (str, list-like, Series, or DataFrame): Dates to convert.
# format (str, optional): Explicit date format (e.g., '%Y-%m-%d'); supplying it when known speeds up parsing.
# errors (str, optional): 'raise' to raise an error, 'coerce' to set invalid parsing as NaT, 'ignore' to skip parsing errors.

# Build a sample DataFrame whose 'Date' column holds plain strings
date_strings = ['2023-01-01', '2023-02-15', '2023-03-20']
df_dates = pd.DataFrame({'Date': date_strings})

# Swap the string column for its parsed datetime equivalent
df_dates = df_dates.assign(Date=pd.to_datetime(df_dates['Date']))
print(df_dates)
# Output: DataFrame with 'Date' column converted to datetime objects.
#         Date
# 0 2023-01-01
# 1 2023-02-15
# 2 2023-03-20

In [None]:
# 6. Changing a Column to Become the Index (`.set_index()`)
# Parameters:
# keys (str or list): Column(s) to promote to the index.
# drop (bool, optional): Whether the promoted column(s) are removed from the columns. Default is True.
# inplace (bool, optional): If True, modifies the DataFrame in place. Default is False.

# Promote the 'Date' column to the index; it is the only column, so no columns remain
df_dates_indexed = df_dates.set_index(keys='Date')
print(df_dates_indexed)
# Output: DataFrame with 'Date' as the index.
# Empty DataFrame
# Columns: []
# Index: [2023-01-01, 2023-02-15, 2023-03-20]

In [None]:
# 7. Resetting the Index to a Column (`.reset_index()`)
# Parameters:
# drop (bool, optional): If True, discard the old index instead of inserting it as a column. Default is False.
# inplace (bool, optional): If True, modifies the DataFrame in place. Default is False.

# Move the 'Date' index back into a regular column and restore a 0..n-1 index
df_reset = df_dates_indexed.reset_index(drop=False)
print(df_reset)
# Output: DataFrame with the index reset and added back as a column.
#         Date
# 0 2023-01-01
# 1 2023-02-15
# 2 2023-03-20

In [None]:
# 8. Changing an Object Column to Integer or Float (`.astype()`)
# Parameters:
# dtype (str or dict): Target type, or a {column: type} mapping (e.g., 'int', 'float').
# errors (str, optional): 'raise' to raise an error if casting fails, 'ignore' to skip casting errors.

# Sample DataFrame whose numbers are stored as strings
df_types = pd.DataFrame({'Value': ['10', '20', '30']})

# Cast the 'Value' column to integer via a per-column dtype mapping
df_types = df_types.astype({'Value': int})
print(df_types)
# Output: DataFrame with 'Value' column converted to integer type.
#    Value
# 0     10
# 1     20
# 2     30

In [None]:
# 9. Replacing Specific Values in a DataFrame (`.replace()`)
# Parameters:
# to_replace (scalar, list, dict, or Series): Value(s) to look for.
# value (scalar, list, dict, or Series): Value(s) to substitute.
# inplace (bool, optional): If True, modifies the DataFrame in place. Default is False.

# Swap every occurrence of 10 for 100
df_replace = df_types.replace(to_replace=10, value=100)
print(df_replace)
# Output: DataFrame with specified values replaced.
#    Value
# 0    100
# 1     20
# 2     30

In [None]:
# 10. Dropping Rows with Missing Values (`.dropna()`)
# Parameters:
# axis (int, optional): Axis to drop from (0 for rows, 1 for columns). Default is 0.
# how (str, optional): 'any' drops on a single NA value, 'all' drops only when every value is NA. Default is 'any'.
# subset (list, optional): Columns to consider for NA values when dropping.

# Remove each row that contains at least one missing value (defaults spelled out)
df_dropped = df.dropna(axis=0, how='any')
print(df_dropped)
# Output: DataFrame with rows containing NA values dropped.

In [None]:
# 11. Removing Duplicates (`.drop_duplicates()`)
# Parameters:
# subset (str or list, optional): Columns to compare when identifying duplicates.
# keep (str, optional): 'first' keeps the first occurrence, 'last' keeps the last, False drops every duplicate.
# inplace (bool, optional): If True, modifies the DataFrame in place. Default is False.

# Sample DataFrame in which rows 1 and 2 are identical
duplicate_rows = {'A': [1, 2, 2, 3], 'B': ['x', 'y', 'y', 'z']}
df_duplicates = pd.DataFrame(duplicate_rows)

# Keep only the first occurrence of each repeated row
df_no_duplicates = df_duplicates.drop_duplicates(keep='first')
print(df_no_duplicates)
# Output: DataFrame with duplicate rows removed

In [None]:
# 12. Renaming Columns (`.rename()`)
# Parameters:
# mapper (dict or function): Mapping or function that produces the new labels.
# axis (int or str, optional): Axis along which to rename (0 for rows, 1 for columns).
# inplace (bool, optional): If True, modifies the DataFrame in place. Default is False.

# Map each old column label to its new label, then rename
column_names = {'A': 'Alpha', 'B': 'Beta'}
df_renamed = df_duplicates.rename(columns=column_names)
print(df_renamed)
# Output: DataFrame with columns renamed.
#    Alpha Beta
# 0      1    x
# 1      2    y
# 2      2    y
# 3      3    z

In [None]:
# 13. Using a Lambda Function with `.apply()`
# Parameters:
# func (function): Function applied to each element, row, or column.
# axis (int, optional): Axis along which to apply the function (0 for columns, 1 for rows).

# Example: add a column holding twice the value found in column 'A'
df_duplicates['A_doubled'] = df_duplicates['A'].apply(lambda value: 2 * value)
print(df_duplicates)
# Output: DataFrame with a new column where each value in 'A' is doubled.
#    A  B  A_doubled
# 0  1  x          2
# 1  2  y          4
# 2  2  y          4
# 3  3  z          6

In [None]:
# 14. Replacing Values with Conditions (`.where()`)
# Parameters:
# cond (boolean array-like): Condition evaluated per element; values are kept where it is True.
# other (scalar or DataFrame): Replacement used where the condition is False.
# inplace (bool, optional): If True, modifies the DataFrame in place. Default is False.

# Keep values of 'A' that are at least 3; everything else becomes NaN
at_least_three = df_duplicates['A'] >= 3
df_where = df_duplicates['A'].where(at_least_three, other=None)
print(df_where)
# Output: Series with values less than 3 replaced with NaN.
# 0    NaN
# 1    NaN
# 2    NaN
# 3    3.0
# Name: A, dtype: float64

In [None]:
# 15. Converting Strings to Lowercase (`.str.lower()`)
# Parameters: None (operates on a Series of string data)

# Lowercase every string held in column 'B'
column_b = df_duplicates['B']
df_lower = column_b.str.lower()
print(df_lower)
# Output: Series with all strings in 'B' converted to lowercase.
# 0    x
# 1    y
# 2    y
# 3    z

In [None]:
# 16. Splitting a String Column (`.str.split()`)
# Parameters:
# pat (str, optional): String or regular expression to split on. Default is whitespace.
# expand (bool, optional): If True, return a DataFrame with one column per split element.

# Sample DataFrame whose entries join two names with a hyphen
df_split = pd.DataFrame({'Names': ['Alice-Bob', 'Charlie-Dan']})

# Break each entry at '-' and spread the pieces over separate columns
df_split_expanded = df_split['Names'].str.split(pat='-', expand=True)
print(df_split_expanded)
# Output: DataFrame with split elements in separate columns.
#         0       1
# 0   Alice     Bob
# 1 Charlie     Dan

In [None]:
# 17. Removing Leading and Trailing Whitespace (`.str.strip()`)
# Parameters: None (operates on a Series of string data)

# Sample DataFrame whose names carry stray spaces on either side
padded_names = [' Alice ', 'Bob ', ' Charlie']
df_whitespace = pd.DataFrame({'Name': padded_names})

# Replace the column with its whitespace-stripped version
df_whitespace = df_whitespace.assign(Name=df_whitespace['Name'].str.strip())
print(df_whitespace)
# Output: DataFrame with whitespace removed.
#       Name
# 0    Alice
# 1      Bob
# 2  Charlie

In [None]:
# 18. Changing a Column's Data Type to Category (`.astype('category')`)
# Parameters:
# dtype (str, dtype, or dict): Target type, or a {column: type} mapping (e.g., 'category').

# Sample DataFrame holding city names as plain strings
df_category = pd.DataFrame({'City': ['New York', 'Los Angeles', 'Chicago']})

# Cast the 'City' column to the categorical dtype via a per-column mapping
df_category = df_category.astype({'City': 'category'})
print(df_category.dtypes)
# Output: DataFrame with 'City' column as a categorical type.
# City    category
# dtype: object

In [None]:
# 19. Binning Numerical Data (`pd.cut()`)
# Parameters:
# x (array-like): Data to bin.
# bins (int, sequence of scalars, or IntervalIndex): Number of bins or specific bin edges.
# labels (array or bool, optional): Labels for the bins. If False, returns bin codes.

# Sample DataFrame with numeric data
df_numeric = pd.DataFrame({'Score': [15, 25, 35, 45, 55]})

# Bin 'Score' column into three equal-width bins.
# With bins=3 the range 15..55 is split at roughly 28.33 and 41.67, so the bins are
# about (15, 28.33], (28.33, 41.67] and (41.67, 55]; 45 therefore lands in the top bin.
df_binned = pd.cut(df_numeric['Score'], bins=3, labels=['Low', 'Medium', 'High'])
df_numeric['Category'] = df_binned
print(df_numeric)
# Output: DataFrame with 'Score' column binned into categories.
#    Score Category
# 0     15      Low
# 1     25      Low
# 2     35   Medium
# 3     45     High
# 4     55     High

In [None]:
# 20. Dropping Columns (`.drop()`)
# Parameters:
# labels (str or list): Index or column labels to remove.
# axis (int, optional): 0 for rows, 1 for columns. Default is 0.
# inplace (bool, optional): If True, modifies the DataFrame in place. Default is False.

# Remove the 'Category' column (axis=1 targets columns)
df_dropped_column = df_numeric.drop('Category', axis=1)
print(df_dropped_column)
# Output: DataFrame with 'Category' column removed.
#    Score
# 0     15
# 1     25
# 2     35
# 3     45
# 4     55

In [None]:
# 21. Filling Missing Values with Specific Values (`.fillna()`)
# Parameters:
# value (scalar, dict, Series, or DataFrame): Replacement(s) for NaN entries.
# method (str, optional): Fill method ('ffill' for forward, 'bfill' for backward).
# inplace (bool, optional): If True, modifies the DataFrame in place. Default is False.

# Sample DataFrame with one missing entry in each column
missing_cells = {'A': [1, None, 3], 'B': [4, 5, None]}
df_missing = pd.DataFrame(missing_cells)

# Substitute 0 for every missing entry
df_filled = df_missing.fillna(0)
print(df_filled)
# Output: DataFrame with missing values filled with 0.
#     A    B
# 0  1.0  4.0
# 1  0.0  5.0
# 2  3.0  0.0

In [None]:
# 22. Dropping Rows Based on Condition (`.drop()` with `.index[]`)
# Parameters:
# labels (str or list): Index or column labels to drop.
# axis (int, optional): 0 for rows, 1 for columns. Default is 0.
# inplace (bool, optional): If True, modifies the DataFrame in place. Default is False.

# Drop rows where 'A' column is less than 2.
# Only row 0 (A == 1.0) satisfies the condition; a NaN compares as False against 2,
# so the row whose 'A' is NaN is kept.
rows_to_drop = df_missing[df_missing['A'] < 2].index
df_dropped_rows = df_missing.drop(rows_to_drop)
print(df_dropped_rows)
# Output: DataFrame with rows dropped based on a condition.
#      A    B
# 1  NaN  5.0
# 2  3.0  NaN

In [None]:
# 23. Creating Bins with Labels (`pd.cut()`)
# Parameters:
# x (array-like): Data to bin.
# bins (int, sequence of scalars, or IntervalIndex): Number of bins or specific bin edges.
# labels (array or bool, optional): Labels for the bins. If False, returns bin codes.
# right (bool, optional): Whether each bin includes its right edge. Default is True.

# Sample DataFrame with scores from 10 to 90 in steps of 10
df_scores = pd.DataFrame({'Score': [10, 20, 30, 40, 50, 60, 70, 80, 90]})

# Explicit edges give the bins (0, 30], (30, 60] and (60, 100], one label per bin
score_edges = [0, 30, 60, 100]
score_labels = ['Low', 'Medium', 'High']
df_scores['Category'] = pd.cut(df_scores['Score'], bins=score_edges, labels=score_labels)
print(df_scores)
# Output: DataFrame with scores categorized into 'Low', 'Medium', 'High' based on bins.
#    Score Category
# 0     10     Low
# 1     20     Low
# 2     30     Low
# 3     40  Medium
# 4     50  Medium
# 5     60  Medium
# 6     70    High
# 7     80    High
# 8     90    High

In [None]:
# 24. Converting Numeric Bins into Bin Codes (`pd.cut()` with `labels=False`)
# Parameters:
# x (array-like): Data to bin.
# bins (int, sequence of scalars, or IntervalIndex): Number of bins or specific bin edges.
# labels (array or bool, optional): If False, returns bin codes instead of labels.

# Same edges as the labeled bins, but each score gets the integer position of its bin
code_edges = [0, 30, 60, 100]
df_scores['Category_Code'] = pd.cut(df_scores['Score'], bins=code_edges, labels=False)
print(df_scores)
# Output: DataFrame with 'Score' column categorized into integer codes (0, 1, 2).
#    Score Category  Category_Code
# 0     10     Low              0
# 1     20     Low              0
# 2     30     Low              0
# 3     40  Medium              1
# 4     50  Medium              1
# 5     60  Medium              1
# 6     70    High              2
# 7     80    High              2
# 8     90    High              2

In [None]:
# 25. Mapping Values to a New Scale (`.map()`)
# Parameters:
# arg (dict, Series, or function): Lookup table or function applied to each value.

# Translate each 'Category' label into its numerical score
category_mapping = dict(Low=1, Medium=2, High=3)
df_scores['Category_Score'] = df_scores['Category'].map(category_mapping)
print(df_scores)
# Output: DataFrame with 'Category' mapped to scores (Low = 1, Medium = 2, High = 3).
#    Score Category  Category_Code  Category_Score
# 0     10     Low              0               1
# 1     20     Low              0               1
# 2     30     Low              0               1
# 3     40  Medium              1               2
# 4     50  Medium              1               2
# 5     60  Medium              1               2
# 6     70    High              2               3
# 7     80    High              2               3
# 8     90    High              2               3

In [None]:
# 26. Checking for Missing Values (`.isnull()`)
# Parameters: None

# Build a boolean mask with the same shape as the DataFrame:
# True marks a missing entry, False marks a present one.
missing_values = df_missing.isnull()
print(missing_values)
# Output: DataFrame with boolean values indicating missing (True) or non-missing (False) values.
#        A      B
# 0  False  False
# 1   True  False
# 2  False   True

In [None]:
# 27. Counting Missing Values (`.isnull().sum()`)
# Parameters: None

# Summing the boolean mask column by column counts the True (missing) entries
missing_mask = df_missing.isnull()
missing_counts = missing_mask.sum()
print(missing_counts)
# Output: Series with count of missing values per column.
# A    1
# B    1
# dtype: int64

In [None]:
# 28. Renaming Index (`.rename_axis()`)
# Parameters:
# mapper (str, optional): New name for the axis.
# axis (int or str, optional): Axis to rename ('index' or 'columns'). Default is 'index'.

# Name the index 'Row ID', then turn it into a regular column so the name shows as a header
df_named_index = df_missing.rename_axis('Row ID')
df_renamed_index = df_named_index.reset_index()
print(df_renamed_index)
# Output: DataFrame with the index renamed as 'Row ID'.
#    Row ID    A    B
# 0       0  1.0  4.0
# 1       1  NaN  5.0
# 2       2  3.0  NaN

In [None]:
# 29. Assigning New Columns (`.assign()`)
# Parameters:
# **kwargs: New column names and values to assign (e.g., column='value').

# Add 'Scaled_Score' computed from 'Score'; the callable receives the DataFrame being assigned to
df_assigned = df_scores.assign(Scaled_Score=lambda frame: frame['Score'] * 10)
print(df_assigned)
# Output: DataFrame with new column 'Scaled_Score' added as 10x 'Score' values.
#    Score Category  Category_Code  Category_Score  Scaled_Score
# 0     10     Low              0               1           100
# 1     20     Low              0               1           200
# 2     30     Low              0               1           300
# 3     40  Medium              1               2           400
# 4     50  Medium              1               2           500
# 5     60  Medium              1               2           600
# 6     70    High              2               3           700
# 7     80    High              2               3           800
# 8     90    High              2               3           900

In [None]:
# 30. Dropping Rows with All Missing Values (`.dropna(how='all')`)
# Parameters:
# axis (int, optional): Axis along which to drop rows or columns (0 for rows, 1 for columns). Default is 0.
# how (str, optional): 'any' to drop if any NA values present, 'all' to drop only if all values are NA.
# inplace (bool, optional): If True, modifies the DataFrame in place. Default is False.

# Drop rows where all values are missing
df_all_na = pd.DataFrame({'A': [None, None, 3], 'B': [None, 5, None]})
df_dropped_all_na = df_all_na.dropna(how='all')
print(df_dropped_all_na)
# Output: DataFrame with rows dropped where all values are NaN.
#     A    B