In [2]:
import pandas as pd

# Demo data: three numeric columns, each containing at least one gap (None)
data = dict(
    A=[1, 2, None, 4, 5],
    B=[None, 2, 3, 4, None],
    C=[1, None, None, 4, 5],
)

df = pd.DataFrame(data)

# Boolean mask with the same shape as df: True wherever a value is missing
missing_values = df.isna()

# Count the True entries column by column to get the per-column number of gaps
missing_summary = missing_values.sum()

# Report the per-column counts
print("Missing values in each column:", missing_summary, sep="\n")

Missing values in each column:
A    1
B    2
C    2
dtype: int64


In [3]:
# Question: Dropping Rows with Missing Values
# Description: Practice the deletion method by removing rows with any missing values from a dataset.

# Steps to follow:
# 1. Use dropna() method: Use the dropna() method to remove rows with missing values.

import pandas as pd

# Demo data with gaps scattered over all three columns
data = dict(
    A=[1, 2, None, 4, 5],
    B=[None, 2, 3, 4, None],
    C=[1, None, None, 4, 5],
)

df = pd.DataFrame(data)

# Show the frame before any cleaning
print("Original DataFrame:", df, sep="\n")

# Keep only the rows in which every column holds a value;
# dropna returns a new frame, so df itself is left untouched
df_cleaned = df.dropna(axis='index', how='any')

# Show what survived the row-wise deletion
print("\nDataFrame after dropping rows with missing values:", df_cleaned, sep="\n")

Original DataFrame:
     A    B    C
0  1.0  NaN  1.0
1  2.0  2.0  NaN
2  NaN  3.0  NaN
3  4.0  4.0  4.0
4  5.0  NaN  5.0

DataFrame after dropping rows with missing values:
     A    B    C
3  4.0  4.0  4.0


In [4]:
# Question: Dropping Columns with Missing Values
# Description: Practice deleting entire columns that contain missing values.

# Steps to follow:
# 1. Use dropna() with axis parameter: Set axis=1 in dropna() to remove columns with missing values.

import pandas as pd

# Demo data: note that every column contains at least one missing value
data = dict(
    A=[1, 2, None, 4, 5],
    B=[None, 2, 3, 4, None],
    C=[1, None, None, 4, 5],
)

df = pd.DataFrame(data)

# Show the frame before any cleaning
print("Original DataFrame:", df, sep="\n")

# Remove every column that has one or more gaps. Because all three columns
# contain a gap here, the result keeps the row index but has no columns left.
df_cleaned = df.dropna(axis='columns', how='any')

# Show what survived the column-wise deletion
print("\nDataFrame after dropping columns with missing values:", df_cleaned, sep="\n")

Original DataFrame:
     A    B    C
0  1.0  NaN  1.0
1  2.0  2.0  NaN
2  NaN  3.0  NaN
3  4.0  4.0  4.0
4  5.0  NaN  5.0

DataFrame after dropping columns with missing values:
Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4]


In [5]:
# Question: Mean Imputation for Numerical Data
# Description: Fill missing values in a numerical column with the mean of that column.

# Steps to follow:
# 1. Calculate mean and fill NA: Use mean() to calculate and fillna() to fill the missing values.

import pandas as pd

# Demo data: 'A' and 'B' have gaps, 'C' is complete
data = dict(
    A=[1, 2, None, 4, 5],
    B=[None, 2, 3, 4, None],
    C=[1, 2, 3, 4, 5],
)

df = pd.DataFrame(data)

# Show the frame before imputation
print("Original DataFrame:", df, sep="\n")

# Column 'A': mean() skips missing entries, so the average is taken over the
# observed values only; that average then replaces each gap
column_a = df['A']
mean_A = column_a.mean()
df['A'] = column_a.fillna(mean_A)

# Column 'B': same treatment (included to show the pattern on a second column)
column_b = df['B']
mean_B = column_b.mean()
df['B'] = column_b.fillna(mean_B)

# Show the frame once both columns have been filled
print("\nDataFrame after mean imputation:", df, sep="\n")


Original DataFrame:
     A    B  C
0  1.0  NaN  1
1  2.0  2.0  2
2  NaN  3.0  3
3  4.0  4.0  4
4  5.0  NaN  5

DataFrame after mean imputation:
     A    B  C
0  1.0  3.0  1
1  2.0  2.0  2
2  3.0  3.0  3
3  4.0  4.0  4
4  5.0  3.0  5


In [6]:
# Question: Mode Imputation for Categorical Data
# Description: Fill missing values in a categorical column with the mode of that column.

# Steps to follow:
# 1. Calculate mode and fill NA: Use mode() to find the most frequent value and fillna() to fill the missing values.

import pandas as pd

# Demo data: a single categorical column with two missing labels
data = dict(Category=['A', 'B', 'A', 'C', None, 'B', 'A', None, 'C', 'C'])

df = pd.DataFrame(data)

# Show the frame before imputation
print("Original DataFrame:", df, sep="\n")

# mode() yields a Series because several values can tie for most frequent;
# take its first entry as the single replacement label
category_col = df['Category']
mode_value = category_col.mode().iloc[0]

# Replace each missing label with that most frequent category
df['Category'] = category_col.fillna(mode_value)

# Show the frame after imputation
print("\nDataFrame after mode imputation:", df, sep="\n")

Original DataFrame:
  Category
0        A
1        B
2        A
3        C
4     None
5        B
6        A
7     None
8        C
9        C

DataFrame after mode imputation:
  Category
0        A
1        B
2        A
3        C
4        A
5        B
6        A
7        A
8        C
9        C


In [7]:
# Question: Median Imputation for Skewed Data
# Description: Handle missing values in columns with a skewed distribution using the median.

# Steps to follow:
# 1. Calculate median and fill NA: Use median() for skewed data and fillna() to handle missing values.

import pandas as pd

# Demo data: one outlier (100) skews the distribution; two entries are missing
data = dict(Value=[1, 2, 3, 100, 5, None, 7, 8, 9, None])

df = pd.DataFrame(data)

# Show the frame before imputation
print("Original DataFrame:", df, sep="\n")

# The median is computed over the observed values only
value_col = df['Value']
median_value = value_col.median()

# Replace each gap with the median
df['Value'] = value_col.fillna(median_value)

# Show the frame after imputation
print("\nDataFrame after median imputation:", df, sep="\n")

Original DataFrame:
   Value
0    1.0
1    2.0
2    3.0
3  100.0
4    5.0
5    NaN
6    7.0
7    8.0
8    9.0
9    NaN

DataFrame after median imputation:
   Value
0    1.0
1    2.0
2    3.0
3  100.0
4    5.0
5    6.0
6    7.0
7    8.0
8    9.0
9    6.0


In [8]:
# Question: KNN Imputation
# Description: Use K-Nearest Neighbors to impute missing values in a dataset.

# Steps to follow:
# 1. Install and import required libraries: Use pip install scikit-learn if not already installed (the package is imported as sklearn).
# 2. KNN Imputer: Use KNNImputer to fill in missing values.
import pandas as pd
from sklearn.impute import KNNImputer

# Create a sample DataFrame with missing values
data = {
    'A': [1, 2, None, 4, 5],
    'B': [None, 2, 3, 4, None],
    'C': [7, None, 9, 10, 11]
}

df = pd.DataFrame(data)

# Display the original DataFrame
print("Original DataFrame:")
print(df)

# Step 1: Initialize KNNImputer with a specific number of neighbors
knn_imputer = KNNImputer(n_neighbors=2)  # We choose 2 neighbors for this example

# Step 2: Apply KNN imputation to fill missing values.
# fit_transform() hands back a plain NumPy array, so the row and column labels
# have to be re-attached. Passing index=df.index keeps the imputed rows aligned
# with the original frame even when its index is not the default 0..n-1.
imputed_values = knn_imputer.fit_transform(df)
df_imputed = pd.DataFrame(imputed_values, columns=df.columns, index=df.index)

# Display the DataFrame after KNN imputation
print("\nDataFrame after KNN imputation:")
print(df_imputed)


Original DataFrame:
     A    B     C
0  1.0  NaN   7.0
1  2.0  2.0   NaN
2  NaN  3.0   9.0
3  4.0  4.0  10.0
4  5.0  NaN  11.0

DataFrame after KNN imputation:
     A    B     C
0  1.0  2.5   7.0
1  2.0  2.0   8.0
2  3.0  3.0   9.0
3  4.0  4.0  10.0
4  5.0  3.5  11.0


In [9]:
# Question: Detecting and Handling Missing Categorical Data
# Description: Detect missing categorical data and handle it by filling with the next frequent category.

# Steps to follow:
# 1. Identify missing values in categorical data: Use the isnull() method on categorical columns.
# 2. Impute with next frequent category: Use the mode() method to choose the next frequent category.

import pandas as pd

# Sample dataset with categorical data (two entries are missing)
data = {
    'Category': ['A', 'B', 'C', 'B', None, 'A', 'C', None, 'A', 'B']
}

# Create DataFrame
df = pd.DataFrame(data)

# Display original DataFrame
print("Original DataFrame:")
print(df)

# Step 1: Identify missing values (boolean Series, True where the label is missing)
missing_data = df['Category'].isnull()
print("\nMissing data in 'Category' column:")
print(missing_data)

# Step 2: Find the mode (most frequent value) and next frequent category.
# value_counts() ignores missing entries and lists categories from most to
# least frequent, so position 1 is the runner-up.
# NOTE: 'A' and 'B' both occur three times in this sample, so which of them
# lands in position 1 depends on how value_counts() orders ties.
mode_value = df['Category'].mode()[0]  # Most frequent category
category_counts = df['Category'].value_counts()
if len(category_counts) > 1:
    next_frequent_value = category_counts.index[1]  # Second most frequent category
else:
    # Only one distinct category is present: there is no runner-up, so fall
    # back to the mode instead of raising IndexError.
    next_frequent_value = mode_value

# Step 3: Impute missing values with the next frequent category.
# Assign the result back rather than calling fillna(..., inplace=True) on
# df['Category']: the in-place form acts on an intermediate object, triggers a
# chained-assignment warning, and leaves df unchanged under Copy-on-Write.
df['Category'] = df['Category'].fillna(next_frequent_value)

# Display DataFrame after imputation
print("\nDataFrame after imputing missing values with the next frequent category:")
print(df)

Original DataFrame:
  Category
0        A
1        B
2        C
3        B
4     None
5        A
6        C
7     None
8        A
9        B

Missing data in 'Category' column:
0    False
1    False
2    False
3    False
4     True
5    False
6    False
7     True
8    False
9    False
Name: Category, dtype: bool

DataFrame after imputing missing values with the next frequent category:
  Category
0        A
1        B
2        C
3        B
4        B
5        A
6        C
7        B
8        A
9        B


In [10]:
# Question: Predictive Modeling for Imputation
# Description: Use a predictive model to impute missing values for a particular feature using other features.

# Steps to follow:
# 1. Partition the data: Split the dataset into train and test based on the presence of missing values.
# 2. Train a model: Use a regression model to predict missing values.
# 3. Impute missing values with predictions.

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Demo data: two fully observed features and a target with two gaps
data = dict(
    Feature1=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    Feature2=[11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
    Target=[21, 22, None, 24, 25, None, 27, 28, 29, 30],
)

# Build the frame
df = pd.DataFrame(data)

# Show the frame before imputation
print("Original DataFrame with missing values in 'Target' column:", df, sep="\n")

# Step 1: Split on whether 'Target' is present. Rows with a known target are
# used for fitting; rows without one are the ones to be predicted.
target_missing = df['Target'].isna()
train_data = df[~target_missing]
test_data = df[target_missing]

# Step 2: Fit a linear regression on the rows with a known target
feature_columns = ['Feature1', 'Feature2']
X_train = train_data[feature_columns]  # predictors
y_train = train_data['Target']  # response

model = LinearRegression().fit(X_train, y_train)

# Step 3: Predict 'Target' for the rows where it is missing
X_test = test_data[feature_columns]
predicted_values = model.predict(X_test)

# Step 4: Write the predictions into the gaps; the mask selects the same rows,
# in the same order, as test_data
df.loc[target_missing, 'Target'] = predicted_values

# Show the frame after imputation
print("\nDataFrame after imputing missing 'Target' values:", df, sep="\n")


Original DataFrame with missing values in 'Target' column:
   Feature1  Feature2  Target
0         1        11    21.0
1         2        12    22.0
2         3        13     NaN
3         4        14    24.0
4         5        15    25.0
5         6        16     NaN
6         7        17    27.0
7         8        18    28.0
8         9        19    29.0
9        10        20    30.0

DataFrame after imputing missing 'Target' values:
   Feature1  Feature2  Target
0         1        11    21.0
1         2        12    22.0
2         3        13    23.0
3         4        14    24.0
4         5        15    25.0
5         6        16    26.0
6         7        17    27.0
7         8        18    28.0
8         9        19    29.0
9        10        20    30.0


In [11]:
# Question: Handling Time Series Data with Forward and Backward Fill
# Description: Impute missing values in a time series dataset using forward and backward fill methods.

# Steps to follow:
# 1. Sort the data: Ensure the dataset is sorted by dates.
# 2. Fill the gaps: Apply ffill() and bfill() for forward and backward fill.

import pandas as pd

# Sample time series data with missing values (ten consecutive days)
data = {
    'Date': pd.date_range(start='2023-01-01', periods=10, freq='D'),
    'Value': [10, None, 15, None, 20, None, None, 25, None, 30]
}

# Create DataFrame
df = pd.DataFrame(data)

# Display the original DataFrame
print("Original DataFrame with missing values:")
print(df)

# Step 1: Sort the data by 'Date' to ensure the time series order
# (forward/backward fill propagate along row order, so order matters)
df = df.sort_values(by='Date')

# Step 2: Forward fill (ffill) and Backward fill (bfill)
# Use the dedicated Series.ffill()/Series.bfill() methods:
# fillna(method=...) is deprecated and emits a FutureWarning.

# Forward fill: Fill missing values with the previous available value
df['Value_ffill'] = df['Value'].ffill()

# Backward fill: Fill missing values with the next available value
df['Value_bfill'] = df['Value'].bfill()

# Display DataFrame after imputation
print("\nDataFrame after forward and backward fill:")
print(df)


Original DataFrame with missing values:
        Date  Value
0 2023-01-01   10.0
1 2023-01-02    NaN
2 2023-01-03   15.0
3 2023-01-04    NaN
4 2023-01-05   20.0
5 2023-01-06    NaN
6 2023-01-07    NaN
7 2023-01-08   25.0
8 2023-01-09    NaN
9 2023-01-10   30.0

DataFrame after forward and backward fill:
        Date  Value  Value_ffill  Value_bfill
0 2023-01-01   10.0         10.0         10.0
1 2023-01-02    NaN         10.0         15.0
2 2023-01-03   15.0         15.0         15.0
3 2023-01-04    NaN         15.0         20.0
4 2023-01-05   20.0         20.0         20.0
5 2023-01-06    NaN         20.0         25.0
6 2023-01-07    NaN         20.0         25.0
7 2023-01-08   25.0         25.0         25.0
8 2023-01-09    NaN         25.0         30.0
9 2023-01-10   30.0         30.0         30.0


  df['Value_ffill'] = df['Value'].fillna(method='ffill')
  df['Value_bfill'] = df['Value'].fillna(method='bfill')
