In [2]:
import pandas as pd
import numpy as np

# Toy frame for the imputation demos: columns 'A' and 'B' contain NaN gaps,
# column 'C' is fully populated.
data = dict(
    A=[1, 2, np.nan, 4, 5],
    B=[5, np.nan, np.nan, 8, 10],
    C=[10, 20, 30, 40, 50],
)

df = pd.DataFrame(data)

In [None]:

# Columns of `df` that contain missing values and should be imputed.
cols_to_impute = ['A', 'B']

# Every strategy below builds a NEW frame from `df`, so `df` itself is never
# modified and the cell can be re-run safely.
#
# Note: `frame[col].fillna(..., inplace=True)` is chained in-place assignment.
# It emits a FutureWarning on pandas 2.x and silently does nothing once
# Copy-on-Write is enabled, so the non-mutating forms are used instead.

# Mean Imputation: passing a Series to fillna fills each column with the value
# stored under that column's label; columns absent from the Series are untouched.
df_mean_imputed = df.fillna(df[cols_to_impute].mean())

# Median Imputation
df_median_imputed = df.fillna(df[cols_to_impute].median())

# Forward Fill: propagate the last valid observation downwards.
# (`fillna(method='ffill')` is deprecated; `.ffill()` is the replacement.)
df_forward_filled = df.ffill()

# Backward Fill: propagate the next valid observation upwards.
df_backward_filled = df.bfill()

# Remove Records with Missing Values
df_dropped = df.dropna()

# Display the original and modified dataframes
print("Original DataFrame:\n", df)
print("\nDataFrame with Mean Imputation:\n", df_mean_imputed)
print("\nDataFrame with Median Imputation:\n", df_median_imputed)
print("\nDataFrame with Forward Fill:\n", df_forward_filled)
print("\nDataFrame with Backward Fill:\n", df_backward_filled)
print("\nDataFrame with Dropped Missing Values:\n", df_dropped)

Original DataFrame:
      A     B   C
0  1.0   5.0  10
1  2.0   NaN  20
2  NaN   NaN  30
3  4.0   8.0  40
4  5.0  10.0  50

DataFrame with Mean Imputation:
      A          B   C
0  1.0   5.000000  10
1  2.0   7.666667  20
2  3.0   7.666667  30
3  4.0   8.000000  40
4  5.0  10.000000  50

DataFrame with Median Imputation:
      A     B   C
0  1.0   5.0  10
1  2.0   8.0  20
2  3.0   8.0  30
3  4.0   8.0  40
4  5.0  10.0  50

DataFrame with Forward Fill:
      A     B   C
0  1.0   5.0  10
1  2.0   5.0  20
2  2.0   5.0  30
3  4.0   8.0  40
4  5.0  10.0  50

DataFrame with Backward Fill:
      A     B   C
0  1.0   5.0  10
1  2.0   8.0  20
2  4.0   8.0  30
3  4.0   8.0  40
4  5.0  10.0  50

DataFrame with Dropped Missing Values:
      A     B   C
0  1.0   5.0  10
3  4.0   8.0  40
4  5.0  10.0  50


In [None]:
from sklearn.impute import SimpleImputer


# Constant Value Imputation: every NaN is replaced by the sentinel -1.
imputer = SimpleImputer(strategy='constant', fill_value=-1)

# fit_transform returns a bare NumPy array, so the column labels AND the row
# index are re-attached explicitly; without `index=` the result would get a
# fresh 0..n-1 index and stop lining up with `df` if `df` were ever filtered.
df_constant_fill = pd.DataFrame(
    imputer.fit_transform(df),
    columns=df.columns,
    index=df.index,
)

print("\nDataFrame with Constant Value Imputation:\n", df_constant_fill)


DataFrame with Constant Value Imputation:
      A     B     C
0  1.0   5.0  10.0
1  2.0  -1.0  20.0
2 -1.0  -1.0  30.0
3  4.0   8.0  40.0
4  5.0  10.0  50.0


In [9]:
# Linear interpolation: each gap is filled from the surrounding valid values
# in the same column. interpolate() returns a new frame, leaving `df` as is.
df_linear_interpolated = df.interpolate(method='linear')

print("\nDataFrame with Linear Interpolation:\n", df_linear_interpolated)


DataFrame with Linear Interpolation:
      A     B   C
0  1.0   5.0  10
1  2.0   6.0  20
2  3.0   7.0  30
3  4.0   8.0  40
4  5.0  10.0  50


In [5]:
from sklearn.impute import KNNImputer

# KNN imputation: each missing entry is estimated from the 2 nearest rows.
imputer_knn = KNNImputer(n_neighbors=2)

# The imputer hands back a plain NumPy array; restore both the column labels
# and the original row index so the result stays aligned with `df`.
df_knn_imputed = pd.DataFrame(
    imputer_knn.fit_transform(df),
    columns=df.columns,
    index=df.index,
)

print("\nDataFrame with KNN Imputation:\n", df_knn_imputed)


DataFrame with KNN Imputation:
      A     B     C
0  1.0   5.0  10.0
1  2.0   6.5  20.0
2  3.0   6.5  30.0
3  4.0   8.0  40.0
4  5.0  10.0  50.0


In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import seaborn as sns

# Regression imputation: fit a linear model on passengers whose age is known,
# then use it to predict the age of passengers whose age is missing.

# Load dataset
titanic = sns.load_dataset('titanic')

# Handling categorical data: encode 'sex' numerically so it can be a predictor
titanic['sex'] = titanic['sex'].map({'male': 0, 'female': 1})

# Selecting features used to predict age
features = ['pclass', 'sex', 'sibsp', 'parch', 'fare']

# Rows where 'Age' is missing (these are the rows to impute)
age_is_missing = titanic['age'].isnull()
missing_age = titanic[age_is_missing]
missing_age_features = missing_age[features]

# Rows where 'Age' is not missing (these are the training/evaluation rows)
not_missing_age = titanic[~age_is_missing]

X = not_missing_age[features]
y = not_missing_age['age']

# Hold out 20% of the known ages to estimate how well the model predicts age;
# the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create linear regression model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Predict missing values.
# A linear model is unbounded, so for unusual feature combinations it can
# extrapolate to a negative age; clamp at 0 so imputed ages stay valid.
predicted_ages = model.predict(missing_age_features).clip(min=0)

# Fill in missing values (mask computed before the fill, so it matches the
# rows the predictions were made for)
titanic.loc[age_is_missing, 'age'] = predicted_ages


Mean Squared Error: 139.81319441825693


In [11]:
# Per-column NaN counts for the Titanic frame after 'age' has been imputed
remaining_missing = titanic.isnull().sum()
print("\nMissing values after imputation:\n", remaining_missing)


Missing values after imputation:
 survived         0
pclass           0
sex              0
age              0
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64


In [3]:
!pip install fancyimpute

Collecting fancyimpute
  Downloading fancyimpute-0.7.0.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting knnimpute>=0.1.0 (from fancyimpute)
  Downloading knnimpute-0.1.0.tar.gz (8.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nose (from fancyimpute)
  Downloading nose-1.3.7-py3-none-any.whl (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.7/154.7 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: fancyimpute, knnimpute
  Building wheel for fancyimpute (setup.py) ... [?25l[?25hdone
  Created wheel for fancyimpute: filename=fancyimpute-0.7.0-py3-none-any.whl size=29880 sha256=f89bb4c8b8722142cbe983cbdb11518d56b97b08d1bb13c79b4904a59ed91c03
  Stored in directory: /root/.cache/pip/wheels/7b/0c/d3/ee82d1fbdcc0858d96434af108608d01703505d453720c84ed
  Building wheel for knnimpute (setup.py) ... [?25l[?25hdone
  Created wheel for knnimpute: filename=knnimpute-0.1.0-py3-none-

In [7]:
from fancyimpute import IterativeImputer

# Iterative imputation with the imputer's default settings.
imputer_multiple = IterativeImputer()

# fit_transform yields a NumPy array; re-attach column labels and the original
# row index so the imputed frame stays aligned with `df`.
df_multiple_imputed = pd.DataFrame(
    imputer_multiple.fit_transform(df),
    columns=df.columns,
    index=df.index,
)

print("\nDataFrame with IterativeImputer:\n", df_multiple_imputed)


DataFrame with IterativeImputer:
      A          B     C
0  1.0   5.000000  10.0
1  2.0   6.099252  20.0
2  3.0   7.274813  30.0
3  4.0   8.000000  40.0
4  5.0  10.000000  50.0


Analyzing missing values in the Titanic dataset

In [8]:
import seaborn as sns
import pandas as pd

# Load the Titanic dataset (fresh copy, independent of any earlier edits)
titanic = sns.load_dataset('titanic')

# Display the count of missing values per column before imputation
print("Missing values before imputation:\n", titanic.isnull().sum())

# Mean Imputation for the 'Age' column.
# Assign the result back instead of `titanic['age'].fillna(..., inplace=True)`:
# the chained in-place form warns on pandas 2.x and is a silent no-op under
# Copy-on-Write.
titanic['age'] = titanic['age'].fillna(titanic['age'].mean())

# Mode Imputation for the 'deck' column (mode() returns a Series; take the
# first entry as the fill value)
mode_deck = titanic['deck'].mode()[0]
titanic['deck'] = titanic['deck'].fillna(mode_deck)

# Display the count of missing values per column after imputation
print("\nMissing values after imputation:\n", titanic.isnull().sum())


Missing values before imputation:
 survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

Missing values after imputation:
 survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       2
class          0
who            0
adult_male     0
deck           0
embark_town    2
alive          0
alone          0
dtype: int64
