In [13]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer

In [14]:
data = {
    'feature': [1, 2, 3, 4, 5, None, 7, 8, 9, 10],
    'target': [2, 4, 6, None, 10, 12, 14, 16, 18, 20]
}
df = pd.DataFrame(data)

In [20]:
# a. Filling the Missing Values – Imputation
imputer = SimpleImputer(strategy='mean')  # Imputer to fill missing values with mean
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
print("DataFrame after filling missing values with mean:")
print(df_imputed)

DataFrame after filling missing values with mean:
     feature     target  target_imputed  target_predicted
0   1.000000   2.000000        2.000000               2.0
1   2.000000   4.000000        4.000000               4.0
2   3.000000   6.000000        6.000000               6.0
3   4.000000  11.333333       11.333333               8.0
4   5.000000  10.000000       10.000000              10.0
5   5.444444  12.000000       12.000000              12.0
6   7.000000  14.000000       14.000000              14.0
7   8.000000  16.000000       16.000000              16.0
8   9.000000  18.000000       18.000000              18.0
9  10.000000  20.000000       20.000000              20.0


In [21]:
# b. Imputation with an additional column
df['target_imputed'] = df['target'].fillna(df['target'].mean())
print("\nDataFrame with missing values imputed with mean:")
print(df)


DataFrame with missing values imputed with mean:
   feature  target  target_imputed  target_predicted
0      1.0     2.0        2.000000               2.0
1      2.0     4.0        4.000000               4.0
2      3.0     6.0        6.000000               6.0
3      4.0     NaN       11.333333               8.0
4      5.0    10.0       10.000000              10.0
5      NaN    12.0       12.000000              12.0
6      7.0    14.0       14.000000              14.0
7      8.0    16.0       16.000000              16.0
8      9.0    18.0       18.000000              18.0
9     10.0    20.0       20.000000              20.0


In [22]:
# c. Filling with a Regression Model
# Drop rows with missing values
df_clean = df.dropna()

In [23]:
# Create and fit the model
model = LinearRegression()
model.fit(df_clean[['feature']], df_clean['target'])

In [24]:
# Predict missing values
df['target_predicted'] = df['target'].copy()
df.loc[df['target'].isnull(), 'target_predicted'] = model.predict(df[df['target'].isnull()][['feature']])
print("\nDataFrame after filling missing values with a regression model:")
print(df)


DataFrame after filling missing values with a regression model:
   feature  target  target_imputed  target_predicted
0      1.0     2.0        2.000000               2.0
1      2.0     4.0        4.000000               4.0
2      3.0     6.0        6.000000               6.0
3      4.0     NaN       11.333333               8.0
4      5.0    10.0       10.000000              10.0
5      NaN    12.0       12.000000              12.0
6      7.0    14.0       14.000000              14.0
7      8.0    16.0       16.000000              16.0
8      9.0    18.0       18.000000              18.0
9     10.0    20.0       20.000000              20.0
