## 1. Dealing with missing data

reference:
https://www.kdnuggets.com/publications/sheets/Data_Cleaning_with_Python_Cheat_Sheet_Anello.pdf

In [3]:
import pandas as pd
import numpy as np

# Creating a sample DataFrame
data = {
    'municipal': [1, np.nan, 3, 4, 5],
    'city': [np.nan, 'A', 'B', np.nan, 'C'],
    'price': [100, np.nan, 150, 200, np.nan],
    'age': [5, 10, np.nan, np.nan, 8],
    'type_building': ['House', 'Apartment', np.nan, 'House', np.nan],
    'stock_price': [50, 55, np.nan, np.nan, 60]
}

df = pd.DataFrame(data)

# Displaying the original DataFrame
print("Original DataFrame:")
print(df)

Original DataFrame:
   municipal city  price   age type_building  stock_price
0        1.0  NaN  100.0   5.0         House         50.0
1        NaN    A    NaN  10.0     Apartment         55.0
2        3.0    B  150.0   NaN           NaN          NaN
3        4.0  NaN  200.0   NaN         House          NaN
4        5.0    C    NaN   8.0           NaN         60.0


In [6]:
# Check and display the number of missing values in each column
print(df.isnull().sum())


Series([], dtype: float64)


In [8]:
# Delete rows with all missing data
b = df.dropna(how='all', inplace=True)
print(b)

None


In [10]:
# Drop columns that have missing values
c = df.dropna(axis=1, how='any', inplace=True)
print(c)

None


In [11]:
# Drop specific columns that have missing values
d = df.dropna(subset=['municipal', 'city'], inplace=True)
print(d)

KeyError: ['municipal', 'city']

In [12]:
# Replace missing values in 'price' column with mean
e = df['price'].fillna(df['price'].mean(), inplace=True)
print(e)

KeyError: 'price'

In [None]:
# Replace missing values in 'age' column with median
df['age'].fillna(df['age'].median(), inplace=True)

# Replace missing values in 'type_building' column with mode
df['type_building'].fillna(df['type_building'].mode().iloc[0], inplace=True)

# Forward Fill - Fill missing values with values before them
df.fillna(method='ffill', inplace=True)

# Backward Fill - Fill missing values with values after them
df.fillna(method='bfill', inplace=True)

# Fill missing values using the interpolation method (polynomial order 2)
df['stock_price'] = df['stock_price'].interpolate(method='polynomial', order=2)