# Program 1: 
1. Handle missing features
2. Data normalization
3. Data scaling
4. Feature filtering
5. Feature selection

## Handle Missing Features

### missingno is used to visualise missing data

In [None]:
import missingno as msno
import pandas as pd
import numpy as np
import sklearn.preprocessing as pp
import sklearn.impute as imp

In [None]:
# Load the student records; the path is relative to the notebook's working directory.
students = pd.read_csv("data/stud_details.csv")

# Visualise the missingness pattern on a small random sample of rows.
# - The sample size is capped at the row count so a file with fewer than
#   30 rows does not raise ValueError.
# - random_state is fixed so Restart & Run All reproduces the same plot.
students_preview = students.sample(n=min(30, len(students)), random_state=42)
msno.matrix(students_preview)
# Alternative views of the same sample:
# msno.dendrogram(students_preview)
# msno.bar(students_preview)
# msno.heatmap(students_preview)

### Using Pandas to handle missing data

In [None]:
# Treat empty / whitespace-only strings as missing values.
# DataFrame.replace returns a new frame, so the result has to be assigned
# back; otherwise the blanks are never converted to NaN.
students = students.replace(r'^\s*$', np.nan, regex=True)
students.info()

# Impute only the *missing* 'Faculty Name' entries with the most frequent
# value (the mode); names that are already present are left untouched.
faculty_modes = students['Faculty Name'].mode()
if not faculty_modes.empty:  # mode() is empty when the whole column is NaN
    fname = faculty_modes[0]
    students['Faculty Name'] = students['Faculty Name'].fillna(fname)
students.iloc[:, :3]

### Using scikit-learn to impute missing data

In [None]:
# Toy categorical frame: each of the two columns has exactly one missing entry.
rows = [
    ["a", "x"],
    [np.nan, "y"],
    ["a", np.nan],
    ["b", "y"],
]
df = pd.DataFrame(rows, dtype="category")

# Impute with the 'most_frequent' strategy and show the resulting array.
imputer = imp.SimpleImputer(strategy='most_frequent')
imputed = imputer.fit_transform(df)
print(imputed)

## Data Normalization
- Rescales values onto a common scale: min-max maps them into a fixed range (typically 0 to 1), while z-score centres them at mean 0 with standard deviation 1.
- Sensitive to outliers
1. Z-score (standardization)
2. Minmax

In [None]:
### Z-score normalisation

#### Vanilla - using numpy

# np.array builds an array from the listed values. np.ndarray(...) would
# instead interpret the list as a *shape* and allocate a huge, uninitialised
# 9-dimensional array.
data = np.array([1, 2, 4, 5, 6, 8, 54, 99, 23], dtype=float).reshape(-1, 1)  # shape into a column vector
mean = np.mean(data)
sd = np.std(data)  # np.std defaults to the population standard deviation (ddof=0)

# z = (x - mean) / sd
norm_data = (data - mean) / sd
print(norm_data)

#### Using sklearn

zscore_scaler = pp.StandardScaler()
zscore_normalized = zscore_scaler.fit_transform(data)
print("Z-score Normalization:")
print(zscore_normalized.flatten())

In [None]:
### Minmax normalization
#### Using sklearn

# Fit the scaler on `data`, then rescale that same data.
minmax_scaler = pp.MinMaxScaler()
minmax_normalized = minmax_scaler.fit(data).transform(data)

# Print the heading and the flattened result on separate lines.
print("\nMin-Max Normalization:", minmax_normalized.flatten(), sep="\n")

## Data Scaling
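- Adjusts the spread of values without necessarily constraining them to a specific range, e.g. centring on the median and dividing by the IQR (robust scaling), or rescaling a vector to unit norm. (Standardization to mean 0 and standard deviation 1 is the z-score method shown above.)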

In [None]:
### Robust Scaling (scaling based on median and IQR)
robust_scaler = pp.RobustScaler()
robust_scaled = robust_scaler.fit_transform(data)

### Normalization (scaling to unit norm)
# Normalizer rescales each *row* (sample) to unit norm. `data` is a column
# vector, so each row holds a single value and would collapse to +/-1.0.
# Transpose so the values form one row, normalise that row, then transpose
# back so the result keeps the same column shape as `data`.
normalizer = pp.Normalizer()
normalized_data = normalizer.fit_transform(data.T).T

print("\nRobust Scaling:")
print(robust_scaled.flatten())
print("\nNormalization (Unit Norm):")
print(normalized_data.flatten())