# Program 1: 
1. Handle mising features
2. Data normallization
3. Data scaling
4. Feature filtering
5. Feature selection

## Handle Missing Features

### missingno is used to visualise missing data

In [2]:
import pandas as pd
import numpy as np
import sklearn.preprocessing as pp
import sklearn.impute as imp

In [3]:
students = pd.read_csv("data/student_details.csv")

### Using Pandas to handle missing data

In [4]:
students.replace(r'^\s*$', np.nan, regex=True)
students.info()
fname = students['Faculty Name'].mode()[0]
students['Faculty Name'] = fname
students['Faculty Name']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 17 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   ID                               41 non-null     int64  
 1   Unnamed: 1                       0 non-null      float64
 2   Unnamed: 2                       0 non-null      float64
 3   Unnamed: 3                       0 non-null      float64
 4   Name2                            41 non-null     object 
 5   Roll number                      41 non-null     object 
 6   Semester                         41 non-null     object 
 7   Gender                           41 non-null     object 
 8   School                           41 non-null     object 
 9   Faculty ID                       6 non-null      object 
 10  Faculty Name                     39 non-null     object 
 11  Home Department                  41 non-null     object 
 12  Student Contact Number  

0     Shubham Kumar
1     Shubham Kumar
2     Shubham Kumar
3     Shubham Kumar
4     Shubham Kumar
5     Shubham Kumar
6     Shubham Kumar
7     Shubham Kumar
8     Shubham Kumar
9     Shubham Kumar
10    Shubham Kumar
11    Shubham Kumar
12    Shubham Kumar
13    Shubham Kumar
14    Shubham Kumar
15    Shubham Kumar
16    Shubham Kumar
17    Shubham Kumar
18    Shubham Kumar
19    Shubham Kumar
20    Shubham Kumar
21    Shubham Kumar
22    Shubham Kumar
23    Shubham Kumar
24    Shubham Kumar
25    Shubham Kumar
26    Shubham Kumar
27    Shubham Kumar
28    Shubham Kumar
29    Shubham Kumar
30    Shubham Kumar
31    Shubham Kumar
32    Shubham Kumar
33    Shubham Kumar
34    Shubham Kumar
35    Shubham Kumar
36    Shubham Kumar
37    Shubham Kumar
38    Shubham Kumar
39    Shubham Kumar
40    Shubham Kumar
Name: Faculty Name, dtype: object

### Using ScikitLearn to impute missing data

In [5]:
imputer = imp.SimpleImputer(strategy='most_frequent')
df = pd.DataFrame([["a", "x"],
                   [np.nan, "y"],
                   ["a", np.nan],
                   ["b", "y"]], dtype="category")

print(imputer.fit_transform(df))

[['a' 'x']
 ['a' 'y']
 ['a' 'y']
 ['b' 'y']]


## Data Normalization
- Adjusts values to fit within a specific range, typically between 0 and 1.
- Sensitive to outliers
1. Z-score (standardization)
2. Minmax

In [9]:
### Z-score normalisation

#### Vanilla - using numpy

data = np.arange(10).reshape(-1, 1) # shape into a column vectore
mean = np.mean(data)
sd = np.std(data)

norm_data = (data - mean)/sd
print(norm_data)

#### Using sklearn

zscore_scaler = pp.StandardScaler()
zscore_normalized = zscore_scaler.fit_transform(data)
print("Z-score Normalization:")
print(zscore_normalized.flatten())

[[-1.5666989 ]
 [-1.21854359]
 [-0.87038828]
 [-0.52223297]
 [-0.17407766]
 [ 0.17407766]
 [ 0.52223297]
 [ 0.87038828]
 [ 1.21854359]
 [ 1.5666989 ]]
Z-score Normalization:
[-1.5666989  -1.21854359 -0.87038828 -0.52223297 -0.17407766  0.17407766
  0.52223297  0.87038828  1.21854359  1.5666989 ]


In [10]:
### Minmax normalization
#### Using sklearn

minmax_scaler = pp.MinMaxScaler()
minmax_normalized = minmax_scaler.fit_transform(data)
print("\nMin-Max Normalization:")
print(minmax_normalized.flatten())


Min-Max Normalization:
[0.         0.11111111 0.22222222 0.33333333 0.44444444 0.55555556
 0.66666667 0.77777778 0.88888889 1.        ]


## Data Scaling
- Adjusts values to have a mean of 0 and a standard deviation of 1, without necessarily constraining them to a specific range.

In [11]:
### Robust Scaling (scaling based on median and IQR)
robust_scaler = pp.RobustScaler()
robust_scaled = robust_scaler.fit_transform(data)

### Normalization (scaling to unit norm)
normalizer = pp.Normalizer()
normalized_data = normalizer.fit_transform(data)

print("\nRobust Scaling:")
print(robust_scaled.flatten())
print("\nNormalization (Unit Norm):")
print(normalized_data.flatten())


Robust Scaling:
[-1.         -0.77777778 -0.55555556 -0.33333333 -0.11111111  0.11111111
  0.33333333  0.55555556  0.77777778  1.        ]

Normalization (Unit Norm):
[0. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
