## Imputation

## Numerical Data

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

# Small demo frame with exactly one missing value in each of Age, Score and Grade.
data = {
    'Name': ['AAA', 'BBB', 'CCC', 'DDD', 'EEE'],
    'Age': [45, 65, np.nan, 46, 23],
    'Score': [43, 55, 46, np.nan, 87],
    # Grade labels carry no surrounding whitespace: ' B' would be a different
    # category from 'B' for any frequency count or encoder applied later.
    'Grade': ['A', 'B', 'C', 'A', np.nan]
}

df = pd.DataFrame(data)
# Caption on its own line so it does not share a row with the column header.
print("Original Data")
print(df)

Original Data   Name   Age  Score Grade
0  AAA  45.0   43.0     A
1  BBB  65.0   55.0     B
2  CCC   NaN   46.0     C
3  DDD  46.0    NaN     A
4  EEE  23.0   87.0   NaN


In [None]:
# Number of missing entries in each column.
df.isna().sum()

Name     0
Age      1
Score    1
Grade    1
dtype: int64

In [None]:
# Replace missing grades with the most frequent grade in the column.
imputer = SimpleImputer(strategy='most_frequent')
grade_imputed = imputer.fit_transform(df[['Grade']])
df['Grade'] = grade_imputed
df

Unnamed: 0,Name,Age,Score,Grade
0,AAA,45.0,43.0,A
1,BBB,65.0,55.0,B
2,CCC,,46.0,C
3,DDD,46.0,,A
4,EEE,23.0,87.0,A


#### Imputation Using Mean, Median, Mode

In [None]:
# Mean-impute the two numeric columns and keep the result in a separate frame.
numeric_columns = ['Age', 'Score']
imputer = SimpleImputer(strategy='mean')
imputed_values = imputer.fit_transform(df[numeric_columns])
df_imputed = pd.DataFrame(imputed_values, columns=numeric_columns)
df_imputed

Unnamed: 0,Age,Score
0,45.0,43.0
1,65.0,55.0
2,44.75,46.0
3,46.0,57.75
4,23.0,87.0


In [None]:
# Copy the imputed numeric columns back into the working frame.
for column in ('Age', 'Score'):
    df[column] = df_imputed[column]
print("Imputed Data", df)

Imputed Data   Name    Age  Score Grade
0  AAA  45.00  43.00     A
1  BBB  65.00  55.00     B
2  CCC  44.75  46.00     C
3  DDD  46.00  57.75     A
4  EEE  23.00  87.00   NaN


#### Forward Fill/Backward Fill

In [None]:
import pandas as pd
import numpy as np

# Demo frame with one gap in each of Age, Score and Grade.
data = dict(
    Name=['AAA', 'BBB', 'CCC', 'DDD', 'EEE'],
    Age=[34, np.nan, 33, 25, 26],
    Score=[50, 60, 40, np.nan, 50],
    Grade=['A', 'B', np.nan, 'C', 'A'],
)

df = pd.DataFrame(data)
print(df)


  Name   Age  Score Grade
0  AAA  34.0   50.0     A
1  BBB   NaN   60.0     B
2  CCC  33.0   40.0   NaN
3  DDD  25.0    NaN     C
4  EEE  26.0   50.0     A


In [None]:
# Forward fill copies the last seen value downward; backward fill copies the
# next seen value upward. The dedicated ffill()/bfill() methods replace the
# deprecated fillna(method=...) spelling and give the same result.
df_ffill = df.ffill()
df_bfill = df.bfill()
print("Forward Filled")
print(df_ffill)

print('Backward Filled')
print(df_bfill)


Forward Filled
  Name   Age  Score Grade
0  AAA  34.0   50.0     A
1  BBB  34.0   60.0     B
2  CCC  33.0   40.0     B
3  DDD  25.0   40.0     C
4  EEE  26.0   50.0     A
Backward Filled
  Name   Age  Score Grade
0  AAA  34.0   50.0     A
1  BBB  33.0   60.0     B
2  CCC  33.0   40.0     C
3  DDD  25.0   50.0     C
4  EEE  26.0   50.0     A


### Interpolation

Interpolation in feature engineering refers to the process of estimating or predicting missing values within a dataset based on the values of neighboring data points.

#### Linear Interpolation

Time	Temperature

0	       20

1	       NaN

2	       25

3	       NaN

4	       30


Temperature at time 1 = (Temperature at time 0 + Temperature at time 2) / 2 = (20 + 25) / 2 = 22.5

Temperature at time 3 = (Temperature at time 2 + Temperature at time 4) / 2 = (25 + 30) / 2 = 27.5
                      
                      
#### Nearest Neighbor Interpolation

Region	      Rainfall (inches)

Region A	     2.5

Region B	     NaN

Region C	     NaN

Region D	     3.0

Region E	     NaN


We can use nearest neighbor interpolation to estimate the missing rainfall values. Nearest neighbor interpolation assigns the value of the nearest known neighbor to the missing point.

For Region B, the nearest known neighbor is Region A, so we could assign the rainfall for Region B to be 2.5 inches.

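For Region C, the known neighbors are Regions A and D. Treating the table order as adjacency, Region D is one step away and Region A is two, so nearest neighbor interpolation assigns Region C the rainfall of Region D: 3.0 inches. (Averaging the two, (2.5 + 3.0) / 2 = 2.75 inches, would be an averaging scheme rather than nearest neighbor interpolation.)

For Region E, the nearest known neighbor is Region D, so Region E is also assigned 3.0 inches.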


In [None]:
import pandas as pd
import numpy as np

# Same demo frame as before: one gap in each of Age, Score and Grade.
missing = np.nan
data = {
    'Name': ['AAA', 'BBB', 'CCC', 'DDD', 'EEE'],
    'Age': [34, missing, 33, 25, 26],
    'Score': [50, 60, 40, missing, 50],
    'Grade': ['A', 'B', missing, 'C', 'A'],
}
df = pd.DataFrame(data=data)
print(df)


  Name   Age  Score Grade
0  AAA  34.0   50.0     A
1  BBB   NaN   60.0     B
2  CCC  33.0   40.0   NaN
3  DDD  25.0    NaN     C
4  EEE  26.0   50.0     A


In [None]:
# Linear interpolation is only defined for numbers, so apply it to the numeric
# columns and leave the text columns (Name, Grade) as they are. Calling
# interpolate() on the whole frame would also hand it the object-dtype
# columns, which pandas has deprecated.
numeric_columns = ['Age', 'Score']
df_interpolated = df.copy()
df_interpolated[numeric_columns] = df[numeric_columns].interpolate()
print("After Interpolated")
print(df_interpolated)

After Interpolated
  Name   Age  Score Grade
0  AAA  34.0   50.0     A
1  BBB  33.5   60.0     B
2  CCC  33.0   40.0   NaN
3  DDD  25.0   45.0     C
4  EEE  26.0   50.0     A


#### K-nearest neighbors imputation

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer

data = {
    'Name' : ['AAA', 'BBB', 'CCC', 'DDD', 'EEE'],
    'Age' : [34,np.nan,33,25,26],
    'Score': [50,60,40,np.nan,50],
    'Grade': ['A', 'B', np.nan, 'C', 'A']
}
df = pd.DataFrame(data)
print(df)

# KNN imputation finds neighbours using the features that ARE present in a row.
# Fitting it on one column at a time leaves nothing to measure distance with,
# so the imputer falls back to the column mean. Fit on both numeric columns
# together so a missing Age is filled from the rows closest in Score, and a
# missing Score from the rows closest in Age.
numeric_columns = ['Age', 'Score']
knn_imputer = KNNImputer(n_neighbors = 2)
df[numeric_columns] = knn_imputer.fit_transform(df[numeric_columns])

print("After KNN Imputation")
print(df)


  Name   Age  Score Grade
0  AAA  34.0   50.0     A
1  BBB   NaN   60.0     B
2  CCC  33.0   40.0   NaN
3  DDD  25.0    NaN     C
4  EEE  26.0   50.0     A
After KNN Imputation
  Name   Age  Score Grade
0  AAA  34.0   50.0     A
1  BBB  29.5   60.0     B
2  CCC  33.0   40.0   NaN
3  DDD  25.0   50.0     C
4  EEE  26.0   50.0     A


## Categorical Data

### One-hot encoding

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
data = {

    'Fruit' : ['Apple', 'Banana', 'Orange', 'Apple', 'Banana'],
    'Color' : ['Red', 'Yellow', 'Orange', 'Green', 'Yellow']
}
df = pd.DataFrame(data)
print(df)

categorical_columns = ['Fruit', 'Color']

# The encoder is left at its default sparse output and densified with
# .toarray() below. The keyword that requests dense output directly was renamed
# from `sparse` to `sparse_output` between scikit-learn releases, so avoiding
# it keeps this cell working on either side of the rename.
# Pass drop='first' to drop one indicator column per feature.
one_hot_encoder = OneHotEncoder()

one_hot_encoded = one_hot_encoder.fit_transform(df[categorical_columns]).toarray()
print(one_hot_encoded)


# Column labels of the form <feature>_<category>, in the same order as the matrix columns.
feature_names = one_hot_encoder.get_feature_names_out(categorical_columns)
print("\n\n Featture Names : \n", feature_names)
df_one_hot = pd.DataFrame(one_hot_encoded, columns = feature_names)
print(df_one_hot)

# Swap the original categorical columns for their indicator columns.
df_encoded = pd.concat([df.drop(categorical_columns, axis=1),df_one_hot],axis=1)

print(df_encoded)



    Fruit   Color
0   Apple     Red
1  Banana  Yellow
2  Orange  Orange
3   Apple   Green
4  Banana  Yellow
[[1. 0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 1. 0. 0.]
 [1. 0. 0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 1.]]


 Featture Names : 
 ['Fruit_Apple' 'Fruit_Banana' 'Fruit_Orange' 'Color_Green' 'Color_Orange'
 'Color_Red' 'Color_Yellow']
   Fruit_Apple  Fruit_Banana  Fruit_Orange  Color_Green  Color_Orange  \
0          1.0           0.0           0.0          0.0           0.0   
1          0.0           1.0           0.0          0.0           0.0   
2          0.0           0.0           1.0          0.0           1.0   
3          1.0           0.0           0.0          1.0           0.0   
4          0.0           1.0           0.0          0.0           0.0   

   Color_Red  Color_Yellow  
0        1.0           0.0  
1        0.0           1.0  
2        0.0           0.0  
3        0.0           0.0  
4        0.0           1.0  
   Fruit_Apple  Fruit_Banana  Fru

### Label encoding

In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

data = {

    'Fruit' : ['Apple', 'Banana', 'Orange', 'Apple', 'Banana'],
    'Color' : ['Red', 'Yellow', 'Orange', 'Green', 'Yellow']
}
df = pd.DataFrame(data)
print(df)

# Each category is replaced by an integer code. The same encoder object is
# refit for every column, so after this cell it only remembers the classes of
# the last column it was fitted on (Color).
label_encoder = LabelEncoder()
df_label_encoded = df.copy()
df_label_encoded['Fruit'] = label_encoder.fit_transform(df['Fruit'])
df_label_encoded['Color'] = label_encoder.fit_transform(df['Color'])

print("Label Encoding")
# Show the frame built in this cell, not the one-hot frame left over from an earlier cell.
print(df_label_encoded)

    Fruit   Color
0   Apple     Red
1  Banana  Yellow
2  Orange  Orange
3   Apple   Green
4  Banana  Yellow
Label Encoding
   Fruit_Banana  Fruit_Orange  Color_Orange  Color_Red  Color_Yellow
0           0.0           0.0           0.0        1.0           0.0
1           1.0           0.0           0.0        0.0           1.0
2           0.0           1.0           1.0        0.0           0.0
3           0.0           0.0           0.0        0.0           0.0
4           1.0           0.0           0.0        0.0           1.0


## Feature scaling

#### Normalization

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

data = {
    'Age': [34, 43, 33, 25, 26],
    'Score': [50, 60, 40, 45, 50],
}
df = pd.DataFrame(data)
print(df)

# Min-max scale each column independently, then wrap the resulting array
# back into a frame with the original column labels.
min_max_scaler = MinMaxScaler()
df_normalized = pd.DataFrame(
    min_max_scaler.fit_transform(df),
    columns=df.columns,
)

print("Min Max Scaling. (Normalization)")
print(df_normalized)

   Age  Score
0   34     50
1   43     60
2   33     40
3   25     45
4   26     50
Min Max Scaling. (Normalization)
        Age  Score
0  0.500000   0.50
1  1.000000   1.00
2  0.444444   0.00
3  0.000000   0.25
4  0.055556   0.50


#### Standardization

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

data = dict(
    Age=[34, 43, 33, 25, 26],
    Score=[50, 60, 40, 45, 50],
)
df = pd.DataFrame(data)
print(df)

# Standardize each column, then restore the column labels on the scaled array.
standard_scaler = StandardScaler()
scaled_values = standard_scaler.fit_transform(df)
df_standardized = pd.DataFrame(scaled_values, columns=df.columns)

print("After Standardization ")
print(df_standardized)

   Age  Score
0   34     50
1   43     60
2   33     40
3   25     45
4   26     50
After Standardization 
        Age     Score
0  0.277218  0.150756
1  1.663311  1.658312
2  0.123208 -1.356801
3 -1.108874 -0.603023
4 -0.954864  0.150756


## Feature selection

Feature Selection is a feature engineering technique that selects only dominating or relevant features in a dataset. It uses algorithms to determine which features have the most impact or relation to the target variable. When a model is trained only with the relevant features selected, it can improve the machine learning model’s accuracy.

Feature Selection Techniques:
- Univariate Feature selection
- L1 Regularization (Lasso)

### Univariate feature selection:
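Univariate feature selection scores each feature on its own against the target variable using a univariate statistical test, and keeps only the best-scoring features. (Removing features whose variance falls below a threshold is a different technique, variance thresholding.)

Here, `SelectKBest` keeps the top k features ranked by their relevance to the target variable, using the `f_regression` score function.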

In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import Lasso

In [None]:
# Generating sample dataset with synthetic features and target
# Synthetic regression problem: 100 samples, 10 features, fixed seed.
feature_labels = [f'Feature_{i}' for i in range(1, 11)]
X, y = make_regression(n_samples=100, n_features=10, random_state=42)

# Features first, with the target appended as the last column.
df = pd.DataFrame(X, columns=feature_labels).assign(Target=y)

df.head()

Unnamed: 0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Feature_10,Target
0,0.150394,0.950424,-0.759133,-2.123896,-0.576904,-0.599393,-0.525755,-0.839722,0.341756,1.876171,-60.486955
1,1.088951,-0.715304,-0.471932,0.68626,0.679598,-1.867265,-1.612716,2.314659,0.06428,-1.077745,-0.091994
2,-0.714351,-1.191303,0.293072,-0.680025,0.656554,0.346448,0.232254,0.250493,1.865775,0.473833,-76.947132
3,0.404051,0.25755,-0.161286,-0.342715,-0.074446,-0.420645,-0.802277,-1.415371,1.886186,0.174578,-38.24517
4,0.60601,-2.081929,-1.556629,-0.704344,1.696456,1.049009,-1.408461,-0.522723,-1.280429,1.754794,-1.831243


In [None]:
# Keep the k_best features with the highest f_regression score against y.
k_best = 5
selector = SelectKBest(score_func = f_regression, k = k_best)
X_univariate = selector.fit_transform(X,y)

# Preview only: printing the whole reduced matrix floods the cell output.
print(X_univariate[:5])
print("\n\nShape:", X_univariate.shape)

# Every column of df except the last (Target) is a candidate feature; the
# selector's boolean support mask marks the ones that were kept.
feature_columns = df.columns[:-1]
support_mask = selector.get_support()
selected_features = feature_columns[support_mask]
print("\n\n",feature_columns)
print("\n\n",support_mask)

print("\n\nSelected Features \n",selected_features)

[[ 0.15039379  0.95042384 -2.12389572 -0.57690366 -0.52575502]
 [ 1.0889506  -0.71530371  0.68626019  0.67959775 -1.61271587]
 [-0.71435142 -1.1913035  -0.68002472  0.65655361  0.2322537 ]
 [ 0.40405086  0.25755039 -0.34271452 -0.07444592 -0.80227727]
 [ 0.60600995 -2.08192941 -0.70434369  1.69645637 -1.4084613 ]
 [ 0.41278093 -0.24538812  1.15859558 -0.75373616 -0.82068232]
 [ 0.75539123  0.09933231 -0.23894805  0.75138712 -0.90756366]
 [-0.50175704 -0.5297602   1.47789404  0.51326743 -0.51827022]
 [-0.71984421  0.34361829 -0.11564828 -1.76304016 -0.3011037 ]
 [-0.46917565 -0.11453985 -1.32023321  1.23781631  1.83145877]
 [-0.51121568  0.05572491  1.8820245   1.09419152  1.34542005]
 [ 1.11729583  0.56976728 -1.25153942  0.44770856  1.4437646 ]
 [-0.79287283 -0.55364931 -0.98572605 -1.19787789  0.50404652]
 [-0.76734756  2.18980293  1.45114361 -0.80829829  0.95927083]
 [ 1.45338448 -0.42018682  2.29889812 -0.28178461 -0.36283856]
 [-0.74848654  1.17929718 -1.53411417  0.06751848  1.27

### L1 regularization (Lasso):
Lasso regression shrinks the model's coefficients and can drive the coefficients of less useful features all the way to zero, which effectively removes those features from the model.

**Statistics and Regression Analysis:**
In statistics, especially in regression analysis, a coefficient is a number that represents the relationship between an independent variable and a dependent variable.

For example, in a linear regression equation

$$ y = \beta_0 + \beta_1 x $$

$\beta_1$ is the coefficient that quantifies the effect of the independent variable $x$ on the dependent variable $y$.

The inclusion of the L1 term has a unique effect on the regression solution. It tends to produce sparse models, meaning that some of the coefficient estimates can be exactly zero. This is because the L1 penalty forces some coefficient estimates to shrink towards zero, effectively selecting a simpler model that uses fewer predictors. This property of Lasso regression makes it particularly useful for feature selection in models with a large number of predictors.

In [None]:
# First five rows of the frame.
df.head(n=5)

Unnamed: 0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Feature_10,Target
0,0.150394,0.950424,-0.759133,-2.123896,-0.576904,-0.599393,-0.525755,-0.839722,0.341756,1.876171,-60.486955
1,1.088951,-0.715304,-0.471932,0.68626,0.679598,-1.867265,-1.612716,2.314659,0.06428,-1.077745,-0.091994
2,-0.714351,-1.191303,0.293072,-0.680025,0.656554,0.346448,0.232254,0.250493,1.865775,0.473833,-76.947132
3,0.404051,0.25755,-0.161286,-0.342715,-0.074446,-0.420645,-0.802277,-1.415371,1.886186,0.174578,-38.24517
4,0.60601,-2.081929,-1.556629,-0.704344,1.696456,1.049009,-1.408461,-0.522723,-1.280429,1.754794,-1.831243


In [None]:
k_best = 5

# Fit the L1-penalised model; only coefficient magnitudes matter for ranking.
lasso = Lasso(alpha=0.1).fit(X, y)
coefficients = np.abs(lasso.coef_)
print(coefficients)

# Order feature positions from largest to smallest magnitude and keep the
# first k_best names (the last column of df is the target, so it is excluded).
descending_order = np.argsort(coefficients)[::-1]
ranked_features = df.columns[:-1][descending_order]
selected_features_Lasso = ranked_features[:k_best]

[93.47376799 70.77628722  3.08651943 63.57576455 86.96518973 10.26537823
 70.60448372 16.63432809  5.03492642 54.04520405]


In [None]:
# Candidate feature names, then coefficient positions sorted by magnitude:
# ascending first, descending second.
ascending_order = np.argsort(coefficients)
print(df.columns[:-1])
print([ascending_order])
print([ascending_order[::-1]])

Index(['Feature_1', 'Feature_2', 'Feature_3', 'Feature_4', 'Feature_5',
       'Feature_6', 'Feature_7', 'Feature_8', 'Feature_9', 'Feature_10'],
      dtype='object')
[array([2, 8, 5, 7, 9, 3, 6, 1, 4, 0])]
[array([0, 4, 1, 6, 3, 9, 7, 5, 8, 2])]


In [None]:
# Names of the k_best features with the largest absolute Lasso coefficients.
print(selected_features_Lasso)

Index(['Feature_1', 'Feature_5', 'Feature_2', 'Feature_7', 'Feature_4'], dtype='object')


In [None]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Iris measurements and their class labels.
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Hold out 30% of the samples for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit one linear-kernel SVM per C value and report its accuracy on the test set.
C_values = [0.1, 1, 10, 100]
for C in C_values:
    svm_model = SVC(kernel='linear', C=C).fit(X_train, y_train)
    y_pred = svm_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy with C={C}: {accuracy}")


Accuracy with C=0.1: 1.0
Accuracy with C=1: 1.0
Accuracy with C=10: 0.9777777777777777
Accuracy with C=100: 1.0
