Feature Construction

In [1]:
import pandas as pd
# Example: Creating Derived Features
# Build a small table of plot dimensions and prices, then derive an `area`
# column as the row-wise product of length and breadth.
data = dict(
    length=[20, 25, 30, 22],
    breadth=[15, 20, 18, 25],
    price=[200000, 250000, 300000, 220000],
)
df = pd.DataFrame(data).assign(
    area=lambda frame: frame['length'] * frame['breadth']
)
df

Unnamed: 0,length,breadth,price,area
0,20,15,200000,300
1,25,20,250000,500
2,30,18,300000,540
3,22,25,220000,550


In [2]:
# Example: Encoding Nominal Variables
# One-hot encode every categorical column. drop_first=True omits the first
# level of each column, so the displayed result has no city_A,
# parents_athlete_N or chance_of_win_N indicator.
data = dict(
    city=['A', 'B', 'C', 'A'],
    parents_athlete=['Y', 'N', 'N', 'Y'],
    chance_of_win=['Y', 'N', 'Y', 'N'],
)
df = pd.DataFrame(data)
pd.get_dummies(data=df, drop_first=True)

Unnamed: 0,city_B,city_C,parents_athlete_Y,chance_of_win_Y
0,False,False,True,True
1,True,False,False,False
2,False,True,False,True
3,False,False,True,False


In [3]:
 # Example: Encoding Ordinal Variables
# grade_map assigns each letter an integer rank (A=1 ... D=4); mapping the
# grade column through it yields the numeric column num_grade.
grade_map = {'A': 1, 'B': 2, 'C': 3, 'D': 4}
data = {'grade': ['A', 'B', 'C', 'D', 'A']}
df = pd.DataFrame(data)
df = df.assign(num_grade=df['grade'].map(grade_map))
df

Unnamed: 0,grade,num_grade
0,A,1
1,B,2
2,C,3
3,D,4
4,A,1


In [4]:
# Example: Binning Continuous Variables
import numpy as np

# pd.cut uses right-closed intervals by default, so the edges below mean
# (0, 300000] -> Low, (300000, 600000] -> Medium, (600000, inf] -> High.
# A price sitting exactly on an edge (e.g. 600000) lands in the lower bin.
bins = [0, 300000, 600000, np.inf]
labels = ['Low', 'Medium', 'High']

df = pd.DataFrame({'price': [200000, 350000, 600000, 800000]})
df = df.assign(price_category=pd.cut(df['price'], bins=bins, labels=labels))
df

Unnamed: 0,price,price_category
0,200000,Low
1,350000,Medium
2,600000,Medium
3,800000,High


 Feature Extraction

In [5]:
# PCA Example
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA

# Load iris dataset and take its feature matrix
iris = load_iris()
X = iris.data

# Apply PCA, keeping the first two principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Report the shape before and after the projection
for label, arr in (("Original shape:", X), ("Reduced shape:", X_pca)):
    print(label, arr.shape)

Original shape: (150, 4)
Reduced shape: (150, 2)


In [6]:
# SVD Example
import numpy as np
from sklearn.decomposition import TruncatedSVD

# Fixed seed so that Restart & Run All reproduces the same input matrix and
# the same decomposition: the matrix is random, and TruncatedSVD's default
# solver is itself randomized.
SEED = 42

# Random matrix (simulating text data), values uniform in [0, 1)
rng = np.random.default_rng(SEED)
X = rng.random((5, 4))

# Apply Truncated SVD, keeping two components
svd = TruncatedSVD(n_components=2, random_state=SEED)
X_svd = svd.fit_transform(X)
print("Original:", X.shape)
print("Reduced:", X_svd.shape)

Original: (5, 4)
Reduced: (5, 2)


In [7]:
 # LDA Example
from sklearn.datasets import load_iris
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# LDA is fitted with the class labels: y is passed to fit_transform with X.
iris = load_iris()
X = iris.data
y = iris.target

# Project the features onto two discriminant components
lda = LinearDiscriminantAnalysis(n_components=2)
X_lda = lda.fit_transform(X, y)

# Report the shape before and after the projection
for label, arr in (("Original shape:", X), ("Reduced shape:", X_lda)):
    print(label, arr.shape)

Original shape: (150, 4)
Reduced shape: (150, 2)


Feature Selection

In [8]:
# Filter Method (Chi-Square)
from sklearn.feature_selection import SelectKBest, chi2

# Score every feature against the target with chi2 and keep the two
# best-scoring columns. `iris` is the dataset object bound by an earlier cell.
selector = SelectKBest(score_func=chi2, k=2)
X_new = selector.fit_transform(iris.data, iris.target)
print('Original:', iris.data.shape, 'Reduced:', X_new.shape)

Original: (150, 4) Reduced: (150, 2)


In [12]:
# Alternate Method (Chi-Square)
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, chi2

# Load the iris dataset as a DataFrame.
# NOTE: this rebinds the notebook-global `iris` to the as_frame=True result,
# so any cell executed after this one sees that object instead.
iris = load_iris(as_frame=True)
df_i = iris.frame

# Separate features and target positionally using iloc
X = df_i.iloc[:, :-1]   # all columns except the last
y = df_i.iloc[:, -1]    # last column

# Apply the Chi-Square test, keeping only the single best-scoring feature
selector = SelectKBest(score_func=chi2, k=1)
X_new = selector.fit_transform(X, y)

# Report the shape before and after selection
for label, arr in (("Original shape: ", X), ("Reduced shape: ", X_new)):
    print(label, arr.shape)

Original shape:  (150, 4)
Reduced shape:  (150, 1)


In [11]:
# Wrapper Method - Recursive Feature Elimination (RFE)
from sklearn.datasets import load_iris
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Load the data inside this cell rather than reading the notebook-global
# `iris`: that name is bound by more than one earlier cell (once via
# load_iris(), once via load_iris(as_frame=True)), so the input this cell
# received used to depend on which of them had been executed last.
X, y = load_iris(return_X_y=True)

# max_iter is raised to 200 for the logistic-regression solver.
model = LogisticRegression(max_iter=200)

# Apply RFE for 2 features: the estimator is refitted repeatedly while the
# selector eliminates features until two remain.
rfe = RFE(model, n_features_to_select=2)
fit = rfe.fit(X, y)

# support_ is the boolean mask of kept features; ranking_ gives each feature's
# rank, with 1 marking the selected ones.
print("Selected Features: ", fit.support_)
print("Feature Ranking: ", fit.ranking_)

Selected Features:  [False False  True  True]
Feature Ranking:  [3 2 1 1]


In [13]:
# Embedded Method (Lasso)
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LassoCV

# Load diabetes dataset as a single DataFrame
diabetes = load_diabetes(as_frame=True)
df_d = diabetes.frame

# Separate features and target using iloc: the last column is taken as the
# target, everything before it as the features.
X = df_d.iloc[:, :-1]
y = df_d.iloc[:, -1]

# Fit LassoCV with 5-fold cross-validation. Features whose coefficient is
# exactly zero are counted as dropped by the model.
lasso = LassoCV(cv=5).fit(X, y)
nonzero_mask = lasso.coef_ != 0
print("Coefficients:", lasso.coef_)
print("Number of selected features:", nonzero_mask.sum())

Coefficients: [  -6.49469328 -235.99308032  521.7443693   321.0607768  -569.43813385
  302.45319289   -0.          143.69851474  669.92267515   66.83551067]
Number of selected features: 9
