<a href="https://colab.research.google.com/github/mukulre/Projects/blob/main/Feature_Engineering_Practical_Concepts_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Sample data: a single nominal (unordered) categorical column.
data = {'Region': ['North', 'South', 'East', 'West', 'North']}
df = pd.DataFrame(data)

# Label encoding: each category is replaced by one integer code.
# The codes follow the sorted order of the class names, so they imply no
# meaningful ranking between regions.
label_encoder = LabelEncoder()
df['Region_Label'] = label_encoder.fit_transform(df['Region'])

# One-hot encoding: one indicator column per category.
# sparse_output=False makes the encoder return a dense NumPy array.
one_hot_encoder = OneHotEncoder(sparse_output=False)
encoded = one_hot_encoder.fit_transform(df[['Region']])

# Reuse df's index for the encoded frame: pd.concat(axis=1) aligns rows by
# index label, so a fresh 0..n-1 index would misalign (and NaN-pad) the result
# whenever df carries a non-default index.
encoded_df = pd.DataFrame(
    encoded,
    columns=one_hot_encoder.get_feature_names_out(['Region']),
    index=df.index,
)

# Combine the original data with the one-hot encoded columns.
df = pd.concat([df, encoded_df], axis=1)
print(df)

  Region  Region_Label  Region_East  Region_North  Region_South  Region_West
0  North             1          0.0           1.0           0.0          0.0
1  South             2          0.0           0.0           1.0          0.0
2   East             0          1.0           0.0           0.0          0.0
3   West             3          0.0           0.0           0.0          1.0
4  North             1          0.0           1.0           0.0          0.0


In [None]:
from sklearn.linear_model import LinearRegression
import numpy as np

# Toy dataset: Age is fully observed, Salary has gaps (None becomes NaN).
data = {
    'Age': [25, 30, 35, 40, 45],
    'Salary': [50000, 60000, None, 80000, None],
}
df = pd.DataFrame(data)

# Split the rows on whether Salary is present; the mask is computed once and
# reused for both the split and the write-back below.
salary_missing = df['Salary'].isnull()
df_without_missing = df[~salary_missing]
df_with_missing = df[salary_missing]

# Regression imputation: learn Salary as a linear function of Age from the
# complete rows, then predict Salary for the incomplete rows.
model = LinearRegression()
model.fit(df_without_missing[['Age']], df_without_missing['Salary'])
predicted_salaries = model.predict(df_with_missing[['Age']])

# Write the predictions into exactly the rows that were missing.
df.loc[salary_missing, 'Salary'] = predicted_salaries
print(df)

   Age   Salary
0   25  50000.0
1   30  60000.0
2   35  70000.0
3   40  80000.0
4   45  90000.0


In [None]:
from sklearn.preprocessing import StandardScaler

# Sample data: two features on very different numeric scales.
data = {
    'Age': [18, 25, 30, 45, 60],
    'Income': [20000, 50000, 100000, 200000, 500000],
}
df = pd.DataFrame(data)

# Standardize: learn each column's mean and standard deviation, then rescale
# every column to zero mean and unit variance.
scaler = StandardScaler()
scaled_features = scaler.fit(df).transform(df)

# The scaler returns a bare array, so restore the column labels for display.
scaled_df = pd.DataFrame(data=scaled_features, columns=df.columns)
print(scaled_df)

        Age    Income
0 -1.167023 -0.884648
1 -0.702866 -0.712314
2 -0.371325 -0.425091
3  0.623296  0.149356
4  1.617918  1.872697


In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Sample data: two numeric features to combine.
data = {
    'Size': [1000, 1500, 2000],
    'Rooms': [3, 4, 5],
}
df = pd.DataFrame(data)

# Build interaction features. With degree=2 and interaction_only=True the
# output keeps the original columns and adds only the cross term
# (Size * Rooms); no squared terms are produced. include_bias=False leaves
# out the constant column.
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
interaction_features = poly.fit(df).transform(df)

# Label the generated columns with the names the transformer reports.
interaction_df = pd.DataFrame(
    data=interaction_features,
    columns=poly.get_feature_names_out(['Size', 'Rooms']),
)
print(interaction_df)

     Size  Rooms  Size Rooms
0  1000.0    3.0      3000.0
1  1500.0    4.0      6000.0
2  2000.0    5.0     10000.0


In [None]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

# Simulated data. The global NumPy seed is set first and the draw order
# (X, then y) is fixed, so the generated values are reproducible.
np.random.seed(42)
n_samples, n_features = 100, 50
X = np.random.rand(n_samples, n_features)      # 50 uniform-random features
y = np.random.choice([0, 1], size=n_samples)   # binary target

# Recursive feature elimination: repeatedly fit the forest and discard the
# least important feature(s) until 10 remain.
# The forest is created without its own random_state, so its randomness comes
# from the global NumPy generator seeded above.
model = RandomForestClassifier()
rfe = RFE(estimator=model, n_features_to_select=10)
X_selected = rfe.fit_transform(X, y)

# Report which column positions survived the elimination.
selected_indices = rfe.get_support(indices=True)
print("Selected Features (Indices):", selected_indices)

Selected Features (Indices): [ 1 10 14 16 19 29 34 37 38 42]
