# Feature Engineering

#### 1. Missing Value Imputation

Replace missing values in the "Age" column with the median age.

In [None]:
import pandas as pd

df = pd.read_csv('titanic.csv')
# Assign the filled column back rather than calling fillna(inplace=True) on
# df['Age']: the in-place call operates on an intermediate object, which
# pandas 2.x warns about and which leaves df unchanged under Copy-on-Write.
df['Age'] = df['Age'].fillna(df['Age'].median())


#### 2. Encoding Categorical Variables

Use one-hot encoding on the "Species" column.

In [None]:
# Load the iris data and expand "Species" into one indicator column per
# distinct value; all other columns are carried over unchanged.
df = pd.read_csv('iris.csv')
df_encoded = pd.get_dummies(data=df, columns=['Species'])


#### 3. Binning

Bin the "AGE" variable into two categories, "New" and "Old".

In [None]:
df = pd.read_csv('boston_housing.csv')
# Bins are right-closed: [0, 35] -> 'New', (35, 100] -> 'Old'.
# include_lowest=True keeps AGE == 0 inside the 'New' bin; without it the
# first interval is (0, 35] and a value of exactly 0 would be binned as NaN.
df['Age_Bin'] = pd.cut(
    df['AGE'],
    bins=[0, 35, 100],
    labels=['New', 'Old'],
    include_lowest=True,
)


#### 4. Feature Scaling

Standardize the "alcohol" column.

In [None]:
from sklearn.preprocessing import StandardScaler

df = pd.read_csv('winequality.csv')
scaler = StandardScaler()
# Double brackets select "alcohol" as a one-column DataFrame (2-D) rather
# than a Series before handing it to the scaler.
alcohol_2d = df[['alcohol']]
df['alcohol_scaled'] = scaler.fit_transform(alcohol_2d)


#### 5. Polynomial and Interaction Features

Add interaction between "TV" and "Radio" advertising budgets.

In [None]:
df = pd.read_csv('advertising.csv')
# Interaction term: element-wise product of the two budget columns.
df['TV_Radio_Interaction'] = df['TV'].mul(df['Radio'])


#### 6. Log Transform and Power Transform

Apply a log transform to the "SalePrice" column to reduce skewness.

In [None]:
import numpy as np

df = pd.read_csv('house_prices.csv')
# log1p(x) computes log(1 + x) directly; it is the idiomatic form and stays
# accurate for values close to zero, where forming x + 1 first loses precision.
df['SalePrice_Log'] = np.log1p(df['SalePrice'])


#### 7. Date and Time Extraction

Extract month and year from the "date" column.

In [None]:
df = pd.read_csv('retail_sales.csv')
# Convert the raw "date" column to datetimes so the .dt accessor is available.
df['date'] = pd.to_datetime(df['date'])
date_parts = df['date'].dt
df['year'], df['month'] = date_parts.year, date_parts.month


#### 8. Text-Based Feature Engineering

Use TF-IDF on email content.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

df = pd.read_csv('spam_emails.csv')
# Fit the vectorizer on the email text and transform it in a single step.
email_texts = df['email_content']
tfidf = TfidfVectorizer()
tfidf_features = tfidf.fit_transform(email_texts)


#### 9. Aggregations and Grouping

Calculate the average rating per movie.

In [None]:
df = pd.read_csv('movie_ratings.csv')
# One row per movieId holding the mean of its ratings; as_index=False keeps
# movieId as a regular column, and the mean column is renamed afterwards.
avg_rating = (
    df.groupby('movieId', as_index=False)['rating']
    .mean()
    .rename(columns={'rating': 'average_rating'})
)


#### 10. Dimensionality Reduction Techniques

Use PCA to reduce the number of pixel features.

In [None]:
from sklearn.decomposition import PCA

df = pd.read_csv('mnist.csv')
# Assumes column 0 is not pixel data and every later column is a pixel
# feature -- verify against the actual file layout.
pixel_columns = df.iloc[:, 1:]
pca = PCA(n_components=50)
df_pca = pca.fit_transform(pixel_columns)


#### 11. Feature Selection

Use a correlation threshold to drop one feature from each highly correlated pair.

In [None]:
# Imported here so the cell does not depend on an earlier cell having run.
import numpy as np

df = pd.read_csv('breast_cancer.csv')
# numeric_only=True restricts the correlation to numeric columns; since
# pandas 2.0 the default raises if any column cannot be converted to float.
corr_matrix = df.corr(numeric_only=True).abs()
# Keep only the strict upper triangle (k=1) so each pair is inspected once
# and the diagonal of self-correlations is ignored.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# A column is dropped when it correlates above 0.9 with any earlier column.
to_drop = [column for column in upper.columns if (upper[column] > 0.9).any()]
df_filtered = df.drop(columns=to_drop)


#### 12. Target Transformation

Apply square root transformation to the "count" variable to reduce skewness.

In [None]:
df = pd.read_csv('bike_sharing.csv')
# Square-root transform of the "count" column, stored alongside the original.
ride_counts = df['count']
df['count_sqrt'] = np.sqrt(ride_counts)
