<font size="10"> Filter Methods for Feature Selection Method</font>

<span style="color:red; font-size:24px; font-weight:bold;">Filter Method 1: Computing and Show  Chi square values</span>

In [37]:
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.datasets import load_iris

# Load example data (replace this with your own data)
data = load_iris()
X = data.data  # Features
y = data.target  # Target variable

# Feature selection using Chi-square test
# Select the top k features based on the Chi-square statistic
#k = 4  # Number of top features to select, i.e., k=2 will selct top two features based on chi square values
k=X.shape[1] # counts number of features
selector = SelectKBest(score_func=chi2, k=k) 
X_new = selector.fit_transform(X, y)

# Print the selected features and their Chi-square scores
print("Selected features:")
selected_features = []
for i, feature_idx in enumerate(selector.get_support(indices=True)):
    selected_features.append(feature_idx)
    print(f"Feature {feature_idx}: Chi-square score = {selector.scores_[feature_idx]}")

Selected features:
Feature 0: Chi-square score = 10.817820878494002
Feature 1: Chi-square score = 3.7107283035324965
Feature 2: Chi-square score = 116.31261309207025
Feature 3: Chi-square score = 67.04836020011118


<span style="color:red; font-size:24px; font-weight:bold;">Filter Method 1.2: Computing and Show P values</span>

In [42]:
from sklearn.datasets import load_iris
from scipy.stats import chi2_contingency

# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Perform chi-square tests for each feature
p_values = []
for i in range(X.shape[1]):
    observed = X[:, i]
    contingency_table = []
    for unique_class in set(y):
        contingency_table.append([(observed[y == unique_class] == val).sum() for val in set(observed)])
    _, p, _, _ = chi2_contingency(contingency_table)
    p_values.append(p)

# Print the p-values for each feature
for i, p_value in enumerate(p_values):
    p_value= "{:.10f}".format(p_value) #show 10 digits after decimal
    print(f"Feature {i}: p-value = {p_value}")

Feature 0: p-value = 0.0000000067
Feature 1: p-value = 0.0000601603
Feature 2: p-value = 0.0000000000
Feature 3: p-value = 0.0000000000


 <span style="color:red; font-size:24px; font-weight:bold;">Filter Method 2:Feature Selection by Pearson Correlation coefficient scores</span>

In [43]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from scipy.stats import pearsonr

# Load the Iris dataset
iris = load_iris()
data = iris.data
target = iris.target

# Create a DataFrame
df = pd.DataFrame(data, columns=iris.feature_names)
df['target'] = target

# Extract features and target
features = df.drop(columns=['target'])
target = df['target']

# Function to calculate Pearson correlation coefficient for each feature
def compute_pearson_correlation(features, target):
    correlations = {}
    for column in features.columns:
        corr, _ = pearsonr(features[column], target)
        correlations[column] = corr
    return correlations

# Compute Pearson correlation coefficient for each feature
correlations = compute_pearson_correlation(features, target)

# Print the Pearson correlation coefficients
for feature, correlation in correlations.items():
    print(f'Feature: {feature}, Pearson Correlation: {correlation}')


Feature: sepal length (cm), Pearson Correlation: 0.7825612318100812
Feature: sepal width (cm), Pearson Correlation: -0.426657560781124
Feature: petal length (cm), Pearson Correlation: 0.9490346990083889
Feature: petal width (cm), Pearson Correlation: 0.9565473328764029



<span style="color:red; font-size:24px; font-weight:bold;">Filter Method 3:Feature Selection by Variance Thresold</span>


In [45]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.feature_selection import VarianceThreshold

# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Convert to DataFrame for better visualization (optional)
df = pd.DataFrame(X, columns=iris.feature_names)
df['target'] = y

# Print original feature variances
print("Original feature variances:")
print(df.var())

# Define a variance threshold
threshold = 0.5

# Apply VarianceThreshold
selector = VarianceThreshold(threshold=threshold)
X_high_variance = selector.fit_transform(X)

# Print the shape of the dataset before and after variance thresholding
print("Original shape:", X.shape)
print("Shape after Variance Threshold:", X_high_variance.shape)

# Optionally, print the variances of selected features
selected_features = selector.get_support(indices=True)
print("Selected feature indices:", selected_features)
print("Selected feature variances:", df.var().iloc[selected_features])


Original feature variances:
sepal length (cm)    0.685694
sepal width (cm)     0.189979
petal length (cm)    3.116278
petal width (cm)     0.581006
target               0.671141
dtype: float64
Original shape: (150, 4)
Shape after Variance Threshold: (150, 3)
Selected feature indices: [0 2 3]
Selected feature variances: sepal length (cm)    0.685694
petal length (cm)    3.116278
petal width (cm)     0.581006
dtype: float64



<span style="color:red; font-size:24px; font-weight:bold;">Find Missing Values</span>


In [None]:
import pandas as pd

# Load your dataset into a pandas DataFrame
# For example, if you have a CSV file:
# df = pd.read_csv('your_dataset.csv')

# Creating a sample DataFrame for demonstration
data = {
    'A': [1, 2, None, 4],
    'B': [None, 2, 3, 4],
    'C': [1, None, None, 4],
    'D': [1, 2, 3, 4]
}
df = pd.DataFrame(data)

# Find missing values
missing_values = df.isnull()

# Print the DataFrame showing missing values (True indicates missing)
print("Missing Values in the DataFrame:")
print(missing_values)

# Alternatively, you can get the count of missing values per column
missing_count_per_column = df.isnull().sum()

print("\nCount of Missing Values per Column:")
print(missing_count_per_column)

# If you want to get the count of missing values in the entire DataFrame
total_missing = df.isnull().sum().sum()

print("\nTotal Number of Missing Values in the DataFrame:")
print(total_missing)


In [None]:
# Fill missing values (null) with the mean of their respective columns
df.fillna(df.mean(), inplace=True)

print("\nDataFrame after filling missing values with mean:")
print(df)

In [None]:
import pandas as pd

# Load your dataset into a pandas DataFrame
# For example, if you have a CSV file:
# df = pd.read_csv('your_dataset.csv')

# Creating a sample DataFrame for demonstration
data = {
    'A': [1, 2, None, 4],
    'B': [None, 2, 3, 4],
    'C': [1, None, None, 4],
    'D': [1, 2, 3, 4]
}
df = pd.DataFrame(data)

# Select the column you want to check for NaN values
column_name = 'A'

# Find the total number of NaN values in the column
nan_count = df[column_name].isnull().sum()

# Print the total number of NaN values in the column
print(f"Total number of NaN values in column '{column_name}': {nan_count}")

# Fill NaN values in the column with zero
df[column_name].fillna(0, inplace=True)

print(f"\nDataFrame after filling NaN values in column '{column_name}' with zero:")
print(df)





<span style="color:red; font-size:24px; font-weight:bold;">Dimensionality Reduction Using Principal Component Analysis (PCA)</span>
aont>

In [None]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Sample dataset
data = {
    'A': [1, 2, 3, 4, 5],
    'B': [2, 3, 4, 5, 6],
    'C': [5, 6, 7, 8, 9],
    'D': [3, 4, 5, 6, 7]
}
df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)

# Standardize the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)

print("\nStandardized Data:")
print(scaled_data)

# Perform PCA
pca = PCA(n_components=2)  # Number of components to keep
principal_components = pca.fit_transform(scaled_data)

# Create a DataFrame with the principal components
principal_df = pd.DataFrame(data=principal_components, columns=['Principal Component 1', 'Principal Component 2'])

print("\nDataFrame with Principal Components:")
print(principal_df)

# Explained variance
print("\nExplained Variance Ratio:")
print(pca.explained_variance_ratio_)

# Cumulative explained variance
print("\nCumulative Explained Variance Ratio:")
print(pca.explained_variance_ratio_.cumsum())


<span style="color:red; font-size:24px; font-weight:bold;">Dimensionality Reduction Using Linear Discriminant Analysis  (LDA)</span>

In [None]:
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Perform LDA
lda = LDA(n_components=2)  # Number of components to keep
X_train_lda = lda.fit_transform(X_train, y_train)
X_test_lda = lda.transform(X_test)

# Create DataFrames for better visualization
train_df = pd.DataFrame(data=X_train_lda, columns=['LDA1', 'LDA2'])
train_df['target'] = y_train

test_df = pd.DataFrame(data=X_test_lda, columns=['LDA1', 'LDA2'])
test_df['target'] = y_test

print("Training set after LDA:")
print(train_df.head())

print("\nTesting set after LDA:")
print(test_df.head())

# Explained variance ratio
explained_variance_ratio = lda.explained_variance_ratio_
print("\nExplained Variance Ratio:")
print(explained_variance_ratio)
