## Feature Selection

In [None]:
### Define `df` (the dataset DataFrame) here; every cell below reads from it


In [None]:
### Info gain

import pandas as pd
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.utils.multiclass import type_of_target

# Assuming `df` is your dataset DataFrame (it must be defined before this cell runs).
# Name of the column to predict; every other column of `df` is used as a feature.
target_column = 'cooling_power'

# Separate features and target
X = df.drop(columns=[target_column])
y = df[target_column]

# If the target variable is categorical, encode it as integer class labels
if y.dtypes == 'object' or y.dtypes.name == 'category':
    le = LabelEncoder()
    y = le.fit_transform(y)

# Rescale every feature with MinMaxScaler before estimating mutual information.
# NOTE(review): this assumes all feature columns are numeric — confirm for your df.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Calculate mutual information (Information Gain).
# mutual_info_classif only accepts discrete class labels and raises a ValueError
# for a continuous target, so a real-valued target is routed to the regression
# estimator; every target the classifier variant accepts still goes through it.
if type_of_target(y) == 'continuous':
    mi_scores = mutual_info_regression(X_scaled, y, random_state=42)
else:
    mi_scores = mutual_info_classif(X_scaled, y, random_state=42)

# Create a DataFrame to display feature importance, highest score first
mi_df = pd.DataFrame({
    'Feature': X.columns,
    'Information Gain': mi_scores
}).sort_values(by='Information Gain', ascending=False)

print("Information Gain for each feature:")
print(mi_df)

# Optional: Select top-k features based on Information Gain
k = 5  # Set the number of top features you want to keep
top_features = mi_df.head(k)['Feature'].tolist()
print("Top features:", top_features)


In [None]:
### CFS

import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
from itertools import combinations

# Assuming `df` is your dataset DataFrame (it must be defined before this cell runs).
# Target column: set to the same column the Info gain cell uses, so that every
# selection method in this notebook is evaluated against the same variable.
# (The previous placeholder name 'target_column' made df.drop raise a KeyError
# unless df really had a column with that literal name.)
target_column = 'cooling_power'

# Separate features and target
X = df.drop(columns=[target_column])
y = df[target_column]

# If the target variable is categorical, encode it as integer class labels
if y.dtypes == 'object' or y.dtypes.name == 'category':
    le = LabelEncoder()
    y = le.fit_transform(y)

# Function to calculate CFS score
def cfs_score(features, target):
    """Return the Correlation-based Feature Selection (CFS) merit of a subset.

    The merit of a subset with ``k`` features is

        merit = k * r_cf / sqrt(k + k * (k - 1) * r_ff)

    where ``r_cf`` is the mean absolute Pearson correlation between each
    feature and the target and ``r_ff`` is the mean absolute Pearson
    correlation over all feature pairs. Relevant features raise the merit,
    redundant (mutually correlated) features lower it, and the denominator is
    always at least ``sqrt(k)``, so the score stays finite even when the
    features are uncorrelated with each other.

    Parameters
    ----------
    features : pandas.DataFrame
        One numeric column per candidate feature.
    target : array-like
        Numeric target values, one per row of ``features``.

    Returns
    -------
    float
        The merit of the subset; ``0`` when ``features`` has no columns.
    """
    columns = list(features.columns)
    num_features = len(columns)
    if num_features == 0:
        return 0

    def _abs_corr(a, b):
        # A zero-variance input makes np.corrcoef divide 0 by 0 and return NaN.
        # Such a column carries no linear information, so count it as 0
        # instead of letting NaN propagate into the score and the ranking.
        with np.errstate(divide='ignore', invalid='ignore'):
            corr = np.corrcoef(a, b)[0, 1]
        return 0.0 if np.isnan(corr) else abs(corr)

    # Mean feature-target correlation (relevance)
    avg_feature_corr = sum(_abs_corr(features[col], target) for col in columns) / num_features

    # Mean feature-feature correlation (redundancy); undefined for one feature
    if num_features > 1:
        pairs = list(combinations(columns, 2))
        avg_pairwise_corr = sum(
            _abs_corr(features[left], features[right]) for left, right in pairs
        ) / len(pairs)
    else:
        avg_pairwise_corr = 0.0

    # CFS merit formula
    redundancy_term = num_features * (num_features - 1) * avg_pairwise_corr
    return (num_features * avg_feature_corr) / np.sqrt(num_features + redundancy_term)

# Exhaustive search: score every non-empty combination of feature columns.
# Each entry is a (tuple_of_column_names, score) pair.
cfs_scores = [
    (candidate, cfs_score(X[list(candidate)], y))
    for subset_size in range(1, len(X.columns) + 1)
    for candidate in combinations(X.columns, subset_size)
]

# Rank the subsets from highest to lowest CFS score
top_cfs_scores = list(cfs_scores)
top_cfs_scores.sort(key=lambda entry: entry[1], reverse=True)

# Report the five best-scoring subsets
print("Top feature subsets by CFS score:")
for subset, score in top_cfs_scores[:5]:
    print(f"Features: {subset}, CFS Score: {score}")



In [None]:
### CSS

import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
from itertools import combinations

# Assuming `df` is your dataset DataFrame (it must be defined before this cell runs).
# Target column: set to the same column the Info gain cell uses, so that every
# selection method in this notebook is evaluated against the same variable.
# (The previous placeholder name 'target_column' made df.drop raise a KeyError
# unless df really had a column with that literal name.)
target_column = 'cooling_power'

# Separate features and target
X = df.drop(columns=[target_column])
y = df[target_column]

# If the target variable is categorical, encode it as integer class labels
if y.dtypes == 'object' or y.dtypes.name == 'category':
    le = LabelEncoder()
    y = le.fit_transform(y)

# Function to calculate CSS score
def css_score(features, target):
    """Return the consistency score of a feature subset.

    Rows are grouped by their exact combination of feature values. Within each
    group the most frequent target label is counted as "consistent"; the score
    is the total number of consistent rows divided by the number of rows.

    Parameters
    ----------
    features : pandas.DataFrame
        The candidate feature columns, one row per sample.
    target : array-like
        One label per row of ``features``. Rows are matched by position, so a
        pandas Series with any index works, and labels may be any hashable
        values (not only non-negative integers).

    Returns
    -------
    float
        Fraction of rows whose label is the majority label of their
        feature-value group; ``0.0`` when there are no rows.
    """
    row_keys = [tuple(row) for row in features.values.tolist()]
    # Convert to a plain positional list so indexing never depends on a
    # pandas index and labels do not have to be valid np.bincount input.
    labels = np.asarray(target).tolist()
    if not labels:
        return 0.0

    # Single pass: for every distinct feature-value row, count each label
    label_counts_by_row = {}
    for key, label in zip(row_keys, labels):
        counts = label_counts_by_row.setdefault(key, {})
        counts[label] = counts.get(label, 0) + 1

    # Rows carrying their group's majority label are the consistent ones
    consistency = sum(max(counts.values()) for counts in label_counts_by_row.values())

    return consistency / len(labels)

# Exhaustive search: score every non-empty combination of feature columns.
# Each entry is a (tuple_of_column_names, score) pair.
css_scores = [
    (candidate, css_score(X[list(candidate)], y))
    for subset_size in range(1, len(X.columns) + 1)
    for candidate in combinations(X.columns, subset_size)
]

# Rank the subsets from highest to lowest CSS score
top_css_scores = list(css_scores)
top_css_scores.sort(key=lambda entry: entry[1], reverse=True)

# Report the five best-scoring subsets
print("Top feature subsets by CSS score:")
for subset, score in top_css_scores[:5]:
    print(f"Features: {subset}, CSS Score: {score}")
