### Feature Selection - Using Mutual Information
**Description**: Use mutual information for feature selection to identify important features.

In [1]:
# write your code from here

import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder

# --------------------------------------
# Step 1: Sample Dataset
# --------------------------------------
data = {
    'age': [25, 35, 45, 20, 30],
    'income': [50000, 60000, 80000, 30000, 40000],
    'job': ['teacher', 'engineer', 'teacher', 'nurse', 'engineer'],
    'married': ['yes', 'no', 'yes', 'no', 'yes'],
    'target': [1, 0, 1, 0, 1]
}
df = pd.DataFrame(data)

# --------------------------------------
# Step 2: Validate and Encode
# --------------------------------------
def validate_and_encode(df, target_col):
    if df.empty:
        raise ValueError("‚ùå DataFrame is empty.")
    if target_col not in df.columns:
        raise ValueError(f"‚ùå Target column '{target_col}' not found.")

    df_encoded = df.copy()
    for col in df.columns:
        if df[col].dtype == 'object':
            le = LabelEncoder()
            df_encoded[col] = le.fit_transform(df[col])

    return df_encoded

# --------------------------------------
# Step 3: Mutual Information Selection
# --------------------------------------
def select_features_by_mutual_info(df, target_col, top_k=3):
    df_encoded = validate_and_encode(df, target_col)

    X = df_encoded.drop(columns=[target_col])
    y = df_encoded[target_col]

    mi_scores = mutual_info_classif(X, y, random_state=42)
    mi_series = pd.Series(mi_scores, index=X.columns).sort_values(ascending=False)

    print("üîç Mutual Information Scores:")
    print(mi_series)

    selected_features = mi_series.head(top_k).index.tolist()
    print(f"\n‚úÖ Top {top_k} selected features: {selected_features}")
    return X[selected_features]

# --------------------------------------
# Step 4: Apply Feature Selection
# --------------------------------------
try:
    selected_data = select_features_by_mutual_info(df, target_col='target', top_k=2)
    print("\nüéØ Selected Feature Data:")
    print(selected_data)
except Exception as e:
    print("‚ùå Error:", e)

üîç Mutual Information Scores:
married    1.383333
job        0.083333
income     0.000000
age        0.000000
dtype: float64

‚úÖ Top 2 selected features: ['married', 'job']

üéØ Selected Feature Data:
   married  job
0        1    2
1        0    0
2        1    2
3        0    1
4        1    0
