<a href="https://colab.research.google.com/github/makhlufiaero338/tugas-machine-learning/blob/main/tugasperbaikan/Tugas_perbaikan_bab4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install necessary libraries (if needed)
!pip install pandas numpy scikit-learn



In [2]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [3]:
# Generate synthetic data
np.random.seed(42)
data = pd.DataFrame({
    'Age': np.random.randint(18, 70, size=100),
    'Gender': np.random.choice(['Male', 'Female'], size=100),
    'Income': np.random.randint(20000, 100000, size=100),
    'Category': np.random.choice(['A', 'B', 'C'], size=100),
    'Target': np.random.choice([0, 1], size=100)
})

print("Sample Data:")
print(data.head())

Sample Data:
   Age Gender  Income Category  Target
0   56   Male   45939        C       1
1   69   Male   68925        A       0
2   46   Male   62941        B       1
3   32   Male   41834        B       0
4   60   Male   38047        C       1


In [4]:
# 1. One-Hot-Encoding (Dummy Variables)
encoder = OneHotEncoder(sparse_output=False)
encoded_categories = encoder.fit_transform(data[['Category']])
category_cols = encoder.get_feature_names_out(['Category'])
encoded_df = pd.DataFrame(encoded_categories, columns=category_cols)

# Add encoded categories back to data
data = data.drop('Category', axis=1)
data = pd.concat([data, encoded_df], axis=1)

In [5]:
# 2. Numbers Can Encode Categoricals
# Example: Convert 'Gender' to 0 (Male) and 1 (Female)
data['Gender'] = data['Gender'].map({'Male': 0, 'Female': 1})

In [6]:
# 3. Binning (Discretization)
data['Age_Bin'] = pd.cut(data['Age'], bins=[0, 25, 50, 75], labels=['Young', 'Middle-aged', 'Senior'])

In [7]:
# 4. Interactions and Polynomials
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
poly_features = poly.fit_transform(data[['Age', 'Income']])
poly_feature_names = poly.get_feature_names_out(['Age', 'Income'])

poly_df = pd.DataFrame(poly_features, columns=poly_feature_names)
data = pd.concat([data, poly_df], axis=1)

In [10]:
# 5. Univariate Nonlinear Transformations
data['Log_Income'] = np.log(data['Income'].iloc[:, 0] + 1 if isinstance(data['Income'], pd.DataFrame) else data['Income'] + 1)

# Split data into features and target
X = data.drop(['Target', 'Age_Bin'], axis=1)
y = data['Target']

In [11]:
# 6. Automatic Feature Selection
# Univariate Statistics
select_k_best = SelectKBest(score_func=f_classif, k=5)
X_selected = select_k_best.fit_transform(X, y)
print("Selected Features (Univariate):")
print(X.columns[select_k_best.get_support()])

# Model-Based Feature Selection
rf_model = RandomForestClassifier()
rf_model.fit(X, y)
important_features = pd.Series(rf_model.feature_importances_, index=X.columns).sort_values(ascending=False)
print("\nFeature Importances (Random Forest):")
print(important_features.head())

Selected Features (Univariate):
Index(['Gender', 'Category_A', 'Category_B', 'Category_C', 'Income'], dtype='object')

Feature Importances (Random Forest):
Log_Income    0.172412
Income        0.154491
Age Income    0.140689
Income        0.139942
Age           0.134565
dtype: float64


In [12]:
# Iterative Feature Selection
log_reg = LogisticRegression(max_iter=1000)
rfe = RFE(log_reg, n_features_to_select=5)
rfe.fit(X, y)
print("\nSelected Features (RFE):")
print(X.columns[rfe.support_])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Selected Features (RFE):
Index(['Gender', 'Category_A', 'Category_B', 'Category_C', 'Log_Income'], dtype='object')


In [13]:
# Summary
print("\nFinal Processed Data Sample:")
print(data.head())


Final Processed Data Sample:
   Age  Gender  Income  Target  Category_A  Category_B  Category_C  \
0   56       0   45939       1         0.0         0.0         1.0   
1   69       0   68925       0         1.0         0.0         0.0   
2   46       0   62941       1         0.0         1.0         0.0   
3   32       0   41834       0         0.0         1.0         0.0   
4   60       0   38047       1         0.0         0.0         1.0   

       Age_Bin   Age   Income  Age Income  Log_Income  
0       Senior  56.0  45939.0   2572584.0   10.735091  
1       Senior  69.0  68925.0   4755825.0   11.140789  
2  Middle-aged  46.0  62941.0   2895286.0   11.049969  
3  Middle-aged  32.0  41834.0   1338688.0   10.641489  
4       Senior  60.0  38047.0   2282820.0   10.546604  
