# HW 5-1 Classification
20220041
Using Ensemble and SVM (not submitted)

In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [None]:
# Data loading & selection
# Read the labelled training set and the test set from the working directory.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Separate the predictors from the target column 'position'.
train_feature = train_data.drop(columns=['position'])
train_label = train_data['position']

# Predictor columns that exist only in the training file cannot be supplied at
# prediction time, so they are removed from the training frame.
extra_cols = set(train_feature.columns) - set(test_data.columns)
train_data = train_data.drop(columns=list(extra_cols))

# DataFrame.info() prints its report itself and returns None, so it is called
# bare: wrapping it in print() would add a stray "None" line to the output.
train_data.info()

In [None]:
# Ignore tuples with missing values
# Drop every training row that contains a missing value, then split what is
# left into predictors and the 'position' target.
train_data_clean = train_data.dropna()
train_feature_clean = train_data_clean.drop(columns=['position'])
train_label_clean = train_data_clean['position']

# info() prints directly and returns None; calling it bare avoids printing "None".
train_feature_clean.info()

In [None]:
# Data transformation
# Keep the first four characters of SEASON_ID and store them as an integer
# (presumably the starting year of a season string -- TODO confirm against the data).
# Casting to str first keeps this cell re-runnable: on a second run the column
# is already int and a bare `.str` accessor would raise. assign() returns a new
# frame with the column replaced in its original position.
train_feature_clean = train_feature_clean.assign(
    SEASON_ID=train_feature_clean['SEASON_ID'].astype(str).str[:4].astype(int)
)

# Map the class labels to integer codes; `encoder` is kept so predictions can
# be mapped back to the original labels later.
encoder = LabelEncoder()
train_label_encode = encoder.fit_transform(train_label_clean)

# info() prints directly and returns None; calling it bare avoids printing "None".
train_feature_clean.info()

In [None]:
# Compute feature importance using Decision Tree
# Features whose importance is strictly below this value are discarded.
IMPORTANCE_THRESHOLD = 0.03

# NOTE(review): the tree is fitted on all cleaned training rows, including the
# rows that are later held out for validation, so the validation scores may be
# slightly optimistic -- confirm whether that is acceptable.
dt = DecisionTreeClassifier(random_state=10000)
dt.fit(train_feature_clean, train_label_encode)
feature_importances = dt.feature_importances_

# Pair every column name with its importance and collect the weak ones.
low_importance_features = [
    col
    for col, importance in zip(train_feature_clean.columns, feature_importances)
    if importance < IMPORTANCE_THRESHOLD
]

print("features with low importance:", low_importance_features)

# Reduce feature with low importance
train_feature_reduced = train_feature_clean.drop(columns=low_importance_features)

# info() prints directly and returns None; calling it bare avoids printing "None".
train_feature_reduced.info()

In [None]:
# Standardize features
# Learn the scaling statistics from the reduced training features, then apply
# them to that same frame. `scaler` keeps the fitted statistics for later reuse.
# NOTE(review): the scaler sees every cleaned training row, including rows that
# end up in the validation split -- confirm whether that is intended.
scaler = StandardScaler().fit(train_feature_reduced)
train_feature_scaled = scaler.transform(train_feature_reduced)

In [None]:
# Split the data
# Hold out 20% of the rows for validation; the fixed seed makes the split
# reproducible across runs.
split_options = {'test_size': 0.2, 'random_state': 50}
(feature_train, feature_val,
 label_train, label_val) = train_test_split(
    train_feature_scaled,
    train_label_encode,
    **split_options
)

In [None]:
# 1. Ensemble -> random forest classifier
# Baseline forest with library-default hyperparameters, trained on the
# training split (fit() returns the fitted estimator itself).
rf = RandomForestClassifier(random_state=50).fit(feature_train, label_train)

# Score the baseline on the held-out validation split.
label_pred_rf = rf.predict(feature_val)
f1_rf = f1_score(y_true=label_val, y_pred=label_pred_rf, average='weighted')
print("1. ensemble -> random forest model weighted f1 score:", f1_rf)

In [None]:
# Hyperparameter tuning for rf
# Candidate values explored exhaustively: 3 x 3 x 3 = 27 combinations.
param_grid_rf = dict(
    n_estimators=[50, 100, 200],
    max_depth=[10, 20, 30],
    min_samples_leaf=[1, 2, 4],
)

# 3-fold cross-validated search on the training split, ranked by weighted F1
# and run on all available cores (n_jobs=-1).
grid_search_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=10000),
    param_grid=param_grid_rf,
    scoring='f1_weighted',
    cv=3,
    n_jobs=-1,
)
grid_search_rf.fit(feature_train, label_train)

print("best params for rf:", grid_search_rf.best_params_)

In [None]:
# Prediction with best rf
# Take the estimator selected by the grid search and score it on the
# validation split with the same weighted-F1 metric used for the baseline.
best_rf = grid_search_rf.best_estimator_
label_pred_best_rf = best_rf.predict(feature_val)

f1_best_rf = f1_score(
    y_true=label_val,
    y_pred=label_pred_best_rf,
    average='weighted',
)
print("1+. best rf model weighted f1 score:", f1_best_rf)

In [None]:
# 2. SVM
# Baseline support vector classifier with library-default hyperparameters,
# trained on the training split (fit() returns the fitted estimator itself).
svm = SVC(random_state=50).fit(feature_train, label_train)

# Score the baseline on the held-out validation split.
label_pred_svm = svm.predict(feature_val)
f1_svm = f1_score(y_true=label_val, y_pred=label_pred_svm, average='weighted')
print("2. svm weighted f1 score:", f1_svm)

In [None]:
# Hyperparameter tuning for svm
# `gamma` is a kernel coefficient that the linear kernel does not use, so a
# single flat grid would fit every linear candidate twice (once per gamma
# value) with identical results. Splitting the grid per kernel searches the
# same distinct models with 9 candidates instead of 12. When a linear kernel
# wins, best_params_ simply has no 'gamma' entry.
svm_c_values = [0.1, 1, 10]
param_grid_svm = [
    {'kernel': ['rbf'], 'C': svm_c_values, 'gamma': ['scale', 'auto']},
    {'kernel': ['linear'], 'C': svm_c_values},
]

# 3-fold cross-validated search on the training split, ranked by weighted F1
# and run on all available cores (n_jobs=-1).
grid_search_svm = GridSearchCV(
    SVC(random_state=10000),
    param_grid_svm,
    scoring='f1_weighted',
    cv=3,
    n_jobs=-1
)

grid_search_svm.fit(feature_train, label_train)
print("best params for svm:", grid_search_svm.best_params_)

In [None]:
# Prediction with best svm
# Take the estimator selected by the grid search and score it on the
# validation split with the same weighted-F1 metric used for the baseline.
best_svm = grid_search_svm.best_estimator_
label_pred_best_svm = best_svm.predict(feature_val)

f1_best_svm = f1_score(
    y_true=label_val,
    y_pred=label_pred_best_svm,
    average='weighted',
)
print("2+. best svm weighted f1 score:", f1_best_svm)

Prediction on the test data

In [None]:
# Preprocessing
# Repeat the training-time preparation on the test set: drop the identifier
# column and turn SEASON_ID into an integer built from its first four characters.
test_data_clean = test_data.drop(columns=['ID'])
test_data_clean = test_data_clean.assign(
    SEASON_ID=test_data_clean['SEASON_ID'].astype(str).str[:4].astype(int)
)

# Keep exactly the columns the models were trained on, in the training order.
# Selecting by name (instead of only dropping the low-importance columns)
# guarantees the layout matches what `scaler` and the models were fitted on,
# and raises a KeyError if an expected column is missing.
test_data_clean_reduced = test_data_clean[list(train_feature_reduced.columns)]

# Apply the scaling statistics learned from the training data. Using
# fit_transform() here would re-fit the scaler on the test set, so the test
# features would be scaled differently from the data the models were trained on.
# NOTE(review): training rows with missing values were dropped, but test rows
# are not -- confirm the test file has no missing values in these columns.
test_data_scaled = scaler.transform(test_data_clean_reduced)

# Prediction
# Predict integer class codes with each tuned model, then map the codes back
# to the original 'position' labels.
test_pred_rf = best_rf.predict(test_data_scaled)
test_pred_labels_rf = encoder.inverse_transform(test_pred_rf)

test_pred_svm = best_svm.predict(test_data_scaled)
test_pred_labels_svm = encoder.inverse_transform(test_pred_svm)

# Pair every test ID with its predicted label, one frame per model.
result_rf = pd.DataFrame({
    'ID': test_data['ID'],
    'position': test_pred_labels_rf
})
result_svm = pd.DataFrame({
    'ID': test_data['ID'],
    'position': test_pred_labels_svm
})

# Write one CSV file per model, without the DataFrame index column.
result_rf.to_csv('rf.csv', index=False)
result_svm.to_csv('svm.csv', index=False)
print("Predictions saved")
