# HW 5-1 Classification
20220041
Using Neural Network (not submitted)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Data loading & selection
# Read the labelled training set and the test set from the working directory.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Separate the target column ('position') from the remaining columns.
train_feature = train_data.drop(columns=['position'])
train_label = train_data['position']

# Feature columns that exist in train but not in test cannot be supplied at
# prediction time, so they are removed from the training frame.
extra_cols = set(train_feature.columns) - set(test_data.columns)
# Hand drop() a list: a list-like of labels is what its API documents.
train_data = train_data.drop(columns=list(extra_cols))
print(extra_cols)
# DataFrame.info() prints its own report and returns None, so wrapping it in
# print() would emit an extra "None" line after the report.
train_data.info()

In [None]:
# Ignore tuples with missing values
# Any row containing at least one NaN is discarded before features and
# labels are separated, so both stay row-aligned.
train_data_clean = train_data.dropna()
train_feature_clean = train_data_clean.drop(columns=['position'])
train_label_clean = train_data_clean['position']

# info() writes its report directly and returns None; calling it without
# print() avoids a trailing "None" line in the cell output.
train_feature_clean.info()

In [None]:
# Data transformation
# Keep only the first four characters of SEASON_ID and store them as an int.
# The explicit astype(str) mirrors the preprocessing applied to the test set
# and keeps this line working even when pandas parsed the column as numeric
# (the .str accessor is only available on string data).
# NOTE(review): presumably the first four characters are the season's start
# year - confirm against the raw data.
train_feature_clean['SEASON_ID'] = (
    train_feature_clean['SEASON_ID'].astype(str).str[:4].astype(int)
)

# Map the position labels to integer class indices; the fitted encoder is
# kept so predictions can be mapped back to the original labels later.
encoder = LabelEncoder()
train_label_encode = encoder.fit_transform(train_label_clean)

# info() prints its report itself and returns None, so it is not wrapped in
# print() (which would add a stray "None" line).
train_feature_clean.info()

In [None]:
# Compute feature importance using Decision Tree
# NOTE(review): the tree is fitted on every cleaned row, i.e. before the
# train/validation split performed later, so the validation rows also
# influence which features are kept.
dt = DecisionTreeClassifier(random_state=10000)
dt.fit(train_feature_clean, train_label_encode)
feature_importances = dt.feature_importances_

# Features whose importance falls below this cut-off are discarded.
importance_threshold = 0.03
low_importance_features = [
    col
    for col, importance in zip(train_feature_clean.columns, feature_importances)
    if importance < importance_threshold
]

print("features with low importance:", low_importance_features)

# Reduce feature with low importance
train_feature_reduced = train_feature_clean.drop(columns=low_importance_features)

# info() prints its own report and returns None; calling it bare avoids an
# extra "None" line in the output.
train_feature_reduced.info()

In [None]:
# Standardize features
# Learn the per-column scaling statistics first, then apply them; the fitted
# scaler is reused later to transform the test set.
# NOTE(review): the statistics are learned from all cleaned rows, before the
# train/validation split that follows.
scaler = StandardScaler().fit(train_feature_reduced)
train_feature_scaled = scaler.transform(train_feature_reduced)

In [None]:
# Split the data
# Hold out 20% of the rows for validation; the fixed seed makes the split
# reproducible. train_test_split returns, in order:
# train features, validation features, train labels, validation labels.
_split = train_test_split(
    train_feature_scaled,
    train_label_encode,
    test_size=0.2,
    random_state=10000,
)

# Features become float32 tensors and labels become int64 (long) tensors.
feature_train, feature_val = (
    torch.tensor(part, dtype=torch.float32) for part in _split[:2]
)
label_train, label_val = (
    torch.tensor(part, dtype=torch.long) for part in _split[2:]
)

In [None]:
# Hyperparameters
input_dim = feature_train.shape[1]             # number of features
hidden_dim = 64                                # number of neurons in the hidden layer
# Size the output layer from the encoder's full class list instead of from
# the labels that happen to land in the training split: if a class ended up
# only in the validation split, counting unique training labels would make
# the output layer too small for the encoded class indices.
output_dim = len(encoder.classes_)             # number of classes
learning_rate = 0.01
epochs = 30
batch_size = 32

In [None]:
# Prepare data for pytorch
# Pair each feature row with its label so the loaders yield (X, y) batches.
train_dataset = TensorDataset(feature_train, label_train)
val_dataset = TensorDataset(feature_val, label_val)

# Use the batch_size hyperparameter rather than a hard-coded 32 so changing
# it in the hyperparameter cell actually takes effect. Training batches are
# reshuffled every epoch; validation order is left untouched.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [None]:
class SimpleNN(nn.Module):
    """Feed-forward classifier with a single hidden layer.

    The network is Linear -> ReLU -> Linear and returns the raw output of
    the last linear layer (no softmax is applied).
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        """Create the layers.

        Args:
            input_dim: Number of input features per sample.
            hidden_dim: Number of units in the hidden layer.
            output_dim: Number of output units.
        """
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Return the output-layer activations for the input batch ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)


In [None]:
# model, loss, and optimizer
# Cross-entropy is applied to the network's raw outputs and the integer
# class labels; Adam updates every model parameter with the configured
# learning rate.
criterion = nn.CrossEntropyLoss()
model = SimpleNN(
    input_dim=input_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
)
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

In [None]:
# Train for a fixed number of epochs; after every epoch evaluate on the
# validation split and report progress.
for epoch in range(epochs):
    # ---- Training pass ----
    model.train()
    train_loss = 0.0
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()               # clear gradients from the previous step
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    # Mean of the per-batch losses; max() guards against an empty loader.
    avg_train_loss = train_loss / max(len(train_loader), 1)

    # Validation step
    model.eval()
    val_predictions = []
    val_targets = []

    with torch.no_grad():                   # no gradients needed for evaluation
        for batch_X, batch_y in val_loader:
            outputs = model(batch_X)
            # Index of the largest output per row is the predicted class.
            _, predicted = torch.max(outputs, 1)
            val_predictions.extend(predicted.cpu().numpy())
            val_targets.extend(batch_y.cpu().numpy())

    # Score every epoch so the accumulated loss and the per-epoch validation
    # pass are actually reported instead of being discarded.
    weighted_f1_nn = f1_score(val_targets, val_predictions, average='weighted')
    print(
        f"epoch {epoch + 1}/{epochs} - "
        f"train loss: {avg_train_loss:.4f} - val f1: {weighted_f1_nn:.4f}"
    )

# Final score: the weighted F1 of the last epoch.
print("nn f1 score:", weighted_f1_nn)

In [None]:
# Prepare the test set with the same steps used for the training features:
# drop the identifier, convert SEASON_ID, remove the low-importance columns
# and apply the scaler fitted on the training features.
# NOTE(review): unlike the training data, rows with missing values are not
# dropped here; confirm test.csv contains no NaNs.
test_data_clean = test_data.drop(columns=['ID'])
test_data_clean['SEASON_ID'] = test_data_clean['SEASON_ID'].astype(str).str[:4].astype(int)
test_data_clean_reduced = test_data_clean.drop(columns=low_importance_features)
# Put the columns in exactly the order the scaler was fitted on, so each
# column is scaled with its own statistics.
test_data_clean_reduced = test_data_clean_reduced[train_feature_reduced.columns]
test_features_scaled = scaler.transform(test_data_clean_reduced)
test_features_tensor = torch.tensor(test_features_scaled, dtype=torch.float32)

# Predict using the trained model
model.eval()
with torch.no_grad():
    test_outputs = model(test_features_tensor)
    # The index of the largest output per row is the predicted class index.
    _, test_pred = torch.max(test_outputs, 1)

# Map class indices back to the original position labels.
test_pred_labels = encoder.inverse_transform(test_pred.numpy())

result = pd.DataFrame({
    'ID': test_data['ID'],
    'position': test_pred_labels
})

# Use one variable for the path so the confirmation message always names the
# file that was actually written.
output_path = 'nn4.csv'
result.to_csv(output_path, index=False)
print(f"Predictions saved to '{output_path}'")