# HW 5-1 Classification
20220041

In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [30]:
# Data loading & selection
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

train_feature = train_data.drop(columns=['position'])
train_label = train_data['position']

extra_cols = set(train_feature.columns) - set(test_data.columns)
train_data = train_data.drop(columns=extra_cols)

print(train_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 22 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   SEASON_ID   15000 non-null  object 
 1   TEAM_ID     15000 non-null  int64  
 2   PLAYER_AGE  15000 non-null  float64
 3   FGM         15000 non-null  int64  
 4   FGA         15000 non-null  int64  
 5   FG_PCT      14994 non-null  float64
 6   FG3M        12057 non-null  float64
 7   FG3A        12057 non-null  float64
 8   FG3_PCT     11955 non-null  float64
 9   FTM         15000 non-null  int64  
 10  FTA         15000 non-null  int64  
 11  FT_PCT      14952 non-null  float64
 12  OREB        12931 non-null  float64
 13  DREB        12931 non-null  float64
 14  REB         14662 non-null  float64
 15  AST         15000 non-null  int64  
 16  STL         12931 non-null  float64
 17  BLK         12931 non-null  float64
 18  TOV         12399 non-null  float64
 19  PF          15000 non-nul

In [31]:
# Ignore tuples with missing values
train_data_clean = train_data.dropna()
train_feature_clean = train_data_clean.drop(columns=['position'])
train_label_clean = train_data_clean['position']

print(train_feature_clean.info())

<class 'pandas.core.frame.DataFrame'>
Index: 11862 entries, 0 to 14999
Data columns (total 21 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   SEASON_ID   11862 non-null  object 
 1   TEAM_ID     11862 non-null  int64  
 2   PLAYER_AGE  11862 non-null  float64
 3   FGM         11862 non-null  int64  
 4   FGA         11862 non-null  int64  
 5   FG_PCT      11862 non-null  float64
 6   FG3M        11862 non-null  float64
 7   FG3A        11862 non-null  float64
 8   FG3_PCT     11862 non-null  float64
 9   FTM         11862 non-null  int64  
 10  FTA         11862 non-null  int64  
 11  FT_PCT      11862 non-null  float64
 12  OREB        11862 non-null  float64
 13  DREB        11862 non-null  float64
 14  REB         11862 non-null  float64
 15  AST         11862 non-null  int64  
 16  STL         11862 non-null  float64
 17  BLK         11862 non-null  float64
 18  TOV         11862 non-null  float64
 19  PF          11862 non-null  in

In [32]:
# Data transformation
train_feature_clean['SEASON_ID'] = train_feature_clean['SEASON_ID'].str[:4].astype(int)

encoder = LabelEncoder()
train_label_encode = encoder.fit_transform(train_label_clean)

print(train_feature_clean.info())

<class 'pandas.core.frame.DataFrame'>
Index: 11862 entries, 0 to 14999
Data columns (total 21 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   SEASON_ID   11862 non-null  int64  
 1   TEAM_ID     11862 non-null  int64  
 2   PLAYER_AGE  11862 non-null  float64
 3   FGM         11862 non-null  int64  
 4   FGA         11862 non-null  int64  
 5   FG_PCT      11862 non-null  float64
 6   FG3M        11862 non-null  float64
 7   FG3A        11862 non-null  float64
 8   FG3_PCT     11862 non-null  float64
 9   FTM         11862 non-null  int64  
 10  FTA         11862 non-null  int64  
 11  FT_PCT      11862 non-null  float64
 12  OREB        11862 non-null  float64
 13  DREB        11862 non-null  float64
 14  REB         11862 non-null  float64
 15  AST         11862 non-null  int64  
 16  STL         11862 non-null  float64
 17  BLK         11862 non-null  float64
 18  TOV         11862 non-null  float64
 19  PF          11862 non-null  in

In [33]:
# Compute feature importance using Decision Tree
dt = DecisionTreeClassifier(random_state=10000)
dt.fit(train_feature_clean, train_label_encode)
feature_importances = dt.feature_importances_

low_importance_features = [col for col, importance in zip(train_feature_clean.columns, feature_importances) if importance < 0.03]

print("features with low importance:", low_importance_features)

# Reduce feature with low importance
train_feature_reduced = train_feature_clean.drop(columns=low_importance_features)

print(train_feature_reduced.info())

features with low importance: ['TEAM_ID', 'FGM', 'FG3M', 'FG3_PCT', 'FTM', 'FTA', 'PTS']
<class 'pandas.core.frame.DataFrame'>
Index: 11862 entries, 0 to 14999
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   SEASON_ID   11862 non-null  int64  
 1   PLAYER_AGE  11862 non-null  float64
 2   FGA         11862 non-null  int64  
 3   FG_PCT      11862 non-null  float64
 4   FG3A        11862 non-null  float64
 5   FT_PCT      11862 non-null  float64
 6   OREB        11862 non-null  float64
 7   DREB        11862 non-null  float64
 8   REB         11862 non-null  float64
 9   AST         11862 non-null  int64  
 10  STL         11862 non-null  float64
 11  BLK         11862 non-null  float64
 12  TOV         11862 non-null  float64
 13  PF          11862 non-null  int64  
dtypes: float64(10), int64(4)
memory usage: 1.4 MB
None


In [34]:
# Standardize features
scaler = StandardScaler()
train_feature_scaled = scaler.fit_transform(train_feature_reduced)

In [35]:
# Split the data
feature_train, feature_val, label_train, label_val = train_test_split(
    train_feature_scaled,
    train_label_encode,
    test_size=0.2,
    random_state=10000
)

In [36]:
# 1. XGBoost
xgb = XGBClassifier(
    objective='multi:softmax',  # multiclass classification
    num_class=len(set(label_train)),  # number of classes
    random_state=50
)

xgb.fit(feature_train, label_train)

label_pred_xgb = xgb.predict(feature_val)

f1_xgb = f1_score(label_val, label_pred_xgb, average='weighted')
print("1. xgb f1 score:", f1_xgb)

1. xgb f1 score: 0.6388963922500461


In [37]:
# Hyperparameter tuning for xgb
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [50, 100, 200]
}

grid_search = GridSearchCV(
    XGBClassifier(objective='multi:softmax', num_class=len(set(label_train)), random_state=50),
    param_grid,
    scoring='f1_weighted',
    cv=3,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(feature_train, label_train)
print("best params for xgb:", grid_search.best_params_)

# Prediction with best xgb
best_xgb = grid_search.best_estimator_
label_pred_best_xgb = best_xgb.predict(feature_val)
f1_best_xgb = f1_score(label_val, label_pred_best_xgb, average='weighted')

print("1+. best xgb f1 score:", f1_best_xgb)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
best params for xgb: {'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 100}
1+. best xgb f1 score: 0.6484065377377887


In [38]:
# 2. Random forest
from sklearn.ensemble import StackingClassifier, RandomForestClassifier

rf = RandomForestClassifier(random_state=50)
rf.fit(feature_train, label_train)

label_pred_rf = rf.predict(feature_val)
f1_rf = f1_score(label_val, label_pred_rf, average='weighted')
print("2. rf f1 score:", f1_rf)

2. rf f1 score: 0.6254862275851781


In [39]:
# Hyperparameter tuning for rf
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30]
}

grid_search_rf = GridSearchCV(
    RandomForestClassifier(random_state=10000),
    param_grid_rf,
    scoring='f1_weighted',  
    cv=3,  
    n_jobs=-1
)

grid_search_rf.fit(feature_train, label_train)

print("best params for rf:", grid_search_rf.best_params_)

# Prediction with best rf
best_rf = grid_search_rf.best_estimator_
label_pred_best_rf = best_rf.predict(feature_val)

f1_best_rf = f1_score(label_val, label_pred_best_rf, average='weighted')
print("2+. best rf f1 score:", f1_best_rf)

best params for rf: {'max_depth': 30, 'n_estimators': 200}
2+. best rf f1 score: 0.6273757503726072


In [40]:
# Prediction on the test data
test_data_clean = test_data.drop(columns=['ID'])
test_data_clean['SEASON_ID'] = test_data_clean['SEASON_ID'].astype(str).str[:4].astype(int)
test_data_clean_reduced = test_data_clean.drop(columns=low_importance_features)
test_features_scaled = scaler.transform(test_data_clean_reduced)  # Scale test data

test_pred = best_xgb.predict(test_features_scaled)

test_pred_labels = encoder.inverse_transform(test_pred)

result = pd.DataFrame({
    'ID': test_data['ID'],
    'position': test_pred_labels
})

result.to_csv('xgb6.csv', index=False)
print("Predictions saved to 'xgb.csv'")

Predictions saved to 'xgb.csv'
