In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split #split_dataset을 위하여 : train_test_split 함수 사용을 위하여
from sklearn.tree import DecisionTreeRegressor #결정 트리 회귀 모델을 위하여
from sklearn.ensemble import RandomForestRegressor #랜덤 포레스트 회귀 모델을 위하여
from sklearn.pipeline import Pipeline #svm 모델을 위하여 : 파이프라인
from sklearn.preprocessing import StandardScaler #svm 모델을 위하여 : standard scaler
from sklearn.svm import SVR #svm 모델을 위하여
import numpy as np #calculate_RMSE를 위하여



def sort_dataset(dataset_df):
    dataset_sorted = dataset_df.sort_values(by='year')
    return dataset_sorted

def split_dataset(dataset_df):
    X = dataset_df.drop(columns="salary", axis=1)
    y = dataset_df["salary"]*0.001
    X_train, X_test, Y_train, Y_test = train_test_split(X,y, test_size=1718)
    return X_train, X_test, Y_train, Y_test
    
    
def extract_numerical_cols(dataset_df):
    numCols = dataset_df[['age', 'G', 'PA', 'AB', 'R', 'H', '2B', '3B','HR','RBI','SB','CS','BB','HBP', 'SO','GDP','fly','war']]
    return numCols

def train_predict_decision_tree(X_train, Y_train, X_test):
    # 결정 트리 회귀 모델 생성
    regressor = DecisionTreeRegressor()
    # 모델을 X_train과 Y_train으로 훈련
    regressor.fit(X_train, Y_train)
    # X_test에 대한 예측 수행
    prediction = regressor.predict(X_test)
    # 예측 결과 반환
    return prediction

def train_predict_random_forest(X_train, Y_train, X_test):
    # 랜덤 포레스트 회귀 모델 생성
    regressor = RandomForestRegressor()
    # 모델을 X_train과 Y_train으로 훈련
    regressor.fit(X_train, Y_train)
    # X_test에 대한 예측 수행
    prediction = regressor.predict(X_test)
    # 예측 결과 반환
    return prediction


def train_predict_svm(X_train, Y_train, X_test):
    # 표준 스케일러와 SVM 모델을 포함하는 파이프라인 생성
    pipeline = Pipeline([('scaler', StandardScaler()), ('svm', SVR())]) #여기도 확인하기
    # 파이프라인을 X_train과 Y_train으로 훈련
    pipeline.fit(X_train, Y_train)
    # X_test에 대한 예측 수행
    prediction = pipeline.predict(X_test)
    # 예측 결과 반환
    return prediction

def calculate_RMSE(labels, predictions):
    return np.sqrt(np.mean((predictions-labels)**2))

if __name__=='__main__':
	#DO NOT MODIFY THIS FUNCTION UNLESS PATH TO THE CSV MUST BE CHANGED.
	data_df = pd.read_csv('2019_kbo_for_kaggle_v2.csv')
	
	sorted_df = sort_dataset(data_df)	
	X_train, X_test, Y_train, Y_test = split_dataset(sorted_df)
	
	X_train = extract_numerical_cols(X_train)
	X_test = extract_numerical_cols(X_test)

	dt_predictions = train_predict_decision_tree(X_train, Y_train, X_test)
	rf_predictions = train_predict_random_forest(X_train, Y_train, X_test)
	svm_predictions = train_predict_svm(X_train, Y_train, X_test)
	
	print ("Decision Tree Test RMSE: ", calculate_RMSE(Y_test, dt_predictions))	
	print ("Random Forest Test RMSE: ", calculate_RMSE(Y_test, rf_predictions))	
	print ("SVM Test RMSE: ", calculate_RMSE(Y_test, svm_predictions))

Decision Tree Test RMSE:  22.167152090327104
Random Forest Test RMSE:  16.87020185692062
SVM Test RMSE:  21.299459872180236
