In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, VotingRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

In [2]:
# Read the training and test tables from CSV files in the working directory.
TRAIN_CSV_PATH = './rainfall_train.csv'
TEST_CSV_PATH = './Test_rainfall_test_test.csv'

train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_CSV_PATH, TEST_CSV_PATH)
)

In [3]:
# (test-file label, train-file label) pairs.  Renaming the test frame with this
# mapping lets both frames be addressed through the same column labels.
column_mapping = dict((
    ('rainfall_test.fc_year', 'rainfall_train.fc_year'),
    ('rainfall_test.fc_month', 'rainfall_train.fc_month'),
    ('rainfall_test.fc_day', 'rainfall_train.fc_day'),
    ('rainfall_test.fc_hour', 'rainfall_train.fc_hour'),
    ('rainfall_test.ef_year', 'rainfall_train.ef_year'),
    ('rainfall_test.ef_month', 'rainfall_train.ef_month'),
    ('rainfall_test.ef_day', 'rainfall_train.ef_day'),
    ('rainfall_test.ef_hour', 'rainfall_train.ef_hour'),
    ('rainfall_test.dh', 'rainfall_train.dh'),
    ('rainfall_test.stn4contest', 'rainfall_train.stn4contest'),
    ('rainfall_test.v01', 'rainfall_train.v01'),
    ('rainfall_test.v02', 'rainfall_train.v02'),
    ('rainfall_test.v03', 'rainfall_train.v03'),
    ('rainfall_test.v04', 'rainfall_train.v04'),
    ('rainfall_test.v05', 'rainfall_train.v05'),
    ('rainfall_test.v06', 'rainfall_train.v06'),
    ('rainfall_test.v07', 'rainfall_train.v07'),
    ('rainfall_test.v08', 'rainfall_train.v08'),
    ('rainfall_test.v09', 'rainfall_train.v09'),
))

# Relabel the test columns with the train-style names from column_mapping.
# Columns that are not keys of the mapping keep their original labels.
test_data = test_data.rename(columns=column_mapping)

In [4]:
# Column labels used throughout the notebook, grouped by name family.  All of
# them carry the 'rainfall_train.' prefix; the test frame is renamed to match.
fc_year, fc_month, fc_day, fc_hour = (
    'rainfall_train.fc_year',
    'rainfall_train.fc_month',
    'rainfall_train.fc_day',
    'rainfall_train.fc_hour',
)
stn4contest, dh = 'rainfall_train.stn4contest', 'rainfall_train.dh'
ef_year, ef_month, ef_day, ef_hour = (
    'rainfall_train.ef_year',
    'rainfall_train.ef_month',
    'rainfall_train.ef_day',
    'rainfall_train.ef_hour',
)
v01, v02, v03, v04, v05, v06, v07, v08, v09 = (
    'rainfall_train.v01',
    'rainfall_train.v02',
    'rainfall_train.v03',
    'rainfall_train.v04',
    'rainfall_train.v05',
    'rainfall_train.v06',
    'rainfall_train.v07',
    'rainfall_train.v08',
    'rainfall_train.v09',
)
# Target column and its class label column.
vv, class_interval = 'rainfall_train.vv', 'rainfall_train.class_interval'

In [5]:
# Show the dtype of every column in the training frame.
print(train_data.dtypes)

Unnamed: 0                         int64
rainfall_train.fc_year            object
rainfall_train.fc_month            int64
rainfall_train.fc_day              int64
rainfall_train.fc_hour             int64
rainfall_train.stn4contest        object
rainfall_train.dh                  int64
rainfall_train.ef_year            object
rainfall_train.ef_month            int64
rainfall_train.ef_day              int64
rainfall_train.ef_hour             int64
rainfall_train.v01                 int64
rainfall_train.v02                 int64
rainfall_train.v03                 int64
rainfall_train.v04                 int64
rainfall_train.v05                 int64
rainfall_train.v06                 int64
rainfall_train.v07                 int64
rainfall_train.v08                 int64
rainfall_train.v09                 int64
rainfall_train.vv                float64
rainfall_train.class_interval      int64
dtype: object


In [6]:
# Show the dtype of every column in the (renamed) test frame.
print(test_data.dtypes)

rainfall_train.fc_year           object
rainfall_train.fc_month           int64
rainfall_train.fc_day             int64
rainfall_train.fc_hour            int64
rainfall_train.stn4contest       object
rainfall_train.dh                 int64
rainfall_train.ef_year           object
rainfall_train.ef_month           int64
rainfall_train.ef_day             int64
rainfall_train.ef_hour            int64
rainfall_train.v01                int64
rainfall_train.v02                int64
rainfall_train.v03                int64
rainfall_train.v04                int64
rainfall_train.v05                int64
rainfall_train.v06                int64
rainfall_train.v07                int64
rainfall_train.v08                int64
rainfall_train.v09                int64
rainfall_test.class_interval    float64
dtype: object


In [7]:
# Count missing values per column of the training frame (shown as cell output).
train_data.isnull().sum()

Unnamed: 0                       0
rainfall_train.fc_year           0
rainfall_train.fc_month          0
rainfall_train.fc_day            0
rainfall_train.fc_hour           0
rainfall_train.stn4contest       0
rainfall_train.dh                0
rainfall_train.ef_year           0
rainfall_train.ef_month          0
rainfall_train.ef_day            0
rainfall_train.ef_hour           0
rainfall_train.v01               0
rainfall_train.v02               0
rainfall_train.v03               0
rainfall_train.v04               0
rainfall_train.v05               0
rainfall_train.v06               0
rainfall_train.v07               0
rainfall_train.v08               0
rainfall_train.v09               0
rainfall_train.vv                0
rainfall_train.class_interval    0
dtype: int64

In [8]:
# Count missing values per column of the test frame (shown as cell output).
test_data.isnull().sum()

rainfall_train.fc_year               0
rainfall_train.fc_month              0
rainfall_train.fc_day                0
rainfall_train.fc_hour               0
rainfall_train.stn4contest           0
rainfall_train.dh                    0
rainfall_train.ef_year               0
rainfall_train.ef_month              0
rainfall_train.ef_day                0
rainfall_train.ef_hour               0
rainfall_train.v01                   0
rainfall_train.v02                   0
rainfall_train.v03                   0
rainfall_train.v04                   0
rainfall_train.v05                   0
rainfall_train.v06                   0
rainfall_train.v07                   0
rainfall_train.v08                   0
rainfall_train.v09                   0
rainfall_test.class_interval    121870
dtype: int64

In [18]:
# Keep only rows whose fc_month and ef_month both fall in months 5-9.
# .copy() detaches each filtered frame from the unfiltered one, so the column
# assignments below write to a real frame rather than to a slice (this avoids
# pandas' SettingWithCopyWarning and its ambiguous write semantics).
target_months = [5, 6, 7, 8, 9]

train_data = train_data[train_data[fc_month].isin(target_months)
                        & train_data[ef_month].isin(target_months)].copy()

test_data = test_data[test_data[fc_month].isin(target_months)
                      & test_data[ef_month].isin(target_months)].copy()

# Integer-encode the object-typed columns.  The category vocabulary is built
# from train and test together so that a given raw value receives the same code
# in both frames.  Encoding each frame on its own numbers the codes by that
# frame's own set of values, so the same code could stand for different raw
# values in train and test and the model would be fed inconsistent features.
for encoded_col in ('rainfall_train.fc_year',
                    'rainfall_train.ef_year',
                    'rainfall_train.stn4contest'):
    shared_dtype = (
        pd.concat([train_data[encoded_col], test_data[encoded_col]], ignore_index=True)
        .astype('category')
        .dtype
    )
    train_data[encoded_col] = train_data[encoded_col].astype(shared_dtype).cat.codes
    test_data[encoded_col] = test_data[encoded_col].astype(shared_dtype).cat.codes


In [19]:
# Preview the first rows of the filtered, encoded training frame.
train_data.head()

Unnamed: 0.1,Unnamed: 0,rainfall_train.fc_year,rainfall_train.fc_month,rainfall_train.fc_day,rainfall_train.fc_hour,rainfall_train.stn4contest,rainfall_train.dh,rainfall_train.ef_year,rainfall_train.ef_month,rainfall_train.ef_day,...,rainfall_train.v02,rainfall_train.v03,rainfall_train.v04,rainfall_train.v05,rainfall_train.v06,rainfall_train.v07,rainfall_train.v08,rainfall_train.v09,rainfall_train.vv,rainfall_train.class_interval
0,1,0,5,1,9,0,3,0,5,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,2,0,5,1,9,0,6,0,5,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,3,0,5,1,9,0,9,0,5,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,4,0,5,1,9,0,12,0,5,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,5,0,5,1,9,0,15,0,5,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [20]:
# Preview the first rows of the filtered, encoded test frame.
test_data.head()

Unnamed: 0,rainfall_train.fc_year,rainfall_train.fc_month,rainfall_train.fc_day,rainfall_train.fc_hour,rainfall_train.stn4contest,rainfall_train.dh,rainfall_train.ef_year,rainfall_train.ef_month,rainfall_train.ef_day,rainfall_train.ef_hour,...,rainfall_train.v02,rainfall_train.v03,rainfall_train.v04,rainfall_train.v05,rainfall_train.v06,rainfall_train.v07,rainfall_train.v08,rainfall_train.v09,rainfall_test.class_interval,rainfall_train.vv
0,0,5,1,9,0,3,0,5,1,12,...,0.46,0.13,0.01,0.0,0.0,0.0,0.0,0.0,,0.156
1,0,5,1,9,0,6,0,5,1,15,...,0.66,0.26,0.05,0.0,0.0,0.0,0.0,0.0,,0.269
2,0,5,1,9,0,9,0,5,1,18,...,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0175
3,0,5,1,9,0,12,0,5,1,21,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
4,0,5,1,9,0,15,0,5,2,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


In [21]:
# Midpoint of each of the nine rainfall bins, one per v01..v09 column.
# The bin edges are the ones vv_to_class_interval uses:
# 0, 0.1, 0.2, 0.5, 1, 2, 5, 10, 20, 30.
# The 5-10 bin's midpoint is 7.5; the earlier value 8.5 was not the midpoint of
# any of these bins, while every other entry already matched its bin exactly.
intervals = [0.05, 0.15, 0.35, 0.75, 1.5, 3.5, 7.5, 15.0, 25.0]

In [22]:
# Divide the v01..v09 columns by 100 in both frames so the values are expressed
# as fractions (0.0-1.0).  Running this cell twice divides twice.
probability_columns = [v01, v02, v03, v04, v05, v06, v07, v08, v09]
for frame in (train_data, test_data):
    frame[probability_columns] = frame[probability_columns] / 100

In [23]:
# Expected rainfall per test row: sum over the nine bins of
# (bin midpoint from `intervals`) * (value of the matching v01..v09 column).
# The terms are accumulated in v01 -> v09 order.
bin_probability_columns = [
    'rainfall_train.v01',
    'rainfall_train.v02',
    'rainfall_train.v03',
    'rainfall_train.v04',
    'rainfall_train.v05',
    'rainfall_train.v06',
    'rainfall_train.v07',
    'rainfall_train.v08',
    'rainfall_train.v09',
]
expected_rainfall = sum(
    midpoint * test_data[column]
    for midpoint, column in zip(intervals, bin_probability_columns)
)
test_data[vv] = expected_rainfall

In [24]:
# Model input columns, in order: the fc_* columns, stn4contest and dh,
# the ef_* columns, then v01..v09.
features = [
    *(fc_year, fc_month, fc_day, fc_hour),
    *(stn4contest, dh),
    *(ef_year, ef_month, ef_day, ef_hour),
    *(v01, v02, v03, v04, v05, v06, v07, v08, v09),
]

In [25]:
# Inputs are the `features` columns of each frame; the training target is the
# vv column of the training frame.
X_train, y_train = train_data[features], train_data[vv]
X_test = test_data[features]

In [26]:
# Standardize the inputs: the scaler's statistics are learned from the training
# matrix only and then applied unchanged to both matrices.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [27]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

# Base estimators handed to the hyper-parameter search; both start from
# 100 trees and a fixed seed.
rf, etr = (
    estimator_cls(n_estimators=100, random_state=42)
    for estimator_cls in (RandomForestRegressor, ExtraTreesRegressor)
)

# Candidate values for the randomized search; the same grid is used for both
# estimators.
param_distributions = {
    'n_estimators': [100, 200, 300],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

In [30]:
# Hyper-parameter tuning with RandomizedSearchCV.
def _tune_forest(estimator):
    """Run the randomized hyper-parameter search for one estimator.

    Samples 50 candidates from ``param_distributions``, scores each with
    3-fold cross-validation on negative MSE, and fits on
    ``X_train_scaled`` / ``y_train``.

    ``random_state`` is fixed so the sampled candidates (and therefore the
    selected best estimator) are the same on every run; without it each run
    draws a different set of 50 candidates.

    Returns the fitted ``RandomizedSearchCV`` object.
    """
    search = RandomizedSearchCV(
        estimator,
        param_distributions,
        n_iter=50,
        cv=3,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        verbose=1,
        random_state=42,
    )
    search.fit(X_train_scaled, y_train)
    return search


random_search_rf = _tune_forest(rf)
best_rf = random_search_rf.best_estimator_

random_search_etr = _tune_forest(etr)
best_etr = random_search_etr.best_estimator_

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Fitting 3 folds for each of 50 candidates, totalling 150 fits


KeyboardInterrupt: 

In [None]:
# Fit and score each tuned model on its own.
# The "test" metrics compare predictions with test_data's vv column, which this
# notebook computes from the v01..v09 columns (the test file has no vv column).
models = {'RandomForest': best_rf, 'ExtraTrees': best_etr}
results = {}

for model_name, model in models.items():
    model.fit(X_train_scaled, y_train)

    train_predictions, test_predictions = (
        model.predict(matrix) for matrix in (X_train_scaled, X_test_scaled)
    )

    # MSE and R2 on each split, computed with the same pair of metric functions.
    train_mse, train_r2 = (
        metric(y_train, train_predictions)
        for metric in (mean_squared_error, r2_score)
    )
    test_mse, test_r2 = (
        metric(test_data['rainfall_train.vv'], test_predictions)
        for metric in (mean_squared_error, r2_score)
    )

    results[model_name] = dict(zip(
        ('train_mse', 'train_r2', 'test_mse', 'test_r2'),
        (train_mse, train_r2, test_mse, test_r2),
    ))

    print(f'{model_name} Train MSE: {train_mse}')
    print(f'{model_name} Train R2 Score: {train_r2}')
    print(f'{model_name} Test MSE: {test_mse}')
    print(f'{model_name} Test R2 Score: {test_r2}')

In [None]:
# Ensemble: combine the two tuned models in a VotingRegressor.
ensemble_members = [('rf', best_rf), ('etr', best_etr)]
voting_regressor = VotingRegressor(estimators=ensemble_members)

# Fit the ensemble on the scaled training data.
voting_regressor.fit(X_train_scaled, y_train)


In [None]:
# Ensemble predictions for the scaled training and test matrices.
train_predictions, test_predictions = (
    voting_regressor.predict(matrix) for matrix in (X_train_scaled, X_test_scaled)
)


In [None]:
# Score the ensemble.  As for the individual models, the "test" reference is
# test_data's vv column.
train_mse, train_r2 = (
    metric(y_train, train_predictions)
    for metric in (mean_squared_error, r2_score)
)
test_mse, test_r2 = (
    metric(test_data['rainfall_train.vv'], test_predictions)
    for metric in (mean_squared_error, r2_score)
)

results['VotingRegressor'] = dict(zip(
    ('train_mse', 'train_r2', 'test_mse', 'test_r2'),
    (train_mse, train_r2, test_mse, test_r2),
))

# Print the stored metrics of every model, ensemble included.
for model_name, metrics in results.items():
    print(f"{model_name} 최종 모델 Train MSE: {metrics['train_mse']}")
    print(f"{model_name} 최종 모델 Train R2: {metrics['train_r2']}")
    print(f"{model_name} 최종 모델 Test MSE: {metrics['test_mse']}")
    print(f"{model_name} 최종 모델 Test R2: {metrics['test_r2']}")

# The ensemble's metrics once more, on their own lines.
print(f'Voting Regressor Train MSE: {train_mse}')
print(f'Voting Regressor Train R2 Score: {train_r2}')
print(f'Voting Regressor Test MSE: {test_mse}')
print(f'Voting Regressor Test R2 Score: {test_r2}')

In [None]:
# Print the ensemble's predictions for the training rows.
print(train_predictions)

In [None]:
# Wrap the predictions in a Series that carries test_data's own index.
# The earlier month filter can leave test_data with non-consecutive row labels,
# and assigning a Series to a DataFrame column aligns on labels.  A default
# 0..n-1 index would then put predictions on the wrong rows and leave NaN
# wherever no label matches.  test_predictions has one entry per test_data row
# (it is predicted from test_data[features]), so the lengths agree.
test_predictions_series = pd.Series(test_predictions, index=test_data.index)

In [None]:
# Predicted value -> class_interval
def vv_to_class_interval(vv):
    """Map a rainfall amount to its class index (0-9).

    Classes 0-8 are bounded above (inclusive) by 0.1, 0.2, 0.5, 1.0, 2.0, 5.0,
    10.0, 20.0 and 30.0; the result is the first class whose bound is >= vv.
    A value that satisfies none of the bounds yields 9: this covers anything
    above 30.0 and also NaN, for which every comparison is false.
    """
    upper_bounds = (0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0, 30.0)
    for class_index, upper in enumerate(upper_bounds):
        if vv <= upper:
            return class_index
    return 9

In [None]:
# Store the predicted amounts and their class indices on the test frame.
# Assigning a Series to a column aligns on index labels, so the series' index
# must match test_data's index for the values to land on the right rows.
test_data[vv] = test_predictions_series
test_data[class_interval] = test_predictions_series.apply(vv_to_class_interval)

In [None]:
# Write the predicted amounts into the vv column and print the column.
# The assignment aligns on index labels.
test_data[vv]=test_predictions_series
print(test_data[vv])

In [None]:
# Apply the class mapping to every prediction and store the result.
# The assignment aligns on index labels.
test_data[class_interval] = test_predictions_series.apply(vv_to_class_interval)

# Inspect the result.
print(test_data[class_interval])

In [None]:
# Inverse of column_mapping: renamed (train-style) label -> original test label.
reverse_column_mapping = dict(zip(column_mapping.values(), column_mapping.keys()))

# Restore the test frame's original column labels.
# The vv and class_interval columns are not part of column_mapping, so they keep
# their 'rainfall_train.'-prefixed labels.
# NOTE(review): confirm that is the intended output schema.
test_data = test_data.rename(columns=reverse_column_mapping)

In [None]:
# Write the test frame, including the prediction columns added above, to CSV
# without the index column.
test_data.to_csv('./240484.csv', index=False)
