<a href="https://colab.research.google.com/github/mint-aguccim/Machine_Learning_Programming/blob/main/3%EC%A3%BC%EC%B0%A8/weather_Rainfall_upload.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# -----------------------------
# 1) Load the data
# -----------------------------
# The first CSV column is used as the row index.
df = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/weather.csv",
    index_col=0,
)
df


Unnamed: 0_level_0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11/1/2007,Canberra,8.0,24.3,0.0,3.4,6.3,NW,30.0,SW,NW,...,29,1019.7,1015.0,7,7,14.4,23.6,No,3.6,Yes
11/2/2007,Canberra,14.0,26.9,3.6,4.4,9.7,ENE,39.0,E,W,...,36,1012.4,1008.4,5,3,17.5,25.7,Yes,3.6,Yes
11/3/2007,Canberra,13.7,23.4,3.6,5.8,3.3,NW,85.0,N,NNE,...,69,1009.5,1007.2,8,7,15.4,20.2,Yes,39.8,Yes
11/4/2007,Canberra,13.3,15.5,39.8,7.2,9.1,NW,54.0,WNW,W,...,56,1005.5,1007.0,2,7,13.5,14.1,Yes,2.8,Yes
11/5/2007,Canberra,7.6,16.1,2.8,5.6,10.6,SSE,50.0,SSE,ESE,...,49,1018.3,1018.5,7,7,11.1,15.4,Yes,0.0,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10/27/2008,Canberra,9.0,30.7,0.0,7.6,12.1,NNW,76.0,SSE,NW,...,15,1016.1,1010.8,1,3,20.4,30.0,No,0.0,No
10/28/2008,Canberra,7.1,28.4,0.0,11.6,12.7,N,48.0,NNW,NNW,...,22,1020.0,1016.9,0,1,17.2,28.2,No,0.0,No
10/29/2008,Canberra,12.5,19.9,0.0,8.4,5.3,ESE,43.0,ENE,ENE,...,47,1024.0,1022.8,3,2,14.5,18.3,No,0.0,No
10/30/2008,Canberra,12.5,26.9,0.0,5.0,7.1,NW,46.0,SSW,WNW,...,39,1021.0,1016.2,6,7,15.8,25.9,No,0.0,No


In [11]:
# Optional: when a 'Date' column exists, derive a month feature from it and
# then remove the raw column.
# NOTE(review): the CSV is read with index_col=0 and the displayed frame shows
# "Date" as the index name, so this branch is probably skipped for this
# dataset — confirm.
if 'Date' in df.columns:
    try:
        # Keep the try body minimal: only the parse itself can fail here.
        parsed_dates = pd.to_datetime(df['Date'])
    except (ValueError, TypeError):
        # Unparseable dates cannot be turned into a feature; drop the column.
        df = df.drop(columns=['Date'])
    else:
        df['month'] = parsed_dates.dt.month
        # The raw column must not stay in the frame: get_dummies leaves
        # datetime64 columns untouched and the estimators fitted later
        # cannot convert them to float.
        df = df.drop(columns=['Date'])

In [12]:
# -----------------------------
# 2) Missing values & non-numeric entries
# -----------------------------
# Rainfall may hold the string 'T' (trace amount). It is counted as 0.0
# (a modelling choice); anything still non-numeric is coerced to NaN.
if 'Rainfall' in df.columns:
    df['Rainfall'] = pd.to_numeric(
        df['Rainfall'].replace('T', 0.0),
        errors='coerce',
    )

In [13]:
# Only Rainfall has been cleaned so far; the remaining columns are handled
# later by get_dummies.
# Drop every row that has at least one missing value (replace with an
# imputation strategy if preferred).
df = df.dropna(axis=0, how='any')

In [14]:
# -----------------------------
# 3) Split inputs (X) and target (y)
# -----------------------------
target = "Rainfall"
# Fail early, listing the available columns, when the target is absent.
if target not in df:
    raise ValueError(
        f"타깃 컬럼 '{target}' 이 데이터프레임에 없습니다. 컬럼 목록: {df.columns.tolist()}"
    )


In [15]:
# Build the feature frame: everything except the target, and except
# 'RainTomorrow' when present (it is not used, following the example).
# NOTE(review): in the rows displayed earlier, RainToday is "Yes" exactly
# where Rainfall > 0, so it may leak the target into X — confirm whether it
# should be excluded as well.
X = df.drop(columns=[target])
if 'RainTomorrow' in X.columns:
    X = X.drop(columns=["RainTomorrow"])
y = df[target].astype(float)

In [16]:
# One-hot encode the categorical columns; the first level of each is dropped.
X = pd.get_dummies(data=X, drop_first=True)

In [17]:
# Safety net: force any column that is still object-typed to numeric;
# values that cannot be converted become NaN.
for col, dtype in X.dtypes.items():
    if dtype != 'object':
        continue
    X[col] = pd.to_numeric(X[col], errors='coerce')

In [18]:
# Keep only the rows that are complete in both X and y, removing any NaN
# introduced by the numeric coercion.
mask = X.notnull().all(axis=1) & y.notnull()
X = X.loc[mask]
y = y.loc[mask]

In [19]:
# -----------------------------
# 4) Train / test split
# -----------------------------
# 20% of the rows are held out; the seed is fixed so the split is repeatable.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

In [20]:
# -----------------------------
# 5) Model definitions
# -----------------------------
# The linear model is fed standardized inputs through a pipeline; the
# tree-based models receive the features as they are.
lr = make_pipeline(StandardScaler(), LinearRegression())
rf = RandomForestRegressor(
    n_estimators=200,
    n_jobs=-1,
    random_state=42,
)
dt = DecisionTreeRegressor(random_state=42)


In [21]:
# -----------------------------
# 6) Model training
# -----------------------------
for estimator in (dt, rf):
    estimator.fit(X_train, y_train)
# Kept as the final bare expression so the notebook still displays the
# fitted pipeline, as before.
lr.fit(X_train, y_train)

In [22]:
# Predict on the held-out set with each fitted model.
dt_pred, rf_pred, lr_pred = (
    fitted.predict(X_test) for fitted in (dt, rf, lr)
)

In [23]:
# -----------------------------
# 7) Evaluation helper and its use
# -----------------------------
def evaluate_model(y_true, y_pred, name="Model"):
    """Print MAE, RMSE and R2 for one set of predictions.

    Args:
        y_true: Observed target values; converted to a float array.
        y_pred: Predicted values; converted to a float array.
        name: Label shown in the header line of the report.

    Returns:
        None. The metrics are only printed, followed by a blank line.
    """
    actual = np.asarray(y_true).astype(float)
    predicted = np.asarray(y_pred).astype(float)

    mae = mean_absolute_error(actual, predicted)
    # RMSE is taken as the square root of the MSE, which sidesteps
    # differences between scikit-learn versions.
    rmse = np.sqrt(mean_squared_error(actual, predicted))
    r2 = r2_score(actual, predicted)

    print(
        f"--- {name} ---",
        f"MAE : {mae:.6f}",
        f"RMSE: {rmse:.6f}",
        f"R2  : {r2:.6f}",
        "",
        sep="\n",
    )

# Report the three models in the same order: tree, forest, linear.
for label, predictions in (
    ("Decision Tree Regressor", dt_pred),
    ("Random Forest Regressor", rf_pred),
    ("Linear Regression", lr_pred),
):
    evaluate_model(y_test, predictions, label)

--- Decision Tree Regressor ---
MAE : 2.072727
RMSE: 5.414795
R2  : 0.183006

--- Random Forest Regressor ---
MAE : 1.740773
RMSE: 4.623022
R2  : 0.404465

--- Linear Regression ---
MAE : 2.384068
RMSE: 5.003486
R2  : 0.302410

