<a href="https://colab.research.google.com/github/mabataki2/AI-Class/blob/main/Week3/weather_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, r2_score

# 1) 데이터 준비
df = pd.read_csv("/content/drive/MyDrive/weather.csv").dropna()

# 불필요한 열 제거
df = df.drop(columns=["RISK_MM", "Date", "Location", "RainTomorrow"])
df

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
0,8.0,24.3,0.0,3.4,6.3,NW,30.0,SW,NW,6.0,20,68,29,1019.7,1015.0,7,7,14.4,23.6,No
1,14.0,26.9,3.6,4.4,9.7,ENE,39.0,E,W,4.0,17,80,36,1012.4,1008.4,5,3,17.5,25.7,Yes
2,13.7,23.4,3.6,5.8,3.3,NW,85.0,N,NNE,6.0,6,82,69,1009.5,1007.2,8,7,15.4,20.2,Yes
3,13.3,15.5,39.8,7.2,9.1,NW,54.0,WNW,W,30.0,24,62,56,1005.5,1007.0,2,7,13.5,14.1,Yes
4,7.6,16.1,2.8,5.6,10.6,SSE,50.0,SSE,ESE,20.0,28,68,49,1018.3,1018.5,7,7,11.1,15.4,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361,9.0,30.7,0.0,7.6,12.1,NNW,76.0,SSE,NW,7.0,50,38,15,1016.1,1010.8,1,3,20.4,30.0,No
362,7.1,28.4,0.0,11.6,12.7,N,48.0,NNW,NNW,2.0,19,45,22,1020.0,1016.9,0,1,17.2,28.2,No
363,12.5,19.9,0.0,8.4,5.3,ESE,43.0,ENE,ENE,11.0,9,63,47,1024.0,1022.8,3,2,14.5,18.3,No
364,12.5,26.9,0.0,5.0,7.1,NW,46.0,SSW,WNW,6.0,28,69,39,1021.0,1016.2,6,7,15.8,25.9,No


In [14]:
# 결측치 확인
df.isnull().sum()

Unnamed: 0,0
MinTemp,0
MaxTemp,0
Rainfall,0
Evaporation,0
Sunshine,0
WindGustDir,0
WindGustSpeed,0
WindDir9am,0
WindDir3pm,0
WindSpeed9am,0


In [15]:
# 컬럼명 확인
df.columns

Index(['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
       'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm', 'RainToday'],
      dtype='object')

In [16]:
# 모든 레이블을 숫자로 변형
from sklearn.preprocessing import LabelEncoder

label_encoders = {}
for column in df.columns:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])

In [8]:
# 모든 컬럼 값이 숫자로 변경되었는지 확인
df

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,90,111,0,15,43,7,8,12,7,2,...,29,15,93,82,7,7,95,129,0,1
1,138,130,16,20,75,1,12,0,13,1,...,41,22,37,27,5,3,123,145,1,1
2,135,106,16,27,24,7,33,3,5,2,...,43,55,20,22,8,7,103,99,1,1
3,133,42,43,34,69,7,20,14,13,15,...,23,42,7,21,2,7,88,50,1,1
4,86,46,13,26,84,10,18,10,2,10,...,29,35,84,113,7,7,68,60,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361,100,152,0,36,99,6,29,10,7,3,...,1,2,64,46,1,3,148,167,0,0
362,81,141,0,51,104,3,17,6,6,0,...,6,8,95,98,0,1,120,160,0,0
363,125,78,0,40,36,2,14,1,1,5,...,24,33,131,147,3,2,96,81,0,0
364,125,130,0,23,50,7,16,11,14,2,...,30,25,103,91,6,7,107,146,0,0


In [18]:
# 입력과 출력 분리
X = df.drop("Rainfall", axis=1)  # 특징 데이터
y = df["Rainfall"]
print("입력 데이터 크기:", X.shape)
print("출력 데이터 크기:", y.shape)


입력 데이터 크기: (328, 19)
출력 데이터 크기: (328,)


In [20]:

# -----------------------------
# 3) 훈련 데이터 / 테스트 데이터 분리
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=12, shuffle=True
)
print("훈련 데이터 크기:", X_train.shape, y_train.shape)
print("테스트 데이터 크기:", X_test.shape, y_test.shape)

훈련 데이터 크기: (262, 19) (262,)
테스트 데이터 크기: (66, 19) (66,)


In [21]:

# -----------------------------
# 4) 모델 객체 생성
# -----------------------------
# 의사결정나무 회귀 모델
dt = DecisionTreeRegressor(random_state=12)

# 랜덤포레스트 회귀 모델
rf = RandomForestRegressor(random_state=12)

# 선형 회귀 모델
lr = LinearRegression()

In [22]:
# 각 모델 객체 훈련
dt.fit(X_train, y_train)
rf.fit(X_train, y_train)
lr.fit(X_train, y_train)

In [23]:
# 각 모델 예측
dt_y_pred = dt.predict(X_test)
rf_y_pred = rf.predict(X_test)
lr_y_pred = lr.predict(X_test)

In [24]:
# 각 모델 성능 평가
dt_mse = mean_squared_error(y_test, dt_y_pred)
dt_r2 = r2_score(y_test, dt_y_pred)

rf_mse = mean_squared_error(y_test, rf_y_pred)
rf_r2 = r2_score(y_test, rf_y_pred)

lr_mse = mean_squared_error(y_test, lr_y_pred)
lr_r2 = r2_score(y_test, lr_y_pred)

In [26]:
# -----------------------------
# 8) MSE(제곱근 평균 제곱 오차) 계산
# -----------------------------
print("\n[MSE 결과]")
print("Decision Tree :", dt_mse)
print("Random Forest :", rf_mse)
print("Linear Regression :", lr_mse)

# -----------------------------
# 9) R-squared(결정 계수) 계산
# -----------------------------
print("\n[R² 결과]")
print("Decision Tree :", dt_r2)
print("Random Forest :", rf_r2)
print("Linear Regression :", lr_r2)


[MSE 결과]
Decision Tree : 49.03030303030303
Random Forest : 28.148919696969696
Linear Regression : 32.76514327205461

[R² 결과]
Decision Tree : 0.46240976228592146
Random Forest : 0.6913626166671701
Linear Regression : 0.6407482705241846
