<a href="https://colab.research.google.com/github/noeyhey/HUFS_StatisticalModeling/blob/main/StatisticalModeling_0402.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 비정형데이터마이닝
#### 250402 5주차 실습
#### 자전거 수요 (count) 예측

#### Kaggle API 설정 (Colab에서만 실행 필요)

In [None]:
!pip install -q kaggle

##### Kaggle API 설정 (kaggle.json 업로드)

In [None]:
# Upload kaggle.json through the Colab file-upload widget.
# NOTE(review): the recorded output of this cell is
# "TypeError: 'NoneType' object is not subscriptable" — presumably the upload
# was run without an interactive Colab front end or was cancelled; confirm.
from google.colab import files
files.upload() # upload the kaggle.json file here

# Configure the Kaggle API: copy the uploaded kaggle.json into ~/.kaggle and
# restrict it to owner read/write (mode 600).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

TypeError: 'NoneType' object is not subscriptable

##### Join Competition (https://www.kaggle.com/c/bike-sharing-demand)
##### 데이터 다운로드

In [None]:
!kaggle competitions download -c bike-sharing-demand
!unzip bike-sharing-demand.zip

#### 라이브러리 불러오기

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.decomposition import PCA
from sklearn.linear_model import PoissonRegressor
from sklearn.metrics import mean_squared_error, r2_score

#### 데이터셋 불러오기

In [None]:
# Load the training data, parsing the 'datetime' column into timestamps,
# then print a quick overview: shape, first rows, schema, null counts, and
# summary statistics.
df = pd.read_csv('train.csv', parse_dates=['datetime'])
print(f"Shape: {df.shape}")
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()
print(df.isnull().sum())
print(df.describe())

##### 데이터셋 탐색 (1) - datetime에서 시간 파생 변수(hour, dayofweek, month, year) 생성

In [None]:
# Derive calendar features from the parsed 'datetime' column. Each name below
# is both the `.dt` accessor attribute that is read and the column that is
# written, in the same order as before.
for part in ('hour', 'dayofweek', 'month', 'year'):
    df[part] = getattr(df['datetime'].dt, part)

##### 데이터셋 탐색 (2) - 상관분석

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns.
# The frame still contains the datetime64 'datetime' column; recent pandas
# versions no longer drop non-numeric columns from corr() silently, so the
# numeric columns are selected explicitly before computing the matrix.
numeric_df = df.select_dtypes(include='number')

plt.figure(figsize=(10, 8))
sns.heatmap(numeric_df.corr(), annot=True, fmt=".2f", cmap='coolwarm')
plt.title("Correlation Matrix")
plt.show()

##### 데이터셋 탐색 (3) - 사용할 feature 선택, 스케일링

In [None]:
# Predictor columns are everything except the raw timestamp, the target
# ('count'), and the 'casual' / 'registered' columns.
excluded_columns = ['datetime', 'casual', 'registered', 'count']
features = df.drop(columns=excluded_columns).columns

# Design matrix and target.
X, y = df[features], df['count']

# Standardize the predictors. The scaler is fitted on the full X and the
# scaled array is what the later cells consume.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

##### 데이터셋 탐색 (4) - VIF 계산

In [None]:
# Variance inflation factor of every predictor, computed column by column on
# the standardized design matrix and reported next to the feature name.
# (The original comprehension read `for i inrange(...)`, a SyntaxError.)
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
vif_data["VIF"] = [
    variance_inflation_factor(X_scaled, i)
    for i in range(X_scaled.shape[1])
]
print(vif_data)

#### 포아송회귀 모델링

In [None]:
# Train/test split: hold out 20% of the rows, with a fixed seed so the split
# is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)

# Poisson regression model; fit() returns the estimator, so construction and
# fitting are chained.
model = PoissonRegressor(alpha=1e-4, max_iter=1000).fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred = model.predict(X_test)

# Evaluation on the held-out rows.
print("[전체 피처 사용]")
for label, metric in (("MSE:", mean_squared_error), ("R² Score:", r2_score)):
    print(label, metric(y_test, y_pred))

#### PCA + 포아송회귀 모델링

In [None]:
# PCA 전체 컴포넌트 적용
pca_full = PCA()
pca_full.fit(X_scaled)

# 설명된 분산 비율
explained_variance = pca_full.explained_variance_ratio_
cumulative_variance = explained_variance.cumsum()

# Scree plot
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(explained_variance) + 1), explained_variance, marker='o', label='Individual')
plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='s', linestyle='--', label='Cumulative')
plt.xlabel('Number of Principal Components')
plt.ylabel('Explained Variance Ratio')
plt.title('Scree Plot for PCA')
plt.axhline(y=0.9, color='r', linestyle='--', label='90% Variance Threshold')plt.legend()
plt.grid(True)
plt.show()

In [None]:
# PCA keeping only 7 principal components of the standardized predictors.
pca = PCA(n_components=7)
X_pca = pca.fit_transform(X_scaled)

# Train/test split on the PCA scores, with the same test size and seed as the
# full-feature model. This rebinds X_train / X_test / y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(
    X_pca,
    y,
    test_size=0.2,
    random_state=42,
)

# Poisson regression with the same hyperparameters as the full-feature model.
model = PoissonRegressor(alpha=1e-4, max_iter=1000).fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred = model.predict(X_test)

# Evaluation on the held-out rows.
for label, metric in (("MSE:", mean_squared_error), ("R² Score:", r2_score)):
    print(label, metric(y_test, y_pred))