In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# 예시 연도별 데이터 생성
np.random.seed(0)
years = pd.date_range(start='2020-01-01', periods=3, freq='Y')
data_yearly = {
    'ChildPopulation': np.random.randint(1000, 5000, size=3),
    'PopulationDensity': np.random.randint(100, 500, size=3),
    'AverageIncome': np.random.randint(30000, 100000, size=3),
    'GDP': np.random.randint(100000, 500000, size=3),
    'NurserySchools': np.random.randint(5, 20, size=3),
    'ParentEducation': np.random.randint(1, 3, size=3),
    'ObesityRate': np.random.rand(3),
    'SmokingRate': np.random.rand(3),
    'MentalIllnessRate': np.random.rand(3),
    'StressRate': np.random.rand(3),
    'HealthPerceptionRate': np.random.rand(3),
    'HealthcareWorkersPerCapita': np.random.rand(3),
    'VaccinationRate': np.random.rand(3),
    'HealthcareAccess': np.random.rand(3),
    'EnvironmentSatisfaction': np.random.rand(3),
}
df_yearly = pd.DataFrame(data_yearly, index=years)

# 연도별 데이터를 분기별로 확장
df_quarterly = df_yearly.resample('Q').ffill()

# 질병률 데이터 생성 (월별 데이터를 분기별로 변환)
disease_rate_monthly = np.random.rand(36) * 10 + 20
df_disease_rate = pd.DataFrame(disease_rate_monthly, index=pd.date_range(start='2020-01-01', periods=36, freq='M'), columns=['DiseaseRate'])
df_disease_rate_quarterly = df_disease_rate.resample('Q').mean()

# 분기별 독립변수와 종속변수 결합
df_quarterly['DiseaseRate'] = df_disease_rate_quarterly['DiseaseRate']

# 데이터 스케일링
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_quarterly.drop(columns=['DiseaseRate']))
y = df_quarterly['DiseaseRate']

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=0)

# 모델 설정 및 하이퍼파라미터 튜닝
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10]
}
rf_model = RandomForestRegressor(random_state=0)
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='r2')
grid_search.fit(X_train, y_train)

# 최적 모델로 예측 수행
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# 모델 성능 평가
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R-squared:", r2)

# 다음 분기 예측
next_quarter_features = X_scaled[-1].reshape(1, -1)  # 마지막 분기의 데이터를 기반으로 다음 분기 예측
next_quarter_prediction = best_model.predict(next_quarter_features)
print("Next Quarter Disease Rate Prediction:", next_quarter_prediction)



  years = pd.date_range(start='2020-01-01', periods=3, freq='Y')
  df_quarterly = df_yearly.resample('Q').ffill()
  df_disease_rate = pd.DataFrame(disease_rate_monthly, index=pd.date_range(start='2020-01-01', periods=36, freq='M'), columns=['DiseaseRate'])
  df_disease_rate_quarterly = df_disease_rate.resample('Q').mean()


Mean Squared Error: 5.6344646406268595
R-squared: -92.16498410681034
Next Quarter Disease Rate Prediction: [22.88721982]


