In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Split the frame into features and targets.
# NOTE(review): `data` is only loaded in a later cell of this notebook, so that
# cell has to run before this one on a fresh kernel. The column names 'a'..'d'
# (and 'price' below) look like placeholders -- confirm they exist in the frame.
X = data[['a', 'b', 'c', 'd']]
y = data.drop(columns=['a', 'b', 'c', 'd'])

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train one linear regression per target column.
model = MultiOutputRegressor(LinearRegression())
model.fit(X_train, y_train)

# Predict all targets for the held-out rows.
y_pred = model.predict(X_test)

# Evaluate (as an example, only the MSE of `price` is computed).
# The prediction column is looked up by name rather than assumed to be the
# last one, so the score stays correct whatever the column order of `y` is.
price_idx = y_test.columns.get_loc('price')
price_mse = mean_squared_error(y_test['price'], y_pred[:, price_idx])
print(f'Price Mean Squared Error: {price_mse}')

In [None]:
 # Load the un-encoded dataset from the mounted Drive folder and show a preview.
 data = pd.read_csv(
     filepath_or_buffer='/content/drive/MyDrive/FinalProject_data/eng/essential_data_without_encoding.csv'
 )
 data

Unnamed: 0,business_district_type_name,business_district_name,service_industry_name,district_name,administrative_dong_name,business_district_change_index,business_district_change_index_name,service_industry_category,monthly_sales_amount,monthly_average_income_amount,...,opening_rate,number_of_openings,closing_rate,number_of_closures,franchise_stores,average_operating_months,average_closing_months,seoul_average_operating_months,seoul_average_closing_months,monthly_sales_per_store
0,골목상권,이북5도청사,한식음식점,종로구,평창동,HH,정체,음식점,316423585,4354652,...,0,0,0,0,1,97,59,96,50,31642358
1,골목상권,이북5도청사,한식음식점,종로구,평창동,LH,상권확장,음식점,298018986,4354652,...,0,0,0,0,1,95,59,96,50,29801899
2,골목상권,이북5도청사,한식음식점,종로구,평창동,LH,상권확장,음식점,390767034,4354652,...,9,1,0,0,1,91,59,96,50,35524276
3,골목상권,이북5도청사,한식음식점,종로구,평창동,LH,상권확장,음식점,355524163,4354652,...,0,0,10,1,1,89,59,96,51,35552416
4,골목상권,이북5도청사,한식음식점,종로구,평창동,LH,상권확장,음식점,357856685,4354652,...,0,0,0,0,1,92,60,95,51,35785668
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
219559,전통시장,창동골목시장,미용실,도봉구,창3동,HL,상권축소,미용,44690306,2171979,...,0,0,0,0,0,116,49,103,51,7448384
219560,전통시장,창동골목시장,미용실,도봉구,창3동,HL,상권축소,미용,45208910,2171979,...,0,0,0,0,0,120,50,104,52,7534818
219561,전통시장,창동골목시장,미용실,도봉구,창3동,HL,상권축소,미용,44357768,2171979,...,0,0,0,0,0,120,50,106,52,7392961
219562,전통시장,창동골목시장,미용실,도봉구,창3동,HL,상권축소,미용,49082278,2171979,...,17,1,0,0,0,122,50,107,52,8180380


In [None]:
# Show every column name of the loaded frame as a plain Python list.
list(data.columns)

['business_district_type_name',
 'business_district_name',
 'service_industry_name',
 'district_name',
 'administrative_dong_name',
 'business_district_change_index',
 'business_district_change_index_name',
 'service_industry_category',
 'monthly_sales_amount',
 'monthly_average_income_amount',
 'total_expenditure_amount',
 'total_floating_population',
 'total_households',
 'apartment_households',
 'apartment_average_area',
 'apartment_average_price',
 'total_working_population',
 'total_attraction_facilities',
 'government_offices',
 'banks',
 'general_hospitals',
 'hospitals',
 'pharmacies',
 'kindergartens',
 'elementary_schools',
 'middle_schools',
 'high_schools',
 'universities',
 'department_stores',
 'supermarkets',
 'theaters',
 'accommodation_facilities',
 'airports',
 'train_stations',
 'bus_terminals',
 'subway_stations',
 'bus_stops',
 'total_stores',
 'opening_rate',
 'number_of_openings',
 'closing_rate',
 'number_of_closures',
 'franchise_stores',
 'average_operating_mo

In [None]:
# Separate the dependent variable from the independent variables.
y = data['monthly_sales_per_store']

# NOTE(review): the target looks derived from `monthly_sales_amount` -- in the
# rows displayed when the data was loaded it equals monthly_sales_amount
# divided by a small integer. Keeping that column as a predictor would leak
# the answer into every model fitted below and dominate the feature rankings,
# so it is removed together with the target itself.
# `errors='ignore'` keeps the cell working if the column is already absent.
X = data.drop(
    columns=['monthly_sales_per_store', 'monthly_sales_amount'],
    errors='ignore',
)

# Split the predictors by dtype so each group can get its own preprocessing.
categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(include=['number']).columns

In [None]:
# Preprocessing setup.
# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical columns: fill gaps with the column mean.
numerical_transformer = SimpleImputer(strategy='mean')

# Route each column group to its transformer (numeric block first, then categorical).
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])


In [None]:
# Split the data: 80% for training, 20% held out, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 다중 선형 회귀 (Multiple Linear Regression)

In [None]:
# Model pipeline: shared preprocessing followed by a linear regressor.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

# Train the whole pipeline on the training split.
model.fit(X_train, y_train)

In [None]:
# Extract the regression coefficients from the fitted pipeline.
regressor = model.named_steps['regressor']
coefs = regressor.coef_

# Extract the feature names in the order the preprocessor emits them:
# numeric columns first, then the one-hot encoded categorical columns.
feature_names = list(numerical_features) + list(
    model.named_steps['preprocessor']
    .named_transformers_['cat']['onehot']
    .get_feature_names_out(categorical_features)
)

In [None]:
# Collect feature names and coefficients in one frame, attach the absolute
# value of each coefficient, and sort by that magnitude in descending order.
# NOTE(review): the numeric inputs are imputed but not scaled in the
# preprocessing cell, so these magnitudes are per-unit effects and may not be
# directly comparable across features -- confirm before interpreting the ranking.
coef_df = (
    pd.DataFrame({'Feature': feature_names, 'Coefficient': coefs})
    .assign(Absolute_Coefficient=lambda frame: frame['Coefficient'].abs())
    .sort_values(by='Absolute_Coefficient', ascending=False)
)


In [None]:
# Print the 15 features with the largest absolute coefficients.
top_features_linear_regression = coef_df.iloc[:15]
print(top_features_linear_regression)

                           Feature  Coefficient  Absolute_Coefficient
1    monthly_average_income_amount     4.880638              4.880638
3        total_floating_population    -3.811794              3.811794
8         total_working_population     0.031301              0.031301
7          apartment_average_price     0.010516              0.010516
4                 total_households    -0.005733              0.005733
0             monthly_sales_amount     0.002578              0.002578
2         total_expenditure_amount    -0.002069              0.002069
29                    total_stores    -0.001037              0.001037
35        average_operating_months     0.000232              0.000232
9      total_attraction_facilities     0.000210              0.000210
36          average_closing_months     0.000086              0.000086
30                    opening_rate    -0.000085              0.000085
32                    closing_rate    -0.000053              0.000053
14                  

['Feature', 'Coefficient', 'Absolute_Coefficient']

# 랜덤 포레스트 회귀 (Random Forest Regressor)

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Model pipeline: shared preprocessing followed by a random forest regressor.
# `n_jobs=-1` builds the 100 trees on all available cores; with a fixed
# `random_state` the fitted forest is the same as in a single-threaded run,
# only the training time changes.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1))
])

# Train the whole pipeline on the training split.
model.fit(X_train, y_train)


In [None]:
# Extract the feature importances: look the fitted regressor up by its step
# name, then read the importances it exposes.
regressor = dict(model.steps)['regressor']
importances = regressor.feature_importances_


In [None]:
# Extract the feature names: numeric columns first, followed by the names the
# fitted one-hot encoder generated for the categorical columns.
feature_names = [
    *numerical_features,
    *model.named_steps['preprocessor']
    .named_transformers_['cat']['onehot']
    .get_feature_names_out(categorical_features),
]

In [None]:
# Put importances and feature names into one frame, sorted by importance in
# descending order.
importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

In [None]:
# Print the 8 most important features according to the random forest.
top_features_random_forest = importance_df.iloc[:8]
print(top_features_random_forest)

# 그라디언트 부스팅 회귀 (Gradient Boosting Regressor)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Model pipeline: shared preprocessing followed by a gradient boosting
# regressor with a fixed seed.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=42)),
])

# Train the whole pipeline on the training split.
model.fit(X_train, y_train)

In [None]:
# Extract the feature importances from the fitted regressor step.
regressor = model.named_steps['regressor']
importances = regressor.feature_importances_

# Extract the feature names: numeric columns, then the one-hot encoded
# categorical columns, in the order the preprocessor was declared.
feature_names = [
    *numerical_features.tolist(),
    *list(
        model.named_steps['preprocessor']
        .named_transformers_['cat']['onehot']
        .get_feature_names_out(categorical_features)
    ),
]


In [None]:
# Build the importance table (one row per feature) and order it from the most
# to the least important feature.
importance_df = (
    pd.DataFrame(dict(Feature=feature_names, Importance=importances))
    .sort_values(by='Importance', ascending=False)
)


In [None]:
# Print the 8 most important features according to gradient boosting.
top_features_gradient_boosting = importance_df.head(n=8)
print(top_features_gradient_boosting)