In [1]:
# 8회 실기 기출분석 및 문제풀이 - 2유형
# ✅
# 제 2유형 기출 문제파악
# 문제 : 지하철 탑승인원수를 예측하기
# 회귀or분류 : 회귀 문제
# 성능지표 : MAE(Mean Absolute Error)
# 기출 데이터셋 주요 정보
# 독립변수(X) : 11개(날씨, 날짜 등)
# 종속변수(Y) : 지하철 탑승인원수(정수형)
# 데이터셋 : train 약 12,000개 / test 약 2,000개
# 결측치 : 없음
# 특이사항
# name변수(지하철역명) 삭제시 감점처리 되는 경우, 30점 감점(10/40점)

In [2]:
 ############################# 복사 영역 ################################
 # 데이터 생성(수정금지) 
import numpy as np
import pandas as pd
 # 랜덤 시드 설정
np.random.seed(42)
 # 데이터 수
n = 1000
 # 독립변수 생성 (날씨, 날짜 등)
data = {
    'temperature': np.random.uniform(5, 35, n),  # 기온 (5도 ~ 35도 사이)
    'humidity': np.random.uniform(20, 90, n),  # 습도 (20% ~ 90% 사이)
    'precipitation': np.random.choice([0, 1], n, p=[0.7, 0.3]),  # 강수 여부 (0: 없음, 1: 있음)
    'wind_speed': np.random.uniform(0, 10, n),  # 풍속 (0 ~ 10 m/s)
    'day_of_week': np.random.choice(range(7), n),  # 요일 (0: 일요일 ~ 6: 토요일)
    'is_holiday': np.random.choice([0, 1], n, p=[0.8, 0.2]),  # 공휴일 여부 (0: 평일, 1: 공휴일)
    'is_weekend': np.random.choice([0, 1], n, p=[0.7, 0.3]),  # 주말 여부 (0: 평일, 1: 주말)
    'hour': np.random.choice(range(24), n),  # 시간대 (0시 ~ 23시)
    'school_vacation': np.random.choice([0, 1], n, p=[0.8, 0.2]),  # 방학 여부 (0: 아님, 1: 방학)
    'special_event': np.random.choice([0, 1], n, p=[0.95, 0.05]),  # 특별 행사 여부 (0: 없음, 1: 있음)
 }
 # 독립변수로 탑승 인원수 생성 (임의 공식 사용)
data['passenger_count'] = (300 
                           + 5 * data['temperature'] 
                           - 2 * data['humidity'] 
                           + 50 * data['precipitation'] 
                           - 10 * data['wind_speed'] 
                           - 20 * data['is_holiday'] 
                           - 15 * data['is_weekend'] 
                           + 3 * data['hour'] 
                           - 50 * data['school_vacation'] 
                           + 80 * data['special_event'] 
                           + np.random.normal(0, 30, n))  # 노이즈 추가
# 종속변수는 정수형으로 변환
data['passenger_count'] = np.round(data['passenger_count']).astype(int)
 # 데이터프레임으로 변환
df = pd.DataFrame(data)
 # 독립변수 중에서 소수점이 있는 변수는 소수점 첫째 자리까지 반올림
df['temperature'] = np.round(df['temperature'], 1)
df['humidity'] = np.round(df['humidity'], 1)
df['wind_speed'] = np.round(df['wind_speed'], 1)
 # 지하철 2호선 역 이름 추가
station_names = [
    '시청', '을지로입구', '을지로3가', '을지로4가', '동대문역사문화공원', 
    '신당', '상왕십리', '왕십리', '한양대', '뚝섬', 
    '성수', '건대입구', '구의', '강변', '잠실나루', '잠실', '잠실새내', 
    '종합운동장', '삼성', '선릉', '역삼', '강남', 
    '교대', '서초', '방배', '사당', '낙성대', '서울대입구', '봉천', 
    '신림', '신대방', '구로디지털단지', '대림', '신도림', 
    '문래', '영등포구청', '당산', '합정', '홍대입구', '신촌', '이대', '아현', '충정로'
 ]
 # 1000개 중 2호선의 역이 43개이므로 임의로 순환하며 채우기
name_variable = [station_names[i % len(station_names)] for i in range(1000)]
 # 'name' 변수 추가
df['name'] = name_variable
 # 범주형 데이터를 카테고리형으로 변환
categorical_columns = ['precipitation', 'day_of_week', 'is_holiday', 
                       'is_weekend', 'school_vacation', 'special_event', 'name']
 # 카테고리형으로 변환
df[categorical_columns] = df[categorical_columns].astype('category')
 # Train, Test 셋 나누기
train = df.iloc[:700]  # 초기 700개를 train으로
test = df.iloc[700:]   # 마지막 300개를 test로
y_test = test['passenger_count'] # 실제 시험에서는 주어지지 않습니다. 
test = test.drop(['passenger_count'], axis=1)
 ############################# 복사 영역 ################################
print(train.head(20))

    temperature  humidity precipitation  wind_speed day_of_week is_holiday  \
0          16.2      33.0             0         6.7           1          1   
1          33.5      57.9             0         8.0           4          0   
2          27.0      81.1             1         2.5           4          0   
3          23.0      71.3             0         6.2           2          1   
4           9.7      76.5             0         5.7           6          0   
5           9.7      66.1             1         8.3           3          0   
6           6.7      68.5             0         9.1           4          0   
7          31.0      79.4             1         0.1           3          1   
8          23.0      37.5             0         6.7           0          0   
9          26.2      54.3             0         0.5           5          0   
10          5.6      35.5             0         5.5           6          0   
11         34.1      89.1             0         2.9           3 

In [3]:
print(train.head(20))

    temperature  humidity precipitation  wind_speed day_of_week is_holiday  \
0          16.2      33.0             0         6.7           1          1   
1          33.5      57.9             0         8.0           4          0   
2          27.0      81.1             1         2.5           4          0   
3          23.0      71.3             0         6.2           2          1   
4           9.7      76.5             0         5.7           6          0   
5           9.7      66.1             1         8.3           3          0   
6           6.7      68.5             0         9.1           4          0   
7          31.0      79.4             1         0.1           3          1   
8          23.0      37.5             0         6.7           0          0   
9          26.2      54.3             0         0.5           5          0   
10          5.6      35.5             0         5.5           6          0   
11         34.1      89.1             0         2.9           3 

In [4]:
test.head(20)

Unnamed: 0,temperature,humidity,precipitation,wind_speed,day_of_week,is_holiday,is_weekend,hour,school_vacation,special_event,name
700,21.0,87.6,0,6.1,0,0,0,18,0,0,구의
701,6.6,50.3,1,6.8,4,0,1,8,0,0,강변
702,15.1,41.8,0,3.2,3,1,1,12,0,0,잠실나루
703,9.0,55.4,0,8.5,0,0,0,7,0,0,잠실
704,6.9,50.8,1,9.5,1,0,1,22,1,0,잠실새내
705,34.7,27.4,1,8.8,2,0,0,12,0,0,종합운동장
706,14.7,64.9,0,7.4,5,1,0,14,0,0,삼성
707,29.3,35.1,0,2.8,0,0,0,10,0,0,선릉
708,12.6,63.4,0,2.8,0,0,0,11,0,0,역삼
709,25.4,65.5,0,9.6,1,0,1,13,1,0,강남


In [5]:
print(train.isnull().sum())
print(test.isnull().sum())

temperature        0
humidity           0
precipitation      0
wind_speed         0
day_of_week        0
is_holiday         0
is_weekend         0
hour               0
school_vacation    0
special_event      0
passenger_count    0
name               0
dtype: int64
temperature        0
humidity           0
precipitation      0
wind_speed         0
day_of_week        0
is_holiday         0
is_weekend         0
hour               0
school_vacation    0
special_event      0
name               0
dtype: int64


In [6]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
train['name'] = le.fit_transform(train['name'])
test['name'] = le.transform(test['name'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['name'] = le.fit_transform(train['name'])


In [11]:
train.head(20)


Unnamed: 0,temperature,humidity,precipitation,wind_speed,day_of_week,is_holiday,is_weekend,hour,school_vacation,special_event,passenger_count,name
0,16.2,33.0,0,6.7,1,1,0,1,0,0,215,21
1,33.5,57.9,0,8.0,4,0,0,22,0,0,328,33
2,27.0,81.1,1,2.5,4,0,1,21,0,0,353,31
3,23.0,71.3,0,6.2,2,1,0,4,0,0,191,32
4,9.7,76.5,0,5.7,6,0,0,3,0,0,134,9
5,9.7,66.1,1,8.3,3,0,1,3,1,0,139,22
6,6.7,68.5,0,9.1,4,0,1,5,0,0,61,16
7,31.0,79.4,1,0.1,3,1,1,19,0,0,402,30
8,23.0,37.5,0,6.7,0,0,1,20,0,0,311,40
9,26.2,54.3,0,0.5,5,0,0,12,0,0,323,10


In [9]:
test.head(20)

Unnamed: 0,temperature,humidity,precipitation,wind_speed,day_of_week,is_holiday,is_weekend,hour,school_vacation,special_event,name
700,21.0,87.6,0,6.1,0,0,0,18,0,0,5
701,6.6,50.3,1,6.8,4,0,1,8,0,0,1
702,15.1,41.8,0,3.2,3,1,1,12,0,0,36
703,9.0,55.4,0,8.5,0,0,0,7,0,0,35
704,6.9,50.8,1,9.5,1,0,1,22,1,0,37
705,34.7,27.4,1,8.8,2,0,0,12,0,0,38
706,14.7,64.9,0,7.4,5,1,0,14,0,0,15
707,29.3,35.1,0,2.8,0,0,0,10,0,0,19
708,12.6,63.4,0,2.8,0,0,0,11,0,0,28
709,25.4,65.5,0,9.6,1,0,1,13,1,0,0


In [12]:
from sklearn.model_selection import train_test_split
x = train.drop(columns='passenger_count') 
y = train['passenger_count']

x_train, x_val, y_train, y_val = train_test_split(x, y ,test_size=0.2, random_state=2024)
print(x_train.shape)
print(y_train.shape)
print(x_val.shape)
print(y_val.shape)

(560, 11)
(560,)
(140, 11)
(140,)


In [13]:
from sklearn.ensemble import RandomForestRegressor

rfr = RandomForestRegressor(random_state=2024)
model = rfr.fit(x_train,y_train)
pred = model.predict(x_val)


In [14]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

mae = mean_absolute_error(y_val, pred)
mse = mean_squared_error(y_val, pred)
r2 = r2_score(y_val, pred)

print(mae, mse, r2)


33.56835714285714 1840.3944807142855 0.7367377839351402


In [15]:
test

Unnamed: 0,temperature,humidity,precipitation,wind_speed,day_of_week,is_holiday,is_weekend,hour,school_vacation,special_event,name
700,21.0,87.6,0,6.1,0,0,0,18,0,0,5
701,6.6,50.3,1,6.8,4,0,1,8,0,0,1
702,15.1,41.8,0,3.2,3,1,1,12,0,0,36
703,9.0,55.4,0,8.5,0,0,0,7,0,0,35
704,6.9,50.8,1,9.5,1,0,1,22,1,0,37
...,...,...,...,...,...,...,...,...,...,...,...
995,7.7,66.0,1,4.4,4,0,0,8,0,0,16
996,32.5,87.0,0,3.3,1,0,1,9,0,0,30
997,9.1,24.8,0,3.9,4,0,1,3,0,0,40
998,33.5,24.0,0,5.3,6,0,0,16,1,0,10


In [16]:
pred2 = model.predict(test)


In [17]:
mae = mean_absolute_error(y_test, pred2)
mse = mean_squared_error(y_test, pred2)
r2 = r2_score(y_test, pred2)

print(mae, mse, r2)

33.6119 1806.023221 0.7347072892220816
