# Предобработка датасета для регрессии

In [1]:
import pandas as pd

## Загрузка датасета

In [28]:
path_to_file = "../datasets/regression/metro_interstate_traffic_volume.csv"
df = pd.read_csv(path_to_file, sep=',')

In [29]:
df

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time,traffic_volume
0,,288.28,0.0,0.0,40,Clouds,scattered clouds,2012-10-02 09:00:00,5545
1,,289.36,0.0,0.0,75,Clouds,broken clouds,2012-10-02 10:00:00,4516
2,,289.58,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 11:00:00,4767
3,,290.13,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 12:00:00,5026
4,,291.14,0.0,0.0,75,Clouds,broken clouds,2012-10-02 13:00:00,4918
...,...,...,...,...,...,...,...,...,...
48199,,283.45,0.0,0.0,75,Clouds,broken clouds,2018-09-30 19:00:00,3543
48200,,282.76,0.0,0.0,90,Clouds,overcast clouds,2018-09-30 20:00:00,2781
48201,,282.73,0.0,0.0,90,Thunderstorm,proximity thunderstorm,2018-09-30 21:00:00,2159
48202,,282.09,0.0,0.0,90,Clouds,overcast clouds,2018-09-30 22:00:00,1450


## Предобработка

### Пропуски в данных

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48204 entries, 0 to 48203
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   holiday              48204 non-null  object 
 1   temp                 48204 non-null  float64
 2   rain_1h              48204 non-null  float64
 3   snow_1h              48204 non-null  float64
 4   clouds_all           48204 non-null  int64  
 5   weather_main         48204 non-null  object 
 6   weather_description  48204 non-null  object 
 7   date_time            48204 non-null  object 
 8   traffic_volume       48204 non-null  int64  
dtypes: float64(3), int64(2), object(4)
memory usage: 3.3+ MB


In [31]:
df.mean()

temp               281.205870
rain_1h              0.334264
snow_1h              0.000222
clouds_all          49.362231
traffic_volume    3259.818355
dtype: float64

Исходя из полученных результатов, **пропущенных или пустых значений нет**

### Перекодирование

Имеем следующие категориальные признаки:   
- holiday
- weather_main
- weather_description  

Посмотрим на множество значений каждого признака

In [32]:
print(set(df['holiday'].tolist()))
print(set(df['weather_main'].tolist()))
print(set(df['weather_description'].tolist()))

{'Thanksgiving Day', 'Christmas Day', 'Independence Day', 'New Years Day', 'Columbus Day', 'Veterans Day', 'Washingtons Birthday', 'None', 'Labor Day', 'State Fair', 'Martin Luther King Jr Day', 'Memorial Day'}
{'Drizzle', 'Fog', 'Clear', 'Thunderstorm', 'Snow', 'Squall', 'Rain', 'Haze', 'Clouds', 'Mist', 'Smoke'}
{'light shower snow', 'proximity thunderstorm with drizzle', 'haze', 'fog', 'light rain', 'light snow', 'heavy snow', 'sky is clear', 'thunderstorm with light rain', 'thunderstorm with heavy rain', 'shower drizzle', 'light intensity shower rain', 'very heavy rain', 'thunderstorm with light drizzle', 'proximity shower rain', 'snow', 'sleet', 'proximity thunderstorm with rain', 'few clouds', 'broken clouds', 'heavy intensity drizzle', 'thunderstorm with drizzle', 'proximity thunderstorm', 'freezing rain', 'overcast clouds', 'light intensity drizzle', 'thunderstorm with rain', 'thunderstorm', 'moderate rain', 'light rain and snow', 'scattered clouds', 'shower snow', 'Sky is Clea

Видно, что в столбце *holiday* есть данные **None**, однако это строка. Это означает, что при перекодировании появится новый признак *holiday_None*. Логичнее будет заменить такое значение на **NoneType**. 

In [34]:
df['holiday'][df['holiday'] == 'None'] = None

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['holiday'][df['holiday'] == 'None'] = None


Сразу же сделаем столбец *date_time* типа **datetime**.

In [35]:
df['date_time'] = df['date_time'].astype('datetime64[ns]')

Используем **one-hot кодирование** для перекодирования категориальных признаков.

In [42]:
df = pd.get_dummies(df, prefix=['h', 'wm', 'wd'], columns=['holiday', 'weather_main', 'weather_description'])

In [43]:
df

Unnamed: 0,temp,rain_1h,snow_1h,clouds_all,date_time,traffic_volume,h_Christmas Day,h_Columbus Day,h_Independence Day,h_Labor Day,...,wd_sleet,wd_smoke,wd_snow,wd_thunderstorm,wd_thunderstorm with drizzle,wd_thunderstorm with heavy rain,wd_thunderstorm with light drizzle,wd_thunderstorm with light rain,wd_thunderstorm with rain,wd_very heavy rain
0,288.28,0.0,0.0,40,2012-10-02 09:00:00,5545,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,289.36,0.0,0.0,75,2012-10-02 10:00:00,4516,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,289.58,0.0,0.0,90,2012-10-02 11:00:00,4767,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,290.13,0.0,0.0,90,2012-10-02 12:00:00,5026,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,291.14,0.0,0.0,75,2012-10-02 13:00:00,4918,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48199,283.45,0.0,0.0,75,2018-09-30 19:00:00,3543,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
48200,282.76,0.0,0.0,90,2018-09-30 20:00:00,2781,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
48201,282.73,0.0,0.0,90,2018-09-30 21:00:00,2159,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
48202,282.09,0.0,0.0,90,2018-09-30 22:00:00,1450,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48204 entries, 0 to 48203
Data columns (total 66 columns):
 #   Column                                  Non-Null Count  Dtype         
---  ------                                  --------------  -----         
 0   temp                                    48204 non-null  float64       
 1   rain_1h                                 48204 non-null  float64       
 2   snow_1h                                 48204 non-null  float64       
 3   clouds_all                              48204 non-null  int64         
 4   date_time                               48204 non-null  datetime64[ns]
 5   traffic_volume                          48204 non-null  int64         
 6   h_Christmas Day                         48204 non-null  uint8         
 7   h_Columbus Day                          48204 non-null  uint8         
 8   h_Independence Day                      48204 non-null  uint8         
 9   h_Labor Day                             48204 non-

## Сохранение итогового датасета

In [45]:
path_to_file = "../datasets/regression/metro_interstate_traffic_volume_preprocessed.csv"
df.to_csv(path_to_file)