In [1]:
import pandas as pd

In [2]:
def load_csv(path, columns=None):
    """Read one raw ';'-separated weather export into a time-indexed DataFrame.

    Parameters
    ----------
    path : str or file-like
        Source forwarded unchanged to ``pd.read_csv``.
    columns : list of str, optional
        Labels passed to ``read_csv`` as ``names``. When omitted, the
        notebook-level ``new_columns`` list is looked up at call time, so
        existing ``load_csv(path)`` calls behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by the date-parsed first column (day-first format),
        sorted in ascending index order.
    """
    if columns is None:
        # Fallback keeps backward compatibility; `new_columns` lives in a
        # separate cell and must have been executed before the first call.
        columns = new_columns
    raw = pd.read_csv(
        path,
        header=6,          # row 6 is consumed as the header; its labels are replaced by `columns`
        sep=';',
        index_col=[0],
        parse_dates=[0],
        names=columns,
        dayfirst=True,
    )
    return raw.sort_index(ascending=True)

In [3]:
def new_time_features(df, key_column, lags=(7, 15, 30, 90, 180),
                      rolling_window=7, rolling_shift=7):
    """Add calendar, lagged and rolling-mean feature columns to ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index exposes ``.month`` and ``.day`` (e.g. a
        ``DatetimeIndex``). It is modified in place.
    key_column : str
        Name of the column the lag and rolling features are derived from.
    lags : iterable of int, optional
        Row offsets; each one produces a ``lag_<n>`` column holding
        ``df[key_column].shift(n)``. Defaults reproduce the original
        ``lag_7 ... lag_180`` columns in the same order.
    rolling_window : int, optional
        Window length of the rolling mean (default 7).
    rolling_shift : int, optional
        Rows by which the series is shifted before the rolling mean is
        taken, so the mean only covers values at least that many rows old
        (default 7).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying ``month``, ``day``,
        ``rolling_mean`` and the ``lag_*`` columns. Leading rows contain NaN
        wherever a shift reaches before the start of the data.
    """
    df['month'] = df.index.month
    df['day'] = df.index.day
    # Shift first so the window never includes the current row's own value.
    df['rolling_mean'] = (
        df[key_column].shift(rolling_shift).rolling(rolling_window).mean()
    )
    for lag in lags:
        df[f'lag_{lag}'] = df[key_column].shift(lag)
    return df

In [4]:
# Column labels that load_csv hands to pd.read_csv as `names`, replacing the
# header row of the raw weather files.
# NOTE(review): confirm these labels line up with the files' actual columns —
# in the visible cells only 'температура' is used afterwards.
new_columns = [
    'температура', 'температура2', 'атм_давл_на_ст', 'атм_давл_на_ур_мор', 'изм_атм_давл', 'отн_влаж',
    'напр_ветра', 'скор_ветра', 'макс_порыв_ветра1', 'макс_порыв_ветра2', 'облач', 'тек_погода',
    'прош_погода1', 'прош_погода2', 'мин_темп', 'макс_темп', 'облака', 'колич_обл', 'выс_обл',
    'облака1', 'облака2', 'дальн_вид', 'темп_точки_росы', 'колич_осадков', 'время_накопл_осадков',
    'поверхн_почвы', 'темп_почвы', 'поверхн_почвы2', 'выс_снега'
]

In [5]:
# Read every period's export through the shared loader; the path order below
# determines which frame ends up in df_1 .. df_4.
df_1, df_2, df_3, df_4 = (
    load_csv(csv_path)
    for csv_path in (
        'Datasets/weather/weather_2005-2010.csv',
        'Datasets/weather/weather_2010-2015.csv',
        'Datasets/weather/weather_2015-2020.csv',
        'Datasets/weather/weather_last.csv',
    )
)

In [6]:
# Reduce each frame to its 'температура' column, kept as a one-column
# DataFrame and cast to float.
df_1, df_2, df_3, df_4 = (
    frame['температура'].to_frame().astype('float')
    for frame in (df_1, df_2, df_3, df_4)
)

In [7]:
# Stack the four per-period frames row-wise into a single frame.
df_all = pd.concat([df_1, df_2, df_3, df_4])

In [8]:
# Collapse to one row per calendar day, keeping that day's maximum value;
# days without any reading come out as NaN.
df_all = df_all.resample('1D').max()

In [9]:
# Switch the column label to English for the rest of the notebook.
df_all = df_all.rename(columns={'температура':'temperature'})

In [10]:
# Display the daily frame.
df_all

Unnamed: 0,temperature
2005-02-01,-6.6
2005-02-02,-8.0
2005-02-03,-10.6
2005-02-04,-8.6
2005-02-05,-8.1
...,...
2021-08-30,22.8
2021-08-31,25.2
2021-09-01,19.9
2021-09-02,13.4


In [11]:
# Count missing values per column after the daily resample.
df_all.isna().sum()

temperature    2
dtype: int64

In [12]:
# List the days whose temperature is missing.
df_all[df_all['temperature'].isna()]

Unnamed: 0,temperature
2008-05-06,
2012-12-16,


In [13]:
# Inspect the two days on either side of the 2008-05-06 gap.
df_all.loc['2008-05-04':'2008-05-08']

Unnamed: 0,temperature
2008-05-04,19.0
2008-05-05,21.1
2008-05-06,
2008-05-07,5.6
2008-05-08,11.6


In [14]:
# Inspect the two days on either side of the 2012-12-16 gap.
df_all.loc['2012-12-14':'2012-12-18']

Unnamed: 0,temperature
2012-12-14,-8.2
2012-12-15,-13.1
2012-12-16,
2012-12-17,-15.0
2012-12-18,-15.0


In [15]:
# Fill the missing 2008-05-06 value with the mean of the 2008-05-04..2008-05-08
# window; mean() skips the NaN, so this averages the four neighbouring days.
df_all.loc['2008-05-06'] = df_all.loc['2008-05-06'].fillna(df_all.loc['2008-05-04':'2008-05-08'].mean())

In [16]:
# Re-display the window to check the filled 2008-05-06 value.
df_all.loc['2008-05-04':'2008-05-08']

Unnamed: 0,temperature
2008-05-04,19.0
2008-05-05,21.1
2008-05-06,14.325
2008-05-07,5.6
2008-05-08,11.6


In [17]:
# Fill the missing 2012-12-16 value with the mean of the 2012-12-14..2012-12-18
# window; mean() skips the NaN, so this averages the four neighbouring days.
df_all.loc['2012-12-16'] = df_all.loc['2012-12-16'].fillna(df_all.loc['2012-12-14':'2012-12-18'].mean())

In [18]:
# Re-display the window to check the filled 2012-12-16 value.
df_all.loc['2012-12-14':'2012-12-18']

Unnamed: 0,temperature
2012-12-14,-8.2
2012-12-15,-13.1
2012-12-16,-12.825
2012-12-17,-15.0
2012-12-18,-15.0


In [19]:
# Confirm that no missing values remain before building features.
df_all.isna().sum()

temperature    0
dtype: int64

In [20]:
# new_time_features adds its columns to df_all in place and returns that same
# frame; dropna() then discards every row that still holds a NaN (the leading
# rows where the longest lag is not yet defined).
# Note: re-running this cell recomputes the features on the already-trimmed frame.
df_all = new_time_features(df_all, 'temperature').dropna()

In [21]:
# Display the frame with the engineered feature columns.
df_all

Unnamed: 0,temperature,month,day,rolling_mean,lag_7,lag_15,lag_30,lag_90,lag_180
2005-07-31,27.6,7,31,24.900000,26.2,28.2,17.2,8.9,-6.6
2005-08-01,26.1,8,1,25.414286,27.2,27.2,15.6,10.0,-8.0
2005-08-02,25.7,8,2,25.328571,23.2,23.6,21.3,16.0,-10.6
2005-08-03,23.8,8,3,25.128571,24.1,23.8,21.9,20.0,-8.6
2005-08-04,22.7,8,4,24.742857,23.3,25.5,23.2,20.2,-8.1
...,...,...,...,...,...,...,...,...,...
2021-08-30,22.8,8,30,24.271429,20.2,26.2,25.8,16.6,3.2
2021-08-31,25.2,8,31,22.728571,19.3,27.6,25.2,20.6,2.5
2021-09-01,19.9,9,1,21.100000,19.2,30.1,29.7,20.4,-1.7
2021-09-02,13.4,9,2,20.485714,19.3,30.6,22.8,22.9,-4.2


In [22]:
# Write the feature frame (index included) to disk as CSV.
df_all.to_csv('Datasets/weather/weather.csv')