In [1]:
import time
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv('../data/weather_features_train.csv')
df.tail()

Unnamed: 0,dt_iso,city_name,temp,temp_min,temp_max,pressure,humidity,wind_speed,wind_deg,rain_1h,rain_3h,snow_3h,clouds_all,weather_id,weather_main,weather_description,weather_icon
178391,2018-12-31 19:00:00+01:00,Seville,287.76,287.15,288.15,1028,54,3,30,0.0,0.0,0.0,0,800,clear,sky is clear,01n
178392,2018-12-31 20:00:00+01:00,Seville,285.76,285.15,286.15,1029,62,3,30,0.0,0.0,0.0,0,800,clear,sky is clear,01n
178393,2018-12-31 21:00:00+01:00,Seville,285.15,285.15,285.15,1028,58,4,50,0.0,0.0,0.0,0,800,clear,sky is clear,01n
178394,2018-12-31 22:00:00+01:00,Seville,284.15,284.15,284.15,1029,57,4,60,0.0,0.0,0.0,0,800,clear,sky is clear,01n
178395,2018-12-31 23:00:00+01:00,Seville,283.97,282.15,285.15,1029,70,3,50,0.0,0.0,0.0,0,800,clear,sky is clear,01n


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178396 entries, 0 to 178395
Data columns (total 17 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   dt_iso               178396 non-null  object 
 1   city_name            178396 non-null  object 
 2   temp                 178396 non-null  float64
 3   temp_min             178396 non-null  float64
 4   temp_max             178396 non-null  float64
 5   pressure             178396 non-null  int64  
 6   humidity             178396 non-null  int64  
 7   wind_speed           178396 non-null  int64  
 8   wind_deg             178396 non-null  int64  
 9   rain_1h              178396 non-null  float64
 10  rain_3h              178396 non-null  float64
 11  snow_3h              178396 non-null  float64
 12  clouds_all           178396 non-null  int64  
 13  weather_id           178396 non-null  int64  
 14  weather_main         178396 non-null  object 
 15  weather_descripti

In [4]:
df['city_name'].value_counts()

Madrid        36267
Bilbao        35951
Seville       35557
 Barcelona    35476
Valencia      35145
Name: city_name, dtype: int64

In [5]:
df_Madrid = df.query('city_name == "Madrid"').copy()
df_Madrid.tail()

Unnamed: 0,dt_iso,city_name,temp,temp_min,temp_max,pressure,humidity,wind_speed,wind_deg,rain_1h,rain_3h,snow_3h,clouds_all,weather_id,weather_main,weather_description,weather_icon
71407,2018-12-31 19:00:00+01:00,Madrid,283.56,282.15,285.15,1030,88,1,280,0.0,0.0,0.0,0,800,clear,sky is clear,01n
71408,2018-12-31 20:00:00+01:00,Madrid,280.12,278.15,281.15,1031,52,1,260,0.0,0.0,0.0,0,800,clear,sky is clear,01n
71409,2018-12-31 21:00:00+01:00,Madrid,278.15,278.15,278.15,1030,65,1,340,0.0,0.0,0.0,0,800,clear,sky is clear,01n
71410,2018-12-31 22:00:00+01:00,Madrid,276.57,276.15,277.15,1031,69,2,340,0.0,0.0,0.0,0,800,clear,sky is clear,01n
71411,2018-12-31 23:00:00+01:00,Madrid,275.15,275.15,275.15,1031,74,1,360,0.0,0.0,0.0,0,800,clear,sky is clear,01n


In [6]:
df_Madrid['dt_iso'] = pd.to_datetime(df_Madrid['dt_iso'], utc=True)
df_Madrid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36267 entries, 35145 to 71411
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype              
---  ------               --------------  -----              
 0   dt_iso               36267 non-null  datetime64[ns, UTC]
 1   city_name            36267 non-null  object             
 2   temp                 36267 non-null  float64            
 3   temp_min             36267 non-null  float64            
 4   temp_max             36267 non-null  float64            
 5   pressure             36267 non-null  int64              
 6   humidity             36267 non-null  int64              
 7   wind_speed           36267 non-null  int64              
 8   wind_deg             36267 non-null  int64              
 9   rain_1h              36267 non-null  float64            
 10  rain_3h              36267 non-null  float64            
 11  snow_3h              36267 non-null  float64            
 12  clouds_all    

In [7]:
df_Madrid['weather_main'].value_counts()

clear           20356
clouds          10643
rain             2657
mist              938
fog               708
drizzle           637
thunderstorm      222
snow               88
haze               18
Name: weather_main, dtype: int64

In [8]:
le_main = LabelEncoder()
df_Madrid['weather_main'] = le_main.fit_transform(df_Madrid['weather_main'])

In [9]:
df_Madrid['weather_main'].value_counts()

0    20356
1    10643
6     2657
5      938
3      708
2      637
8      222
7       88
4       18
Name: weather_main, dtype: int64

In [10]:
le_desc = LabelEncoder()
df_Madrid['weather_description'] = le_desc.fit_transform(df_Madrid['weather_description'])
le_icon = LabelEncoder()
df_Madrid['weather_icon'] = le_icon.fit_transform(df_Madrid['weather_icon'])
df_Madrid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36267 entries, 35145 to 71411
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype              
---  ------               --------------  -----              
 0   dt_iso               36267 non-null  datetime64[ns, UTC]
 1   city_name            36267 non-null  object             
 2   temp                 36267 non-null  float64            
 3   temp_min             36267 non-null  float64            
 4   temp_max             36267 non-null  float64            
 5   pressure             36267 non-null  int64              
 6   humidity             36267 non-null  int64              
 7   wind_speed           36267 non-null  int64              
 8   wind_deg             36267 non-null  int64              
 9   rain_1h              36267 non-null  float64            
 10  rain_3h              36267 non-null  float64            
 11  snow_3h              36267 non-null  float64            
 12  clouds_all    

In [11]:
df_Madrid.set_index('dt_iso', inplace=True)
df_Madrid

Unnamed: 0_level_0,city_name,temp,temp_min,temp_max,pressure,humidity,wind_speed,wind_deg,rain_1h,rain_3h,snow_3h,clouds_all,weather_id,weather_main,weather_description,weather_icon
dt_iso,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2014-12-31 23:00:00+00:00,Madrid,267.325,267.325,267.325,971,63,1,309,0.0,0.0,0.0,0,800,0,25,2
2015-01-01 00:00:00+00:00,Madrid,267.325,267.325,267.325,971,63,1,309,0.0,0.0,0.0,0,800,0,25,2
2015-01-01 01:00:00+00:00,Madrid,266.186,266.186,266.186,971,64,1,273,0.0,0.0,0.0,0,800,0,25,2
2015-01-01 02:00:00+00:00,Madrid,266.186,266.186,266.186,971,64,1,273,0.0,0.0,0.0,0,800,0,25,2
2015-01-01 03:00:00+00:00,Madrid,266.186,266.186,266.186,971,64,1,273,0.0,0.0,0.0,0,800,0,25,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-31 18:00:00+00:00,Madrid,283.560,282.150,285.150,1030,88,1,280,0.0,0.0,0.0,0,800,0,25,2
2018-12-31 19:00:00+00:00,Madrid,280.120,278.150,281.150,1031,52,1,260,0.0,0.0,0.0,0,800,0,25,2
2018-12-31 20:00:00+00:00,Madrid,278.150,278.150,278.150,1030,65,1,340,0.0,0.0,0.0,0,800,0,25,2
2018-12-31 21:00:00+00:00,Madrid,276.570,276.150,277.150,1031,69,2,340,0.0,0.0,0.0,0,800,0,25,2


In [12]:
df_Madrid = df_Madrid[~df_Madrid.index.duplicated()]

In [13]:
df_Madrid['temp'] = df_Madrid['temp'].shift(periods=-3, freq="h")
df_Madrid

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,city_name,temp,temp_min,temp_max,pressure,humidity,wind_speed,wind_deg,rain_1h,rain_3h,snow_3h,clouds_all,weather_id,weather_main,weather_description,weather_icon
dt_iso,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2014-12-31 23:00:00+00:00,Madrid,266.186,267.325,267.325,971,63,1,309,0.0,0.0,0.0,0,800,0,25,2
2015-01-01 00:00:00+00:00,Madrid,266.186,267.325,267.325,971,63,1,309,0.0,0.0,0.0,0,800,0,25,2
2015-01-01 01:00:00+00:00,Madrid,265.442,266.186,266.186,971,64,1,273,0.0,0.0,0.0,0,800,0,25,2
2015-01-01 02:00:00+00:00,Madrid,265.442,266.186,266.186,971,64,1,273,0.0,0.0,0.0,0,800,0,25,2
2015-01-01 03:00:00+00:00,Madrid,265.442,266.186,266.186,971,64,1,273,0.0,0.0,0.0,0,800,0,25,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-31 18:00:00+00:00,Madrid,276.570,282.150,285.150,1030,88,1,280,0.0,0.0,0.0,0,800,0,25,2
2018-12-31 19:00:00+00:00,Madrid,275.150,278.150,281.150,1031,52,1,260,0.0,0.0,0.0,0,800,0,25,2
2018-12-31 20:00:00+00:00,Madrid,,278.150,278.150,1030,65,1,340,0.0,0.0,0.0,0,800,0,25,2
2018-12-31 21:00:00+00:00,Madrid,,276.150,277.150,1031,69,2,340,0.0,0.0,0.0,0,800,0,25,2


In [14]:
df_Madrid['temp'] = df_Madrid['temp'].fillna(method='ffill')
df_Madrid.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 35064 entries, 2014-12-31 23:00:00+00:00 to 2018-12-31 22:00:00+00:00
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   city_name            35064 non-null  object 
 1   temp                 35064 non-null  float64
 2   temp_min             35064 non-null  float64
 3   temp_max             35064 non-null  float64
 4   pressure             35064 non-null  int64  
 5   humidity             35064 non-null  int64  
 6   wind_speed           35064 non-null  int64  
 7   wind_deg             35064 non-null  int64  
 8   rain_1h              35064 non-null  float64
 9   rain_3h              35064 non-null  float64
 10  snow_3h              35064 non-null  float64
 11  clouds_all           35064 non-null  int64  
 12  weather_id           35064 non-null  int64  
 13  weather_main         35064 non-null  int64  
 14  weather_description  35064 non-null  in

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [15]:
target = 'temp'
features = df_Madrid.columns.drop([target, 'city_name'])

In [16]:
X = df_Madrid[features].values
y = df_Madrid[target].values

In [17]:
lr = LinearRegression()


In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [19]:
lr.fit(X_train, y_train)

LinearRegression()

In [20]:
y_pred = lr.predict(X_test)

In [21]:
y_pred

array([291.41355563, 292.7489961 , 273.73405881, ..., 276.86062245,
       286.45290585, 282.22852647])

In [22]:
y_test

array([291.3795, 286.225 , 273.8   , ..., 274.42  , 289.72  , 278.22  ])

In [23]:
mean_squared_error(y_pred, y_test)**.5

3.193712952717629