# Weather Data

### About the Data

This data was purchased from https://openweathermap.org/

Information about how it was gathered can be found at https://openweathermap.org/accuracy-and-quality

In [6]:
def print_nulls(df):
    """Return the percentage of missing values in each column of ``df``.

    Despite the name, nothing is printed: the result is returned, so a
    notebook cell that ends with this call displays it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.Series
        One entry per column holding ``100 * null_count / row_count``.
        Every entry is NaN when ``df`` has no rows (0 / 0).
    """
    # Every column shares the same denominator: the number of rows.
    return df.isnull().sum() * 100 / len(df)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Read the raw hourly weather history from disk into a DataFrame.
weather = pd.read_csv(
    filepath_or_buffer=r'data/weather/Philadelphia_Historical_Weather_Hourly.csv',
)

In [3]:
# Summarize the raw frame: row count, column dtypes, and non-null counts.
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 401336 entries, 0 to 401335
Data columns (total 25 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   dt                   401336 non-null  int64  
 1   dt_iso               401336 non-null  object 
 2   timezone             401336 non-null  int64  
 3   city_name            401336 non-null  object 
 4   lat                  401336 non-null  float64
 5   lon                  401336 non-null  float64
 6   temp                 401336 non-null  float64
 7   feels_like           401336 non-null  float64
 8   temp_min             401336 non-null  float64
 9   temp_max             401336 non-null  float64
 10  pressure             401336 non-null  int64  
 11  sea_level            0 non-null       float64
 12  grnd_level           0 non-null       float64
 13  humidity             401336 non-null  int64  
 14  wind_speed           401336 non-null  float64
 15  wind_deg         

In [20]:
# Add a string 'year' column (the text before the first '-' in dt_iso) so the
# frame can be filtered to the timeframe of interest. The vectorized .str
# accessor avoids a per-row Python call and passes missing values through as
# NaN instead of raising.
weather['year'] = weather['dt_iso'].str.split('-', n=1).str[0]

# 'year' holds text, so the comparison values are strings as well.
# .copy() yields an independent frame, so later column assignments on
# `relevant` do not write back into `weather`.
relevant = weather[weather['year'].isin(['2015', '2016', '2017', '2018', '2019', '2020'])].copy()

In [21]:
# Summarize the filtered frame to confirm the reduced row count and the added 'year' column.
relevant.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53199 entries, 348137 to 401335
Data columns (total 26 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   dt                   53199 non-null  int64  
 1   dt_iso               53199 non-null  object 
 2   timezone             53199 non-null  int64  
 3   city_name            53199 non-null  object 
 4   lat                  53199 non-null  float64
 5   lon                  53199 non-null  float64
 6   temp                 53199 non-null  float64
 7   feels_like           53199 non-null  float64
 8   temp_min             53199 non-null  float64
 9   temp_max             53199 non-null  float64
 10  pressure             53199 non-null  int64  
 11  sea_level            0 non-null      float64
 12  grnd_level           0 non-null      float64
 13  humidity             53199 non-null  int64  
 14  wind_speed           53199 non-null  float64
 15  wind_deg             53199 non

In [22]:
# Overwrite the integer `dt` column with timestamps parsed from dt_iso.
# The last 10 characters of each dt_iso string are dropped before parsing,
# leaving "YYYY-MM-DD HH:MM:SS".
# NOTE(review): the dropped suffix is presumably an offset/zone tag such as
# " +0000 UTC" — confirm against the raw file.
# .str slicing is vectorized and propagates missing values (which then parse
# to NaT) rather than raising on a non-string entry.
relevant['dt'] = pd.to_datetime(relevant['dt_iso'].str[:-10])

In [23]:
# Display the converted column to check that `dt` now holds datetime values.
relevant['dt']

348137   2015-01-01 00:00:00
348138   2015-01-01 01:00:00
348139   2015-01-01 02:00:00
348140   2015-01-01 03:00:00
348141   2015-01-01 04:00:00
                 ...        
401331   2020-10-12 20:00:00
401332   2020-10-12 21:00:00
401333   2020-10-12 22:00:00
401334   2020-10-12 23:00:00
401335   2020-10-12 23:00:00
Name: dt, Length: 53199, dtype: datetime64[ns]

In [26]:
# Treat missing entries in the rain/snow columns as zero so they can be summed later.
cols = ['rain_1h', 'snow_1h', 'rain_3h', 'snow_3h']
relevant[cols] = relevant[cols].fillna(value=0)

In [27]:
# Keep only the timestamp, the selected measurements, and the weather label.
# .copy() detaches the subset so new columns can be added to `kept` directly.
kept = relevant.loc[
    :,
    ['dt', 'temp', 'feels_like', 'pressure', 'humidity', 'wind_speed',
     'clouds_all', 'weather_main']
].copy()

In [28]:
# Sum the rain and snow columns for each of the 1h and 3h pairs.
# The 'percip_*' spelling is kept as-is because these names become column
# headers in the exported CSV.
kept['percip_1h'] = relevant['rain_1h'].add(relevant['snow_1h'])
kept['percip_3h'] = relevant['rain_3h'].add(relevant['snow_3h'])

In [29]:
# Show the percentage of missing values in each column of the trimmed frame.
print_nulls(kept)

dt              0.0
temp            0.0
feels_like      0.0
pressure        0.0
humidity        0.0
wind_speed      0.0
clouds_all      0.0
weather_main    0.0
percip_1h       0.0
percip_3h       0.0
dtype: float64

In [32]:
# List the distinct weather_main labels present in the kept rows.
kept['weather_main'].unique()

array(['Clear', 'Clouds', 'Rain', 'Mist', 'Drizzle', 'Fog', 'Snow',
       'Haze', 'Thunderstorm', 'Squall'], dtype=object)

In [30]:
# Write the trimmed frame to CSV; index=False leaves the row index out of the file.
kept.to_csv(
    path_or_buf=r'data/weather/Philadelphia_Weather_Hourly_2015-2020.csv',
    index=False,
)