In [12]:
import pandas as pd

# Read the weather observations CSV into a DataFrame
weather_csv_path = '../data/weather/atl_weather.csv'
df = pd.read_csv(weather_csv_path)

# Summarize the columns, their dtypes, and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20698 entries, 0 to 20697
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   station  20698 non-null  object 
 1   valid    20698 non-null  object 
 2   tmpf     20697 non-null  float64
 3   dwpf     20696 non-null  float64
 4   relh     20696 non-null  float64
 5   sknt     20686 non-null  float64
 6   vsby     20697 non-null  float64
dtypes: float64(5), object(2)
memory usage: 1.1+ MB


In [13]:
# Show the column labels as a plain Python list
print(df.columns.tolist())

['station', 'valid', 'tmpf', 'dwpf', 'relh', 'sknt', 'vsby']


In [14]:
# Convert the object-dtype 'valid' column to datetimes so the `.dt`
# accessor can be used for the date/month grouping below
parsed_valid = pd.to_datetime(df['valid'])
df['valid'] = parsed_valid

In [15]:
# Re-inspect the frame to confirm the dtype of 'valid' after the
# pd.to_datetime conversion; row and column counts should be unchanged.
df.info()
# Expected: 20698 entries, 0 to 20697
# Expected: 7 columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20698 entries, 0 to 20697
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   station  20698 non-null  object        
 1   valid    20698 non-null  datetime64[ns]
 2   tmpf     20697 non-null  float64       
 3   dwpf     20696 non-null  float64       
 4   relh     20696 non-null  float64       
 5   sknt     20686 non-null  float64       
 6   vsby     20697 non-null  float64       
dtypes: datetime64[ns](1), float64(5), object(1)
memory usage: 1.1+ MB


## Daily Averages

In [16]:
# Average temperature, dew point, and relative humidity per calendar day.
# Named aggregation assigns the output column names directly; the grouping
# key (the date part of 'valid') becomes the 'FlightDate' column.
# NOTE(review): the dates are taken from 'valid' as stored -- confirm its
# timezone matches whatever data 'FlightDate' is later compared against.
daily_avg = (
    df.groupby(df['valid'].dt.date)
    .agg(
        DailyAvgTemperature=('tmpf', 'mean'),
        DailyAvgDewPoint=('dwpf', 'mean'),
        DailyAvgRelativeHumidity=('relh', 'mean'),
    )
    .round(2)  # keep two decimal places for every averaged column
    .rename_axis('FlightDate')
    .reset_index()
)

daily_avg.info()
# 730 entries

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 730 entries, 0 to 729
Data columns (total 4 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   FlightDate                730 non-null    object 
 1   DailyAvgTemperature       730 non-null    float64
 2   DailyAvgDewPoint          730 non-null    float64
 3   DailyAvgRelativeHumidity  730 non-null    float64
dtypes: float64(3), object(1)
memory usage: 22.9+ KB


In [17]:
# Preview the first five daily rows
daily_avg.head(n=5)

Unnamed: 0,FlightDate,DailyAvgTemperature,DailyAvgDewPoint,DailyAvgRelativeHumidity
0,2021-08-01,80.61,70.79,73.38
1,2021-08-02,79.32,64.76,63.65
2,2021-08-03,75.0,68.06,79.92
3,2021-08-04,74.63,65.17,74.24
4,2021-08-05,76.36,64.46,68.23


In [18]:
# Report (rows, columns) of the daily table
daily_row_count, daily_col_count = daily_avg.shape
print((daily_row_count, daily_col_count))

(730, 4)


In [19]:
# Write the daily averages to CSV, leaving out the RangeIndex
daily_csv_path = '../data/weather/atl_weather_daily_averages.csv'
daily_avg.to_csv(daily_csv_path, index=False)

## Monthly Averages

In [20]:
# Average temperature, dew point, and relative humidity per calendar month.
# Named aggregation assigns the output column names directly; the grouping
# key (the monthly period of 'valid') becomes the 'FlightMonth' column.
monthly_avg = (
    df.groupby(df['valid'].dt.to_period('M'))
    .agg(
        MonthlyAvgTemperature=('tmpf', 'mean'),
        MonthlyAvgDewPoint=('dwpf', 'mean'),
        MonthlyAvgRelativeHumidity=('relh', 'mean'),
    )
    .round(2)  # keep two decimal places for every averaged column
    .rename_axis('FlightMonth')
    .reset_index()
)

monthly_avg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 4 columns):
 #   Column                      Non-Null Count  Dtype    
---  ------                      --------------  -----    
 0   FlightMonth                 24 non-null     period[M]
 1   MonthlyAvgTemperature       24 non-null     float64  
 2   MonthlyAvgDewPoint          24 non-null     float64  
 3   MonthlyAvgRelativeHumidity  24 non-null     float64  
dtypes: float64(3), period[M](1)
memory usage: 896.0 bytes


In [21]:
# Preview the first five monthly rows
monthly_avg.head(n=5)

Unnamed: 0,FlightMonth,MonthlyAvgTemperature,MonthlyAvgDewPoint,MonthlyAvgRelativeHumidity
0,2021-08,78.67,69.68,75.58
1,2021-09,73.56,63.83,73.74
2,2021-10,66.32,57.69,75.78
3,2021-11,52.52,37.33,59.47
4,2021-12,56.46,48.8,77.46


In [22]:
# Report (rows, columns) of the monthly table
monthly_row_count, monthly_col_count = monthly_avg.shape
print((monthly_row_count, monthly_col_count))

(24, 4)


In [23]:
# Write the monthly averages to CSV, leaving out the RangeIndex
monthly_csv_path = '../data/weather/atl_weather_monthly_averages.csv'
monthly_avg.to_csv(monthly_csv_path, index=False)