In [2]:
# Import pandas, numpy, and matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# seaborn is a data visualization library built on matplotlib
import seaborn as sns

# set the plotting style
sns.set_style("whitegrid")
     

In [11]:
# load data sets for seattle and orlando precipitation
df_seattle = pd.read_csv(
    'https://raw.githubusercontent.com/naomi-rlm/Weather-Project/main/data/seattle_rain.csv'
)
df_orlando = pd.read_csv(
     'https://raw.githubusercontent.com/naomi-rlm/Weather-Project/main/data/orlando_rain.csv'
)

In [15]:
# fix date data type
df_seattle['DATE'] = pd.to_datetime(df_seattle['DATE']) 
df_orlando['DATE'] = pd.to_datetime(df_orlando['DATE']) 

In [21]:
# outer merge with the key DATE keeping the columns DATE and PRCP
df = df_orlando[['DATE', 'PRCP']].merge(df_seattle[['DATE', 'PRCP']], on='DATE', how='outer')

In [29]:
# transform data frame to have data under columns 'city' and 'precipitation' while still keeping 'DATE'
df = pd.melt(df, id_vars='DATE', var_name='city', value_name='precipitation')

ValueError: value_name (precipitation) cannot match an element in the DataFrame columns.

In [28]:
df.head()

Unnamed: 0,DATE,city,precipitation
0,2018-01-01,PRCP_x,0.04
1,2018-01-02,PRCP_x,0.01
2,2018-01-03,PRCP_x,1.02
3,2018-01-04,PRCP_x,0.0
4,2018-01-05,PRCP_x,0.0


In [30]:
# change PRCP_x and PRCP_y to their actual city codes
df.loc[df['city'] == 'PRCP_x', 'city'] = 'ORL'
df.loc[df['city'] == 'PRCP_y', 'city'] = 'SEA'

In [33]:
# decapitaize 'DATE' column to make data frame column names more uniform
df = df.rename(columns={'DATE': 'date'})

In [37]:
# determining the number NAs in the data frame for Seattle and Orlando
df.isna().sum()

date               0
city               0
precipitation    192
dtype: int64

In [38]:
df.loc[df['city'] == 'SEA', 'precipitation'].isna().sum()

np.int64(190)

In [39]:
df.loc[df['city'] == 'ORL', 'precipitation'].isna().sum()

np.int64(2)

In [43]:
# define a column that labels each day by the day of the year
df['day_of_year'] = pd.DatetimeIndex(df['date']).day_of_year

In [44]:
# mean precipitation for each day in Seattle, averaged across years
mean_day_precipitation = df.loc[
    df['city'] == 'SEA',
    ['precipitation', 'day_of_year']
].groupby(
    'day_of_year'
).mean()

In [45]:
#  index of each row where precipitation is missing
indices = np.where(df['precipitation'].isna() == True)[0]

In [46]:
for index in indices:
    df.loc[index, 'precipitation'] = mean_day_precipitation.loc[df.loc[index,'day_of_year']].values[0]

In [48]:
df.isna().sum()

date             0
city             0
precipitation    0
day_of_year      0
dtype: int64

In [51]:
df.to_csv('clean_seattle_orlando_weather.csv', encoding='utf-8-sig', index=False)

In [50]:
import os
os.getcwd()

'C:\\Users\\naomi'