In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the hourly NOAA QCLCD export; the first CSV column is used as the row index.
df = pd.read_csv(
    "pandas/03_Exercise_NOAA_QCLCD_2011_hourly_13904",
    index_col=0,
)

# Preview the first rows to sanity-check the load.
df.head()


## Cleaning and tidying datetime data

In [None]:
# Build a DatetimeIndex out of the separate 'date' and 'Time' columns.

# Dates are turned into text so they can be joined with the time text.
df['date'] = df['date'].astype(str)

# Left-pad every time value with zeros to a width of four characters (HHMM).
df['Time'] = df['Time'].map('{:0>4}'.format)

# Join date and time text, then parse the result as timestamps.
date_string = df['date'] + df['Time']
date_times = pd.to_datetime(date_string, format='%Y%m%d%H%M')

# df_clean is a new frame indexed by the parsed timestamps.
df_clean = df.set_index(date_times)

# Preview the re-indexed frame.
df_clean.head()



In [None]:
# Summarise column dtypes and non-null counts.
# NOTE(review): the raw file reportedly marks missing readings with 'M', so a
# full non-null count here does not by itself mean no data is missing - confirm.
df_clean.info()

In [None]:
# Turn the text-typed measurement columns into real numbers.
#
# pd.to_numeric's `errors` argument decides what happens to unparseable values:
#   'raise'  -> invalid parsing raises an exception
#   'coerce' -> invalid parsing is set to NaN
#   'ignore' -> invalid parsing returns the input
#
# The numeric columns label missing values as 'M'; coercing converts those
# markers to NaN so the columns end up holding only numeric values.

# dry_bulb_faren from 8 AM to 10 AM on June 20, 2011, before conversion.
print(df_clean.loc['2011-06-20 08:00:00':'2011-6-20 10:00:00', "dry_bulb_faren"])

# Convert dry_bulb_faren; anything unparseable becomes NaN.
df_clean['dry_bulb_faren'] = pd.to_numeric(df_clean['dry_bulb_faren'], errors="coerce")

# The same window after conversion.
print(df_clean.loc['2011-06-20 08:00:00':'2011-06-20 10:00:00', "dry_bulb_faren"])

# Apply the same coercion to wind_speed and dew_point_faren in one step.
df_clean[['wind_speed', 'dew_point_faren']] = (
    df_clean[['wind_speed', 'dew_point_faren']].apply(pd.to_numeric, errors="coerce")
)

In [None]:
# Look at one slice: the observation recorded at 2011-06-20 08:29:00 (all columns).
df_clean.loc["2011-06-20 08:29:00"]

In [None]:
# Statistical EDA: re-inspect the dtypes and non-null counts of df_clean.
df_clean.info()

In [None]:
# Median dry-bulb temperature over three windows of the 2011 data.
dry_bulb = df_clean['dry_bulb_faren']

# Whole data set.
print(dry_bulb.median())

# April through June 2011 (partial-string slice on the datetime index).
print(dry_bulb.loc['2011-Apr':'2011-Jun'].median())

# January 2011 only.
print(dry_bulb.loc['2011-Jan'].median())

In [None]:
# resample() re-buckets a regular time series into a new frequency; it is a
# convenient way to do resampling and frequency conversion on time-indexed data.

# Downsample to daily means. Only the numeric columns are averaged: df_clean
# still carries text columns (e.g. sky_condition), and calling .mean() on those
# raises TypeError on pandas >= 2.0 instead of silently dropping them.
# Frequency aliases: "D" = day, "M" = month, "Y" = year.
daily_mean_2011 = df_clean.select_dtypes(include="number").resample("D").mean()

# Plain NumPy array of the daily mean dry-bulb temperature; .values drops the
# datetime index so it can be compared positionally with another year.
daily_temp_2011 = daily_mean_2011["dry_bulb_faren"].values

daily_mean_2011.head()

In [None]:
# Load the 2010 Austin climate data used as the comparison baseline.
df_climate = pd.read_csv("pandas/weather_data_austin_2010.csv")

# Index the frame by the timestamps parsed from its 'Date' column.
# NOTE(review): the format string has a trailing space and no time fields -
# confirm it matches the actual contents of the 'Date' column.
date = pd.to_datetime(df_climate["Date"], format ='%Y%m%d ')
df_climate = df_climate.set_index(date)

# Downsample to daily means over the numeric columns only; the text 'Date'
# column is still in the frame and would make .mean() raise on pandas >= 2.0.
daily_climate = df_climate.select_dtypes(include="number").resample("D").mean()

# Plain NumPy array of the daily mean temperature (index dropped via .values).
daily_temp_climate = daily_climate["Temperature"].values

# Element-wise difference between the 2011 daily means and the 2010 baseline,
# then the average of that difference. The two arrays are aligned by position,
# not by date.
difference = daily_temp_2011 - daily_temp_climate
print(difference.mean())

daily_climate

## Finding special conditions: sunny vs. overcast

In [None]:
# Compare daily maxima between clear-sky and overcast observations.
# The 'sky_condition' column is compared against 'CLR' (clear) and searched
# for 'OVC' (overcast).

# Rows whose sky condition is exactly "CLR".
sunny = df_clean[df_clean.sky_condition == "CLR"]

# Rows whose sky condition mentions "OVC". na=False treats a missing
# sky_condition as "not overcast" so the mask is always strictly boolean.
overcast = df_clean[df_clean.sky_condition.str.contains("OVC", na=False)]

# Daily maximum of every numeric column. Text columns are excluded up front:
# they cannot be averaged below, and DataFrame.mean() raises on them in
# pandas >= 2.0 rather than dropping them.
sunny_daily_max = sunny.select_dtypes(include="number").resample("D").max()
overcast_daily_max = overcast.select_dtypes(include="number").resample("D").max()

# Per-column difference between the mean daily maximum on clear vs overcast days.
print(sunny_daily_max.mean() - overcast_daily_max.mean())


#### Is there a correlation between temperature and wind speed?
#### Let's find out.

In [None]:
# Plotting backend used by this and the later cells.
import matplotlib.pyplot as plt

# Weekly means of wind speed and dry-bulb temperature.
weekly_mean = df_clean[['wind_speed', 'dry_bulb_faren']].resample("W").mean()

# Correlation matrix between the two weekly series.
print(weekly_mean.corr())

# One subplot per column.
weekly_mean.plot(subplots=True)
plt.show()


## Daily hours of clear sky

In [None]:
# Boolean Series: True where the observation's sky condition is "CLR".
sunny = df_clean["sky_condition"] == "CLR"

# Group the Boolean Series by calendar day once and reuse the grouping.
daily_sunny = sunny.resample("D")

# Number of clear observations per day (True counts as 1).
sunny_hours = daily_sunny.sum()

# Total number of observations per day.
total_hours = daily_sunny.count()

# Share of each day's observations that were clear.
sunny_fraction = sunny_hours / total_hours

# Box plot of the daily clear-sky fraction.
sunny_fraction.plot.box()
plt.show()


## Heat or humidity

In this section, you will explore the maximum temperature and dew point of each month.

In [None]:
# Monthly maximum of dew point and dry-bulb temperature.
monthly_max = df_clean[['dew_point_faren', "dry_bulb_faren"]].resample("M").max()

# One semi-transparent 8-bin histogram per column.
monthly_max.plot.hist(bins=8, alpha=0.5, subplots=True)

# Render the figure.
plt.show()


### Probability of high temperatures


In [None]:
# Highest temperature recorded in August 2010 in the climate baseline.
august_max = df_climate.loc["2010-08", "Temperature"].max()
print(august_max)

# Daily maximum dry-bulb temperature for August 2011.
august_2011 = df_clean.loc['2011-08', "dry_bulb_faren"].resample("D").max()

# Keep only the days whose maximum exceeded the August 2010 maximum.
august_2011_high = august_2011[august_2011 > august_max]

# Normalised cumulative histogram (empirical CDF) of those days, with the plain
# normalised histogram drawn on the same axes. `density=True` replaces the
# `normed=True` keyword, which matplotlib removed in 3.1.
august_2011_high.plot(kind="hist", bins=25, cumulative=True, density=True)
august_2011_high.plot(kind="hist", bins=25, density=True)

# Display the plot, then report how many days exceeded the 2010 maximum.
plt.show()
print(august_2011_high.count())