# Pandas foundations

In [None]:
# Pandas depends upon and interoperates with NumPy, the Python library for fast numeric array computations
# For example, you can use the DataFrame attribute values to represent a DataFrame df as a NumPy array
# You can also pass pandas data structures to NumPy methods

In [None]:


# Slicing rows and columns (optionally with a stride): iloc slices by position, loc slices by label
# Indexers use square brackets, rows first then columns, e.g. df.iloc[:5, :] gives the first 5 rows
# of the dataframe and df.iloc[:-5, :] gives every row except the last 5
df.iloc[:5, :]
df.iloc[:-5, :]

In [None]:
# head(n) returns the first n rows of the dataframe
df.head(n=50)
df.head(n=15)

In [None]:
# tail(n) returns the last n rows of the dataframe
df.tail(n=10)
df.tail(n=20)

In [None]:
# info() prints a concise summary of the dataframe: the number of rows and columns,
# the non-null count per column and the dtype of each column
df.info()

In [None]:
# Broadcasting: assign a single value to every element of a slice
# Here ::3 selects every 3rd row starting from row 0, and -1 selects the last column
# NOTE(review): np (numpy) is not imported anywhere in the visible cells - confirm it is imported earlier
df.iloc[::3,-1] = np.nan

In [None]:
# Each column of a dataframe is a specialised data structure called a 'series':
# a 1 dimensional labelled numpy array
first_column = df['First column']
type(first_column)

# The values attribute exposes the series data as a numpy array
first_column.values

# Creating dataframes in Python

In [None]:
# 1) read in data
# 2) create a dictionary ... where the keys of the dictionary become the column names in the resulting dataframe
# 3) from lists

# 1) read in data

In [None]:
# Useful read_csv keyword options:
# - names: assigning column labels
# - index_col: assigning index
# - parse_dates: parsing datetimes
# - na_values: parsing NaNs

# Replacement column labels used for the second read: new_labels
new_labels = ['year', 'population']

# Read the file with the labels found in the file itself: df1
df1 = pd.read_csv('world_population.csv')

# Read the file again, treating row 0 as the header and overriding it with new_labels: df2
df2 = pd.read_csv('world_population.csv', names=new_labels, header=0)

# Print both the DataFrames
for frame in (df1, df2):
    print(frame)

In [None]:
# Not all data files are clean and tidy
# Pandas provides methods for reading those not-so-perfect data files that you encounter far too often

In [None]:
# First attempt: read the raw file with default settings and inspect it: df1
df1 = pd.read_csv(file_messy)
print(df1.head())

# Second attempt: space separated values, header on row 3, '#' starts a comment: df2
# (sep is the primary name of the delimiter argument)
df2 = pd.read_csv(file_messy, sep=' ', header=3, comment='#')
print(df2.head())

# Persist the cleaned DataFrame without its index, as CSV and as an excel workbook
df2.to_csv(file_clean, index=False)
df2.to_excel('file_clean.xlsx', index=False)

# 2) from a dictionary

In [4]:
import pandas as pd

# Each keyword becomes a column name; each list holds that column's values
data = dict(
    weekday=['Sun', 'Sun', 'Mon', 'Mon'],
    city=['Austin', 'Dallas', 'Austin', 'Dallas'],
    visitors=[139, 237, 326, 456],
    signups=[7, 12, 3, 5],
)
users = pd.DataFrame(data=data)
print(users)

     city  signups  visitors weekday
0  Austin        7       139     Sun
1  Dallas       12       237     Sun
2  Austin        3       326     Mon
3  Dallas        5       456     Mon


# 3) from lists

In [5]:
import pandas as pd

# Column values, one list per column
cities = ['Austin', 'Dallas', 'Austin', 'Dallas']
signups = [7, 12, 3, 5]
visitors = [139, 237, 326, 456]
weekdays = ['Sun', 'Sun', 'Mon', 'Mon']

# Column labels, in the same order as the value lists below
list_labels = ['city', 'signups', 'visitors', 'weekday']
list_cols = [cities, signups, visitors, weekdays]

# Pair each label with its column to get a list of (label, values) tuples
zipped = [*zip(list_labels, list_cols)]
print(zipped)

# Turn the (label, values) pairs into a dictionary and build the DataFrame from it
data = {label: values for label, values in zipped}
users = pd.DataFrame(data=data)
print(users)

[('city', ['Austin', 'Dallas', 'Austin', 'Dallas']), ('signups', [7, 12, 3, 5]), ('visitors', [139, 237, 326, 456]), ('weekday', ['Sun', 'Sun', 'Mon', 'Mon'])]
     city  signups  visitors weekday
0  Austin        7       139     Sun
1  Dallas       12       237     Sun
2  Austin        3       326     Mon
3  Dallas        5       456     Mon


In [None]:
# Pair every key with its value list, giving a list of (key, value) tuples: zipped
zipped = [*zip(list_keys, list_values)]

# Inspect the list using print()
print(zipped)

# Convert the pairs into a dictionary: data
data = {key: value for key, value in zipped}

# Build and inspect a DataFrame from the dictionary: df
df = pd.DataFrame(data=data)
print(df)

In [None]:
# Broadcasting
# Add a new column: the scalar on the right-hand side is repeated for every row
users['fees'] = 0 # Broadcasts to entire column

In [None]:
# Broadcasting with a dict
import pandas as pd

heights = [59.0, 65.2, 62.9, 65.4, 63.7, 65.7, 64.1]

# 'height' supplies one value per row; the scalar 'M' is broadcast to the entire 'sex' column
data = dict(height=heights, sex='M')
results = pd.DataFrame(data=data)
print(results)

In [None]:
# Broadcasting (a NumPy feature) can be used implicitly when creating pandas DataFrames
# Scalar that will be repeated on every row: state
state = 'PA'

# Dictionary mixing the scalar with the list of cities: data
data = dict(state=state, city=cities)

# Build the DataFrame from the dictionary and display it: df
df = pd.DataFrame(data=data)
print(df)

# Change the columns and indexes using the .columns and .index attributes of the dataframe

In [None]:
# Indexes and columns
# Assigning a list to .columns relabels the columns in place
results.columns = ['height (in)', 'sex']
# We can assign a list of strings as the row index as long as its length matches the number of rows!
results.index = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
print(results)

In [None]:
# You can use the DataFrame attribute df.columns to view and assign new string labels to columns in a pandas DataFrame
# Build a list of labels: list_labels
list_labels = ['year','artist','song','chart weeks']

# Assign the list of labels to the columns attribute: df.columns
# (the list must contain exactly one label per existing column)
df.columns = list_labels

# Visual exploratory analysis

# Basic plotting - see also data visualisation code

In [None]:
# Default behaviour: every column is drawn on one set of axes
df.plot()
plt.show()

# One subplot per column
df.plot(subplots=True)
plt.show()

# Only the Dew Point data
column_list1 = ['Dew Point (deg F)']
df.loc[:, column_list1].plot()
plt.show()

# Dew Point and Temperature data together, leaving out the Pressure data
column_list2 = ['Temperature (deg F)', 'Dew Point (deg F)']
df.loc[:, column_list2].plot()
plt.show()

# Basic line plot

In [16]:
# Columns to draw on the y-axis: y_columns
y_columns = ['AAPL', 'IBM']

# Line plot of both columns against the 'Month' column
df.plot.line(x='Month', y=y_columns)

# Label the y-axis and give the plot a title
plt.ylabel('Price ($US)')
plt.title('Monthly stock prices')

# Display the plot
plt.show()

# Scatter plot

In [None]:
# Scatter plot of mpg against hp; sizes sets the marker size of each point
df.plot.scatter(x='hp', y='mpg', s=sizes)

# Label both axes and give the plot a title
plt.xlabel('Horse-power')
plt.ylabel('Fuel efficiency (mpg)')
plt.title('Fuel efficiency vs Horse-power')

# Display the plot
plt.show()

# Box plot

In [None]:
# Columns to be plotted: cols
cols = ['weight', 'mpg']

# One box plot per column, each on its own subplot
df.loc[:, cols].plot.box(subplots=True)

# Display the plot
plt.show()

# Histograms of PDF and CDF

In [None]:
# Create a figure with two rows so the two histograms appear one above the other
fig, axes = plt.subplots(nrows=2, ncols=1)

# Plot the PDF: density=True normalises the histogram so that its area is 1
# (matplotlib's hist no longer accepts the older 'normed' keyword; 'density' replaces it)
df['fraction'].plot(ax=axes[0], kind='hist', density=True, bins=30, range=(0,.3))
plt.show()

# Plot the CDF: cumulative=True accumulates the normalised bin heights
df['fraction'].plot(ax=axes[1], kind='hist', density=True, cumulative=True, bins=30, range=(0,.3))
plt.show()

# Statistical exploratory analysis

In [None]:
# Summary statistics: counts (number of non-null entries), mean (ignores null entries), stds, quartiles
df.describe()
df.count()
df.mean()
df.std()
df.median()
# A quantile is a percentile expressed as a fraction; 0.5 (the median) is the default
df.quantile(q=0.5)
# Passing q as the list [0.25, 0.75] returns the 25th and 75th percentiles, i.e. the bounds of the interquartile range
q = [0.25, 0.75]
df.quantile(q=q)
# Largest and smallest values
df.max()
df.min()

In [None]:
# Smallest, then largest, value of the Engineering column
engineering = df['Engineering']
print(engineering.min())
print(engineering.max())

# Average across the columns of each row, i.e. the mean percentage per year: mean
mean = df.mean(axis=1)

# Plot the average percentage per year and display it
mean.plot()
plt.show()

In [None]:
# Outliers can make the mean and the median of a data set differ considerably
# Summary statistics of the fare column
fare = df['fare']
print(fare.describe())

# Box plot of the fare column
fare.plot.box()

# Show the plot
plt.show()

In [None]:
# Number of non-null entries (countries reported) in the 2015 column
print(df['2015'].count())

# 5th and 95th percentiles
print(df.quantile(q=[0.05, 0.95]))

# Box plot of the selected year columns
years = ['1800', '1850', '1900', '1950', '2000']
df.loc[:, years].plot.box()
plt.show()

In [None]:
# Print the mean of the January and March data (both values on one line)
print(january.mean(), march.mean())

# Print the standard deviation of the January and March data (both values on one line)
print(january.std(), march.std())

# Uniques and factors

In [None]:
# unique() returns the distinct values found in column 'A'
df['A'].unique()

In [None]:
# Filtering by species: build a boolean mask per species, then select the matching rows (all columns)
indices = iris['species'].eq('setosa')
setosa = iris.loc[indices, :]  # new DataFrame with the setosa rows
indices = iris['species'].eq('versicolor')
versicolor = iris.loc[indices, :]  # new DataFrame with the versicolor rows
indices = iris['species'].eq('virginica')
virginica = iris.loc[indices, :]  # new DataFrame with the virginica rows

# Computing the differences in statistics between different groups within a dataframe

In [None]:
# Compute the global mean and global standard deviation: global_mean, global_std
# numeric_only=True is needed because df holds the string column 'origin';
# recent pandas raises a TypeError instead of silently skipping non-numeric columns
global_mean = df.mean(numeric_only=True)
global_std = df.std(numeric_only=True)

# Filter the US population from the origin column: us
us = df[df['origin'] == 'US']

# Compute the US mean and US standard deviation: us_mean, us_std
us_mean = us.mean(numeric_only=True)
us_std = us.std(numeric_only=True)

# Print the differences between the US subset and the whole data set
print(us_mean - global_mean)
print(us_std - global_std)

In [None]:
# Filtering sub-populations and plotting them side by side quickly reveals differences in their distributions
# Three rows, one column: one axes per passenger class
fig, axes = plt.subplots(nrows=3, ncols=1)

# Box plot of the fare prices for the First, Second and Third passenger class, in that order
for axis, passenger_class in zip(axes, (1, 2, 3)):
    titanic.loc[titanic['pclass'] == passenger_class].plot(ax=axis, y='fare', kind='box')

# Display the plot
plt.show()

# Indexing pandas time series

# Using pandas to read datetime objects

In [None]:
# The read_csv() function can read strings into datetime objects. You need to specify 'parse_dates=True'
# ISO 8601 format, which is yyyy-mm-dd hh:mm:ss

In [None]:
# Read the sales data, parsing the 'Date' column as datetimes and using it as the index
sales = pd.read_csv('sales-feb-2015.csv', parse_dates=True, index_col = 'Date')
sales.head()
sales.info()
# Selecting a single datetime
sales.loc['2015-02-19 11:00:00', 'Company']
# Selecting a whole day
sales.loc['2015-2-5']
# Partial datetime string selection
# Alternative ways of selecting days
# (string literals must use straight quotes; typographic quotes are a syntax error)
sales.loc['February 5, 2015']
sales.loc['2015-Feb-5']
# Selecting a whole month
sales.loc['2015-2']
# Selecting a whole year
sales.loc['2015']
# Slicing using datetimes
sales.loc['2015-2-16':'2015-2-20']
# Converting strings to datetimes
evening_2_11 = pd.to_datetime(['2015-2-11 20:00', '2015-2-11 21:00', '2015-2-11 22:00', '2015-2-11 23:00'])
# Reindexing with forward fill
sales.reindex(evening_2_11, method='ffill')
# Reindexing with back fill
sales.reindex(evening_2_11, method='bfill')

In [None]:
# Format of the strings in date_list (year-month-day hour:minute): time_format
time_format = '%Y-%m-%d %H:%M'

# Parse date_list with that format: my_datetimes
my_datetimes = pd.to_datetime(date_list, format=time_format)

# Series of temperature_list values indexed by my_datetimes: time_series
time_series = pd.Series(data=temperature_list, index=my_datetimes)

# Slicing ranges of time series

In [None]:
# Extract the hour from 9pm to 10pm on '2010-10-11': ts1
# A slice is used so that the whole 21:00-22:00 range is returned (both end points included)
# rather than the single entry stamped exactly 21:00:00
ts1 = ts0.loc['2010-10-11 21:00:00':'2010-10-11 22:00:00']

# Extract '2010-07-04' from ts0 (partial string selects the entire day): ts2
ts2 = ts0.loc['2010-07-04']

# Extract data from '2010-12-15' to '2010-12-31' (both days included): ts3
ts3 = ts0.loc['2010-12-15':'2010-12-31']

In [None]:
# Partial string selection on the datetime index
# .loc is used so that the strings are always matched against the row index,
# never against column labels
climate2010.loc['2010-05-31 22:00:00'] # datetime
climate2010.loc['2010-06-01'] # Entire day
climate2010.loc['2010-04'] # Entire month
climate2010.loc['2010-09':'2010-10'] # 2 months

# Reindexing time series

In [None]:
# Reindexing is useful in preparation for adding or otherwise combining two time series data sets. 
# To reindex the data, we provide a new index and ask pandas to try and match the old data to the new index. 
# If data is unavailable for one of the new index dates or times, you must tell pandas how to fill it in.
# Otherwise, pandas will fill with NaN by default.

In [None]:
# Index that ts2 is aligned to in both reindex calls
target_index = ts1.index

# Reindex without a fill method (labels missing from ts2 become NaN): ts3
ts3 = ts2.reindex(target_index)

# Reindex with forward fill: ts4
ts4 = ts2.reindex(target_index, method='ffill')

# Element-wise sums of ts1 with ts2, ts3 and ts4: sum12, sum13, sum14
sum12 = ts1 + ts2
sum13 = ts1 + ts3
sum14 = ts1 + ts4

# Resampling time series data

In [None]:
# Statistical methods over different time intervals mean(), sum(), count(), etc.
# Down-sampling reduces datetime rows to slower frequency
# Up-sampling increases datetime rows to faster frequency
# Aggregating means
# numeric_only=True restricts the mean to numeric columns; sales also holds text columns
# and recent pandas raises a TypeError rather than skipping them
daily_mean = sales.resample('D').mean(numeric_only=True)
# Summing daily sales using method chaining
sales.resample('D').sum()
# Calculating maximum daily sales using method chaining
sales.resample('D').sum().max()
# Resampling on strings, e.g. counting variables by week
sales.resample('W').count()
# Other options:
# 'min', 'T': minute
# 'h' (formerly 'H'): hour
# 'D': day
# 'B': business day
# 'W': week
# 'M': month
# 'Q': quarter
# 'A': year
# NOTE(review): recent pandas releases rename some of these aliases
# (e.g. 'M' -> 'ME', 'A' -> 'YE'); check the offset alias table of the installed version

# Calculating sum of units every fortnight!
sales.loc[:,'Units'].resample('2W').sum()

# Up-sampling and filling
two_days = sales.loc['2015-2-4': '2015-2-5', 'Units']
# lowercase 'h' is the non-deprecated hour alias
two_days.resample('4h').ffill()



In [None]:
# Downsampling and upsampling share the same syntax; only the chained methods differ.
# Both rely on 'method chaining' - df.method1().method2().method3() - where the output of one call feeds the next.
temperature = df['Temperature']

# 6 hour bins, aggregated by mean: df1
df1 = temperature.resample('6h').mean()

# Daily bins, counting the number of data points in each: df2
df2 = temperature.resample('D').count()

# Separating and resampling

In [None]:
# Temperature readings for August 2010: august
august = df['Temperature'].loc['2010-August']

# Daily highest temperatures in August: august_highs
august_highs = august.resample('D').max()

# Temperature readings for February 2010: february
february = df['Temperature'].loc['2010-Feb']

# Daily lowest temperatures in February: february_lows
february_lows = february.resample('D').min()

# Rolling mean and frequency

In [19]:
# Rolling means (moving averages) smooth out short-term fluctuations in time series data and highlight long-term trends
# .rolling() is always used with method chaining: call .rolling() first, then chain an aggregation method
# e.g. hourly_data.rolling(window=24).mean() computes, for each hourly point, the mean of the 24-hour window behind it
# The output keeps the frequency of the input: it is still hourly

# Temperature from 2010-Aug-01 to 2010-Aug-15: unsmoothed
unsmoothed = df['Temperature'].loc['2010-Aug-01':'2010-Aug-15']

# 24 point rolling mean of the same data: smoothed
smoothed = unsmoothed.rolling(window=24).mean()

# DataFrame holding both series as columns: august
august = pd.DataFrame(dict(smoothed=smoothed, unsmoothed=unsmoothed))

# Plot both smoothed and unsmoothed data
august.plot()
plt.show()

# Resampling and rolling average chained together

In [None]:
# Resample and roll with it
# August 2010 temperature data: august
august = df['Temperature'].loc['2010-8']

# Daily maximum temperatures: daily_highs
daily_highs = august.resample('D').max()

# 7-day rolling mean of the daily highs, smoothing the daily high temperatures in August
daily_highs_smoothed = daily_highs.rolling(window=7).mean()
print(daily_highs_smoothed)

# Manipulating time series data

In [None]:
# Use the .dt accessor to extract the hour component of each datetime in the 'Date' column
# NOTE(review): this needs 'Date' to be a regular datetime column of sales, not its index - confirm
sales['Date'].dt.hour

In [None]:
# Boolean reduction: str.contains gives True/False per row, and summing counts the True values,
# i.e. the number of products whose name contains 'ware'
sales['Product'].str.contains('ware').sum()

In [None]:
# Remove leading/trailing whitespace from every column name: df.columns
df.columns = df.columns.str.strip()

# Boolean series, True where the destination airport contains 'DAL': dallas
destinations = df['Destination Airport']
dallas = destinations.str.contains('DAL')

# Daily sum of the boolean series = number of Dallas departures each day: daily_departures
daily_departures = dallas.resample('D').sum()

# Summary statistics for daily Dallas departures: stats
stats = daily_departures.describe()

# Missing values and interpolation

In [None]:
# Linear interpolation (other methods are available!)
# Resample to yearly bins, keep the first value of each bin, then fill the gaps linearly
# NOTE(review): recent pandas releases deprecate the 'A' alias in favour of 'YE' - check the installed version
population.resample('A').first().interpolate('linear')

In [None]:
# Reindex ts2 onto the index of ts1, then fill the resulting NaNs by linear interpolation: ts2_interp
# (the interpolation scheme is chosen with the 'method' keyword; interpolate has no 'how' parameter)
ts2_interp = ts2.reindex(ts1.index).interpolate(method='linear')

# Compute the absolute difference of ts1 and ts2_interp: differences
differences = np.abs(ts1 - ts2_interp)

# Generate and print summary statistics of the differences
print(differences.describe())

# Time zones and conversion

In [None]:
# Time zone handling in pandas typically assumes you are working on the Index of the Series;
# for a datetime column the same methods are reached through the .dt accessor
# Attach the US/Central time zone to the naive datetimes
central = sales['Date'].dt.tz_localize(tz='US/Central')
# Convert the localised datetimes to US/Eastern
central.dt.tz_convert(tz='US/Eastern')
# Localise and convert in one go with method chaining!
sales['Date'].dt.tz_localize(tz='US/Central').dt.tz_convert(tz='US/Eastern')

In [None]:
# Build a Boolean mask that is True for flights whose destination airport is 'LAX': mask
mask = df['Destination Airport'].eq('LAX')

# Keep only the rows selected by the mask: la
la = df.loc[mask]

# Join the date and wheels-off time strings and parse them as datetimes: times_tz_none
date_time_strings = la['Date (MM/DD/YYYY)'] + ' ' + la['Wheels-off Time']
times_tz_none = pd.to_datetime(date_time_strings)

# Attach the US/Central time zone: times_tz_central
times_tz_central = times_tz_none.dt.tz_localize(tz='US/Central')

# Convert from US/Central to US/Pacific: times_tz_pacific
times_tz_pacific = times_tz_central.dt.tz_convert(tz='US/Pacific')

# Plotting time series

In [None]:
# Select the 2012 rows and the 'Close' and 'Volume' columns, then plot each column on its own subplot
sp500.loc['2012', ['Close','Volume']].plot(subplots=True)
plt.show()

In [None]:
# Plotting time series, datetime indexing

# First plot: raw data, before a datetime index is set
df.plot()
plt.show()

# Replace the 'Date' column with parsed datetime objects
df['Date'] = pd.to_datetime(df['Date'])

# Make the converted 'Date' column the index (modifies df in place)
df.set_index('Date', inplace=True)

# Second plot: the x-axis is now datetime aware!
df.plot()
plt.show()

In [None]:
# Plotting date ranges, partial indexing
# With a DatetimeIndex set, partial string indexing and slicing become a convenient way
# to pick the period to plot
temperature = df['Temperature']

# Summer data: June to August 2010
temperature.loc['2010-Jun':'2010-Aug'].plot()
plt.show()
plt.clf()

# One week of data: 10 to 17 June 2010
temperature.loc['2010-06-10':'2010-06-17'].plot()
plt.show()
plt.clf()

# Case study on weather data

# Reading and cleaning the data

In [None]:
# Import pandas
import pandas as pd

# Read the data file with default settings (first row used as header) and preview it: df
df = pd.read_csv(file_name)
print(df.head(5))

# Read it again with header=None so no row is treated as column labels, and preview it: df_headers
df_headers = pd.read_csv(file_name, header=None)
print(df_headers.head(5))

In [None]:
# Break the comma separated label string into a list: column_labels_list
column_labels_list = column_labels.split(',')

# Use the list as the new column labels: df.columns
df.columns = column_labels_list

# Copy of df without the unwanted columns: df_dropped
df_dropped = df.drop(columns=list_to_drop)

# Preview the result
print(df_dropped.head())

# Cleaning and tidying datetime data

In [None]:
# The full power of pandas time series requires a DatetimeIndex, built here from the date and Time columns

# Dates as strings: df_dropped['date']
df_dropped['date'] = df_dropped['date'].astype(str)

# Times right-aligned in a width of 4 and padded on the left with zeros: df_dropped['Time']
df_dropped['Time'] = df_dropped['Time'].apply(lambda value: f'{value:0>4}')

# Date and time strings joined element-wise: date_string
date_string = df_dropped['date'] + df_dropped['Time']

# Parse the combined strings (YYYYMMDDHHMM) as datetimes: date_times
date_times = pd.to_datetime(date_string, format='%Y%m%d%H%M')

# New DataFrame indexed by date_times: df_clean
df_clean = df_dropped.set_index(date_times)

# Preview the result
print(df_clean.head())

# Cleaning the numeric columns

In [None]:
# Window from 8 AM to 9 AM on June 20, 2011
morning = slice('2011-6-20 8:00:00', '2011-6-20 9:00:00')

# dry_bulb_faren readings in that window, before conversion
print(df_clean.loc[morning, 'dry_bulb_faren'])

# Convert dry_bulb_faren to numeric values; entries that cannot be parsed become NaN
df_clean['dry_bulb_faren'] = pd.to_numeric(df_clean['dry_bulb_faren'], errors='coerce')

# The same window after conversion
print(df_clean.loc[morning, 'dry_bulb_faren'])

# Convert the wind_speed and dew_point_faren columns in the same way
for column in ('wind_speed', 'dew_point_faren'):
    df_clean[column] = pd.to_numeric(df_clean[column], errors='coerce')

# Apply statistical exploratory data analysis

In [None]:
dry_bulb = df_clean['dry_bulb_faren']

# Median of the whole dry_bulb_faren column
print(dry_bulb.median())

# Median for the time range '2011-Apr':'2011-Jun'
print(dry_bulb.loc['2011-Apr':'2011-Jun'].median())

# Median for the month of January
print(dry_bulb.loc['2011-Jan'].median())

# Signal variance

In [None]:
# Downsample df_clean by day and aggregate by mean: daily_mean_2011
# numeric_only=True is required because df_clean still holds text columns (e.g. sky_condition);
# recent pandas raises a TypeError instead of silently skipping them
daily_mean_2011 = df_clean.resample('D').mean(numeric_only=True)

# Extract the dry_bulb_faren column from daily_mean_2011 using .values (a plain numpy array): daily_temp_2011
daily_temp_2011 = daily_mean_2011['dry_bulb_faren'].values

# Downsample df_climate by day and aggregate by mean: daily_climate
# NOTE(review): numeric_only=True is a no-op if df_climate is entirely numeric - confirm its columns
daily_climate = df_climate.resample('D').mean(numeric_only=True)

# Extract the Temperature column from daily_climate using .reset_index(): daily_temp_climate
# (resetting the index gives positional labels, so the subtraction below pairs values by position)
daily_temp_climate = daily_climate.reset_index()['Temperature']

# Compute the difference between the two arrays and print the mean difference
difference = daily_temp_2011 - daily_temp_climate
print(difference.mean())

In [None]:
# Select the rows whose sky condition is exactly 'CLR' (sunny): sunny
sunny = df_clean.loc[df_clean['sky_condition']=='CLR']

# Select the rows whose sky condition contains 'OVC' (overcast): overcast
overcast = df_clean.loc[df_clean['sky_condition'].str.contains('OVC')]

# Resample sunny and overcast, aggregating by maximum daily value
# numeric_only=True keeps the text columns out of the aggregation; recent pandas no longer
# drops them silently, and the mean below is only defined for numeric columns
sunny_daily_max = sunny.resample('D').max(numeric_only=True)
overcast_daily_max = overcast.resample('D').max(numeric_only=True)

# Print the difference between the mean of sunny_daily_max and overcast_daily_max
print(sunny_daily_max.mean() - overcast_daily_max.mean())

# Apply visual exploratory analysis

In [None]:
# Import matplotlib.pyplot as plt
import matplotlib.pyplot as plt

# Weekly mean of the visibility and dry_bulb_faren columns: weekly_mean
selected = df_clean.loc[:, ['visibility', 'dry_bulb_faren']]
weekly_mean = selected.resample('W').mean()

# Correlation between the two weekly means
print(weekly_mean.corr())

# One subplot per column
weekly_mean.plot(subplots=True)
plt.show()

In [None]:
# Boolean Series, True where the sky condition is 'CLR': sunny
sunny = df_clean['sky_condition'].eq('CLR')

# Daily bins over the Boolean Series
daily = sunny.resample('D')

# Sum of True values per day: sunny_hours
sunny_hours = daily.sum()

# Number of entries per day: total_hours
total_hours = daily.count()

# Share of each day's entries that are sunny: sunny_fraction
sunny_fraction = sunny_hours / total_hours

# Box plot of sunny_fraction
sunny_fraction.plot.box()
plt.show()

In [None]:
# Monthly maximum of dew_point_faren and dry_bulb_faren: monthly_max
temperatures = df_clean.loc[:, ['dew_point_faren', 'dry_bulb_faren']]
monthly_max = temperatures.resample('M').max()

# Histogram per column with 8 bins, half-transparent bars, one subplot each
monthly_max.plot.hist(bins=8, alpha=0.5, subplots=True)

# Show the plot
plt.show()

In [None]:
# Extract the maximum temperature in August 2010 from df_climate: august_max
august_max = df_climate.loc['2010-Aug','Temperature'].max()
print(august_max)

# Resample the August 2011 temperatures in df_clean by day and aggregate the maximum value: august_2011
august_2011 = df_clean.loc['2011-Aug','dry_bulb_faren'].resample('D').max()

# Keep only the days in august_2011 where the value exceeded august_max: august_2011_high
august_2011_high = august_2011.loc[august_2011>august_max]

# Construct a CDF of august_2011_high: a normalised, cumulative histogram
# (density=True replaces the 'normed' keyword, which matplotlib's hist no longer accepts)
august_2011_high.plot(kind='hist', bins=25, density=True, cumulative=True)

# Display the plot
plt.show()