# Data ingestion & inspection

** NumPy and pandas working together ** 

In [None]:
# Import numpy
import numpy as np

# Pull the values held by the DataFrame out through its .values attribute: np_vals
np_vals = df.values

# Element-wise base-10 logarithm of that array: np_vals_log10
np_vals_log10 = np.log10(np_vals)

# Hand the DataFrame itself to np.log10(): df_log10
df_log10 = np.log10(df)

# Report the container type of each original/transformed pair
print(*map(type, (np_vals, np_vals_log10)))
print(*map(type, (df, df_log10)))


** Zip lists to build dataframes ** 

In [None]:
# Pair every key with its matching value as (key, value) tuples: zipped
zipped = [*zip(list_keys, list_values)]

# Show the paired-up list
print(zipped)

# Turn the (key, value) pairs into a dictionary: data
data = {key: value for key, value in zipped}

# Construct a DataFrame from that dictionary and display it: df
df = pd.DataFrame(data)
print(df)


** Naming columns **

In [None]:
# Build a list of labels: list_labels
# NOTE: this must be a plain list literal. `list[...]` subscripts the `list`
# type itself and does not produce a list of strings, so it cannot be used as
# column labels.
list_labels = ['year', 'artist', 'song', 'chart weeks']

# Assign the list of labels to the columns attribute: df.columns
df.columns = list_labels


# EDA 

** pandas hist, pdf and cdf **

In [None]:
# Lay the two histograms out on separate rows of a single figure
fig, axes = plt.subplots(nrows=2, ncols=1)

# Plot the PDF on the first row.
# `density=True` normalizes the histogram; it replaces the `normed` keyword,
# which has been removed from matplotlib.
df.fraction.plot(ax=axes[0], kind="hist", density=True, bins=30, range=(0, .3))

# Plot the CDF on the second row (cumulative version of the same histogram)
df.fraction.plot(ax=axes[1], kind="hist", density=True, cumulative=True, bins=30, range=(0, .3))

# Show the figure once, after both axes are drawn: calling plt.show() between
# the two plots displays the figure before the CDF has been added to it.
plt.show()


** Separate and plot **

In [None]:
# One figure with three stacked axes (3 rows x 1 column), one per passenger class
fig, axes = plt.subplots(nrows=3, ncols=1)

# First class: box plot of 'fare' for rows where pclass == 1, on the top axes
titanic[titanic['pclass'] == 1].plot.box(y='fare', ax=axes[0])

# Second class: same plot for pclass == 2, on the middle axes
titanic[titanic['pclass'] == 2].plot.box(y='fare', ax=axes[1])

# Third class: same plot for pclass == 3, on the bottom axes
titanic[titanic['pclass'] == 3].plot.box(y='fare', ax=axes[2])

# Render the figure
plt.show()


# Time series in pandas

** Creating and using a DatetimeIndex **

In [None]:
# strptime-style pattern (year-month-day hour:minute) used to parse date_list: time_format
time_format = '%Y-%m-%d %H:%M'

# Parse date_list with that pattern: my_datetimes
my_datetimes = pd.to_datetime(date_list, format=time_format)

# Series of temperature_list values indexed by the parsed datetimes: time_series
time_series = pd.Series(data=temperature_list, index=my_datetimes)


** Partial string indexing and slicing **

In [None]:
# Extract the hour from 9pm to 10pm on '2010-10-11': ts1
# A slice with both endpoints is needed here; a single fully specified
# timestamp label selects only that one instant, not the whole hour.
ts1 = ts0.loc['2010-10-11 21:00:00':'2010-10-11 22:00:00']

# Extract '2010-07-04' from ts0: ts2
# (a date-only string selects every entry that falls on that day)
ts2 = ts0.loc['2010-07-04']

# Extract data from '2010-12-15' to '2010-12-31': ts3
# (label slices include both endpoints)
ts3 = ts0.loc['2010-12-15':'2010-12-31']


** Reindexing the index **

In [None]:
# Conform ts2 to the index of ts1 with no fill method: ts3
ts3 = ts2.reindex(index=ts1.index)

# Same reindexing, but forward-filling the labels missing from ts2: ts4
ts4 = ts2.reindex(index=ts1.index, method="ffill")

# Element-wise sum of ts1 and ts2: sum12
sum12 = ts1.add(ts2)

# Element-wise sum of ts1 and ts3: sum13
sum13 = ts1.add(ts3)

# Element-wise sum of ts1 and ts4: sum14
sum14 = ts1.add(ts4)


** Resampling and frequency ** 

In [None]:
# Group the Temperature column into 6-hour bins and take the mean of each bin: df1
df1 = df["Temperature"].resample(rule="6h").mean()

# Group the Temperature column into daily bins and count the data points in each: df2
df2 = df["Temperature"].resample(rule="D").count()


** Separating and resampling **

In [None]:
# Temperature readings labelled August 2010: august
august = df["Temperature"].loc["2010-08"]

# Highest reading of each August day: august_highs
august_highs = august.resample(rule="D").max()

# Temperature readings labelled February 2010: february
february = df["Temperature"].loc["2010-02"]

# Lowest reading of each February day: february_lows
february_lows = february.resample(rule="D").min()


** Rolling mean and frequency **

In [None]:
# Temperature readings from 2010-08-01 through 2010-08-15: unsmoothed
unsmoothed = df.loc["2010-08-01":"2010-08-15", 'Temperature']

# Rolling mean over a window of 24 observations: smoothed
# (24 observations span 24 hours only if the data are hourly -- TODO confirm)
smoothed = unsmoothed.rolling(24).mean()

# Put both series side by side in one DataFrame: august
august = pd.DataFrame(dict(smoothed=smoothed, unsmoothed=unsmoothed))

# Draw the smoothed and unsmoothed curves on the same axes
august.plot()
plt.show()


** Resample and roll with it **

In [None]:
# Temperature readings labelled August 2010: august
august = df.loc["2010-08", 'Temperature']

# Maximum reading of each day: daily_highs
daily_highs = august.resample(rule="D").max()

# Smooth the daily highs with a rolling mean over 7 consecutive daily values
daily_highs_smoothed = daily_highs.rolling(7).mean()
print(daily_highs_smoothed)



** Method chaining and filtering **

In [None]:
# Remove leading/trailing whitespace from every column label: df.columns
df.columns = df.columns.str.strip()

# Boolean Series, True where 'Destination Airport' contains "DAL": dallas
dallas = df["Destination Airport"].str.contains("DAL")

# Sum the flags per day, i.e. the number of matching rows each day: daily_departures
# (resample needs a datetime-like index on df -- assumed to be set already)
daily_departures = dallas.resample(rule="D").sum()

# Summary statistics of the daily totals: stats
stats = daily_departures.describe()



** Filling in missing values **

In [None]:
# Reset the index of ts2 to ts1, and then use linear interpolation to fill in the NaNs: ts2_interp
# The interpolation technique is selected with the `method` keyword;
# `how` is not a parameter of interpolate().
ts2_interp = ts2.reindex(ts1.index).interpolate(method="linear")


# Compute the absolute difference of ts1 and ts2_interp: differences
differences = np.abs(ts1 - ts2_interp)

# Generate and print summary statistics of the differences
print(differences.describe())


** Timezones and conversions **

In [None]:
# Boolean mask, True for rows whose 'Destination Airport' equals "LAX": mask
mask = df["Destination Airport"] == "LAX"

# Keep only the rows selected by the mask: la
la = df[mask]

# Join the date and wheels-off time columns with a space and parse the
# result as naive (timezone-unaware) datetimes: times_tz_none
times_tz_none = pd.to_datetime(
    la['Date (MM/DD/YYYY)'] + ' ' + la['Wheels-off Time']
)

# Attach the US/Central timezone to the naive datetimes: times_tz_central
times_tz_central = times_tz_none.dt.tz_localize("US/Central")

# Express the same instants in US/Pacific: times_tz_pacific
times_tz_pacific = times_tz_central.dt.tz_convert("US/Pacific")


** Plotting time series, date time index **

In [None]:
# First plot: the DataFrame as-is, before it has a datetime index
df.plot()
plt.show()

# Parse the 'Date' column into datetimes and store it back in place
df["Date"] = pd.to_datetime(df["Date"])

# Promote the parsed 'Date' column to the index (modifies df in place)
df.set_index(keys="Date", inplace=True)

# Second plot: same data, now plotted against the datetime index
df.plot()
plt.show()


** Plotting date ranges, partial indexing **

In [None]:
# Temperature from June through August 2010
df.loc["2010-Jun":"2010-Aug", "Temperature"].plot()
plt.show()
plt.clf()

# Temperature for the span 2010-06-10 through 2010-06-17
df.loc['2010-06-10':'2010-06-17', "Temperature"].plot()
plt.show()
plt.clf()


** CASE STUDY **

** Re-assigning column names ** 

In [None]:
# Break the comma-separated label string into a list: column_labels_list
column_labels_list = column_labels.split(",")

# Use that list as the DataFrame's column labels: df.columns
df.columns = column_labels_list

# New frame without the columns named in list_to_drop: df_dropped
df_dropped = df.drop(columns=list_to_drop)

# Show the first rows of the trimmed frame
print(df_dropped.head())


** Cleaning and tidying datetime data **

In [None]:
# Cast the date column to strings: df_dropped['date']
df_dropped['date'] = df_dropped['date'].astype(str)

# Left-pad every Time value with zeros to a width of 4 characters: df_dropped['Time']
df_dropped['Time'] = df_dropped['Time'].apply('{:0>4}'.format)

# Glue date and padded time together into one string per row: date_string
date_string = df_dropped['date'] + df_dropped['Time']

# Parse the combined strings (year, month, day, hour, minute with no
# separators) into datetimes: date_times
date_times = pd.to_datetime(date_string, format='%Y%m%d%H%M')

# Index the frame by the parsed datetimes: df_clean
df_clean = df_dropped.set_index(date_times)

# Show the first rows of the re-indexed frame
print(df_clean.head())

** Cleaning numeric columns **

In [None]:
# Print the dry_bulb_faren temperature between 8 AM and 9 AM on June 20, 2011
# The time range must be a real slice of two timestamp strings; a single
# string containing a colon-separated "start:end" is treated as one label.
print(df_clean.loc["2011-06-20 08:00:00":"2011-06-20 09:00:00", "dry_bulb_faren"])

# Convert the dry_bulb_faren column to numeric values: df_clean['dry_bulb_faren']
# (errors="coerce" turns entries that cannot be parsed into NaN instead of raising)
df_clean['dry_bulb_faren'] = pd.to_numeric(df_clean['dry_bulb_faren'], errors="coerce")

# Print the transformed dry_bulb_faren temperature between 8 AM and 9 AM on June 20, 2011
print(df_clean.loc["2011-06-20 08:00:00":"2011-06-20 09:00:00", "dry_bulb_faren"])

# Convert the wind_speed and dew_point_faren columns to numeric values
df_clean['wind_speed'] = pd.to_numeric(df_clean['wind_speed'], errors="coerce")
df_clean['dew_point_faren'] = pd.to_numeric(df_clean['dew_point_faren'], errors="coerce")

** Min, median, **

In [None]:
# Median of dry_bulb_faren over the whole frame
print(df_clean['dry_bulb_faren'].median())

# Median of dry_bulb_faren restricted to '2011-Apr' through '2011-Jun'
print(df_clean['dry_bulb_faren'].loc["2011-Apr":"2011-Jun"].median())

# Median of dry_bulb_faren restricted to January 2011
print(df_clean['dry_bulb_faren'].loc["2011-Jan"].median())

** Signal Variance **

In [None]:
# Downsample df_clean by day and aggregate by mean: daily_mean_2011
# numeric_only=True restricts the mean to numeric columns. df_clean also holds
# text columns (e.g. sky_condition), and recent pandas versions raise instead
# of silently skipping them.
daily_mean_2011 = df_clean.resample('D').mean(numeric_only=True)

# Extract the dry_bulb_faren column from daily_mean_2011 using .values: daily_temp_2011
daily_temp_2011 = daily_mean_2011['dry_bulb_faren'].values

# Downsample df_climate by day and aggregate by mean: daily_climate
daily_climate = df_climate.resample('D').mean()

# Extract the Temperature column from daily_climate using .reset_index(): daily_temp_climate
# (resetting the index discards the dates so the two years can be compared position by position)
daily_temp_climate = daily_climate.reset_index()['Temperature']

# Compute the difference between the two arrays and print the mean difference
# NOTE(review): assumes both daily series cover the same number of days -- confirm
difference = daily_temp_2011 - daily_temp_climate
print(difference.mean())

** Sunny or cloudy - finding out min and max temperatures across days **

In [None]:
# Select the rows whose sky_condition is exactly "CLR": sunny
sunny = df_clean.loc[df_clean["sky_condition"] == "CLR"]

# Select the rows whose sky_condition contains "OVC": overcast
# na=False makes a missing sky_condition count as "no match", so the mask
# stays purely boolean and can be used for row selection.
overcast = df_clean.loc[df_clean["sky_condition"].str.contains("OVC", na=False)]

# Resample sunny and overcast, aggregating by maximum daily temperature
sunny_daily_max = sunny.resample("D").max()
overcast_daily_max = overcast.resample("D").max()

# Print the difference between the mean of sunny_daily_max and overcast_daily_max
# numeric_only=True: the frames still contain text columns, which recent
# pandas versions refuse to average instead of silently dropping.
print(sunny_daily_max.mean(numeric_only=True) - overcast_daily_max.mean(numeric_only=True))


** Weekly average temperature and visibility **

In [None]:
# Import matplotlib.pyplot as plt
import matplotlib.pyplot as plt

# Weekly means of the visibility and dry_bulb_faren columns: weekly_mean
weekly_mean = df_clean[["visibility", "dry_bulb_faren"]].resample(rule="W").mean()

# Correlation matrix of the two weekly-mean columns
print(weekly_mean.corr())

# One subplot per column of weekly_mean
weekly_mean.plot(subplots=True)
plt.show()


** Daily hours of clear sky **

In [None]:
# Boolean Series, True where sky_condition is exactly "CLR": sunny
sunny = df_clean["sky_condition"] == "CLR"

# Per-day sum of the True flags: sunny_hours
sunny_hours = sunny.resample(rule="D").sum()

# Per-day count of all entries: total_hours
total_hours = sunny.resample(rule="D").count()

# Share of each day's entries that were flagged sunny: sunny_fraction
sunny_fraction = sunny_hours.div(total_hours)

# Box plot of the daily sunny fraction
sunny_fraction.plot.box()
plt.show()

** Heat or humidity **

Dew point is a measure of relative humidity based on pressure and temperature. A dew point above 65 is considered uncomfortable while a temperature above 90 is also considered uncomfortable.

In this exercise, you will explore the maximum temperature and dew point of each month. The columns of interest are 'dew_point_faren' and 'dry_bulb_faren'. After resampling them appropriately to get the maximum temperature and dew point in each month, generate a histogram of these values as subplots. Uncomfortably, you will notice that the maximum dew point is above 65 every month!

In [None]:
# Monthly maximum of the dew_point_faren and dry_bulb_faren columns: monthly_max
monthly_max = df_clean[["dew_point_faren", "dry_bulb_faren"]].resample(rule="M").max()

# Histogram per column (8 bins, half-transparent bars), each on its own subplot
monthly_max.plot.hist(bins=8, alpha=0.5, subplots=True)

# Render the figure
plt.show()

** Probability of high temperatures **

In [None]:
# Extract the maximum temperature in August 2010 from df_climate: august_max
august_max = df_climate.loc["2010-08", "Temperature"].max()
print(august_max)

# Resample the August 2011 temperatures in df_clean by day and aggregate the maximum value: august_2011
august_2011 = df_clean.loc["2011-08", "dry_bulb_faren"].resample("D").max()

# Keep only the days in august_2011 whose daily maximum exceeded august_max: august_2011_high
august_2011_high = august_2011[august_2011 > august_max]

# Construct a CDF of august_2011_high
# `density=True` normalizes the histogram; it replaces the `normed` keyword,
# which has been removed from matplotlib.
august_2011_high.plot(kind="hist", bins=25, density=True, cumulative=True)

# Display the plot
plt.show()
