# [CoLab 6](https://eds-217-essential-python.github.io/course-materials/coding-colabs/6b_advanced_data_manipulation.html)

09/10/2024


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Source URLs for the temperature anomaly and CO2 concentration datasets
temp_url = "https://bit.ly/monthly_temp"
co2_url = "https://bit.ly/monthly_CO2"

# Read each CSV, parsing the 'Date' column into datetimes while loading
temp_df = pd.read_csv(temp_url, parse_dates=['Date'])
co2_df = pd.read_csv(co2_url, parse_dates=['Date'])

# Show the first rows of each table under its own heading
print("Temperature data:", temp_df.head(), sep="\n")
print("\nCO2 data:", co2_df.head(), sep="\n")


## Task 1: Data Prep

In [None]:
# Make 'Date' the index of each dataframe (modified in place), previewing
# each one right after it is re-indexed
for frame in (temp_df, co2_df):
    frame.set_index('Date', inplace=True)
    print(frame.head())

In [None]:
# Count missing values per column in each dataset.
# Both results are printed explicitly: a notebook cell only echoes its final
# expression, so a bare `temp_df.isnull().sum()` on an earlier line would be
# evaluated and then silently discarded.
print("Missing values in temperature data:")
print(temp_df.isnull().sum())
print("\nMissing values in CO2 data:")
print(co2_df.isnull().sum())

## Task 2: Joining Datasets

In [None]:
# Inner-join the temperature and CO2 frames on 'Date', keeping only the
# dates that appear in both
merged_df = pd.merge(left=temp_df, right=co2_df, how='inner', on='Date')
print(merged_df.head())

In [None]:
# Check for missing values after the merge: per-column count of nulls.
# This cell only reports the counts; it does not drop or fill anything.
merged_df.isnull().sum(axis=0)

In [None]:
# Plot temperature anomalies and CO2 concentrations over time with pandas'
# built-in plotting: a line chart with one subplot per column.
merged_df.plot(kind='line', subplots=True)

## Task 3: Time Series Analysis

In [None]:
# 1. Resample data to annual averages (group by year and aggregate by mean)
# explore data types
# use .apply to access index values

In [None]:
# Preview the first rows of the merged frame (columns and index) before
# working out how to aggregate it.
merged_df.head()

In [None]:
# To use apply() with the index values, first convert the index with
# `to_series()`, then call apply() on the resulting Series.

merged_df.index  # evaluates to an Index object, not a Series

In [None]:
# merged_df['annual_ave'] =

In [None]:
# merged_df.index.to_series().apply(function)
# make a new column that pulls the year out of date

#merged_df['year'] = merged_df.index.year

In [None]:
# Annual averages: group the rows by the year of the datetime index and take
# the mean of both measurement columns.
agg_merge = merged_df.groupby(merged_df.index.year).agg(
    MonthlyAnomaly=('MonthlyAnomaly', 'mean'),
    CO2Concentration=('CO2Concentration', 'mean'),
)
agg_merge

In [None]:
# Year-over-year change in the annual means: each row minus the row before it.
# The first row has nothing to subtract, so its change values are NaN.
agg_merge['anomaly_change'] = agg_merge['MonthlyAnomaly'].diff()
agg_merge['co2_change'] = agg_merge['CO2Concentration'].diff()

agg_merge

In [None]:
# Create a scatter plot (use the plt.scatter() function) of annual temperature anomalies vs CO2 concentrations.

# Plot the annual mean columns themselves (anomaly on y, CO2 on x), since the
# task asks for anomalies vs concentrations rather than the year-over-year
# change columns.
plt.scatter(x=agg_merge['CO2Concentration'], y=agg_merge['MonthlyAnomaly'])
plt.xlabel('Annual mean CO2 concentration')
plt.ylabel('Annual mean temperature anomaly')
plt.show()


## Task 4: Seasonal Analysis

In [None]:
# Create a function to extract the season from a given date 
# (hint: use the date.month attribute and if-elif-else to assign the season in your function).

def season_func(date):
    """Return the season name for a date-like object.

    The season is chosen from ``date.month``:

    * 6-8   -> "Summer"
    * 9-11  -> "Fall"
    * 12, or any value <= 2 -> "Winter"
    * 3-5   -> "Spring"

    Args:
        date: Any object exposing a numeric ``month`` attribute
            (e.g. ``datetime.date`` or a pandas ``Timestamp``).

    Returns:
        One of "Summer", "Fall", "Winter", "Spring", or "Error" when the
        month matches none of the ranges (a value above 12, or NaN).
    """
    month = date.month

    # Plain `and`/`or` with chained comparisons: these are scalar tests, so
    # the bitwise `&`/`|` operators are not needed.
    if 6 <= month <= 8:
        return "Summer"
    if 9 <= month <= 11:
        return "Fall"
    if month == 12 or month <= 2:
        return "Winter"
    if 3 <= month <= 5:
        return "Spring"
    # Sentinel string kept for callers that expect a string for every input.
    return "Error"

In [None]:
# Add a 'Season' column by running season_func on every date in the index

merged_df['Season'] = [season_func(stamp) for stamp in merged_df.index]
merged_df

In [None]:
# Mean temperature anomaly and mean CO2 concentration within each season.

seasonal_avg = merged_df.groupby('Season').agg(
    MonthlyAnomaly=('MonthlyAnomaly', 'mean'),
    CO2Concentration=('CO2Concentration', 'mean'),
)

seasonal_avg

In [None]:

# Box plot (sns.boxplot) of the distribution of temperature anomalies for each season.
# The individual rows of merged_df are plotted rather than seasonal_avg,
# because a box plot needs the underlying observations, not one mean per season.
sns.boxplot(data=merged_df, x='Season', y='MonthlyAnomaly')
