# Combining Data Wrap-up
If you want to type along with me, use [this notebook](https://humboldt.cloudbank.2i2c.cloud/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2Fbethanyj0%2Fdata271_sp24&branch=main&urlpath=tree%2Fdata271_sp24%2Fdemos%2Fdata271_demo30_live.ipynb) instead. 
If you don't want to type and want to follow along just by executing the cells, stay in this notebook. 

In [None]:
import numpy as np
import pandas as pd

## Combining Time Series Data

In [None]:
# Hourly environmental readings, deliberately ordered by temperature rather than
# by time so the merges below start from a left table that is NOT in time order.
env_data = pd.DataFrame(
    dict(
        time=pd.date_range(start='2025-04-14 08:00', periods=6, freq='1h'),
        temperature_C=[14.0, 12.5, 13.0, 13.5, 12.0, 14.2],
        humidity_pct=[85, 83, 80, 78, 76, 75],
    )
).sort_values(by='temperature_C')
env_data

In [None]:
# Hourly precipitation readings; they start one hour earlier and run longer
# than the environmental readings.
precip_data = pd.DataFrame(
    dict(
        time=pd.date_range(start='2025-04-14 07:00', periods=9, freq='1h'),
        precipitation=[0.1, 0.3, 0, 0, 0.1, 0.05, 0.05, 0.1, 0.1],
    )
)
precip_data

In [None]:
# A standard merge goes in the order of the original left dataframe
# (env_data is sorted by temperature, so the result is not in time order)
pd.merge(env_data, precip_data, on='time')


In [None]:
# merge_ordered sorts the data too
# (outer join by default, so every timestamp from either table appears once, in time order)
pd.merge_ordered(env_data, precip_data, on='time')


In [None]:
# merge_ordered also allows you to fill in gaps with a method of choice
# fill_method='ffill' carries the last known value forward into rows that have no match
pd.merge_ordered(env_data, precip_data, on='time', fill_method='ffill')

In [None]:
# When is this most useful?
# Rebuild the precipitation table with a reading every 30 minutes.
precip_data = pd.DataFrame(
    dict(
        time=pd.date_range(start='2025-04-14 07:00', periods=10, freq='30min'),
        precipitation=[0.1, 0.3, 0, 0, 0.1, 0.05, 0.05, 0.1, 0.1, 0.2],
    )
)
precip_data

In [None]:
# When the frequency of data points is different:
# env_data was built with hourly timestamps, while precip_data now has 30-minute steps
env_data

In [None]:
# When the frequency of data points is different
# forward-fill the hourly readings onto the 30-minute precipitation timestamps
pd.merge_ordered(env_data, precip_data, on='time', fill_method='ffill')


In [None]:
# merge_ordered also accepts different "how" args
# how='left' keeps only the timestamps that appear in env_data
pd.merge_ordered(env_data, precip_data, on='time', how='left')

In [None]:
# You can also take the approach of "coarse graining" with resample:
# bin the 30-minute precipitation readings into hourly totals.
# reset_index() turns the resampled 'time' index back into a regular column.
hourly_precip = precip_data.resample('1h', on='time').sum().reset_index()
hourly_precip

In [None]:
# Merge at coarse-grained resolution: both tables are hourly now
course = pd.merge_ordered(env_data, hourly_precip, on='time')
course

In [None]:
# When should we be careful about resampling?
# Attach a location (lat/lon) to every precipitation reading.
precip_data = precip_data.assign(
    lat=[38.1, 38.1, 38.1, 38.1, 38.1, 38.2, 38.2, 38.2, 38.2, 38.2],
    lon=[-120.1, -120.1, -120.2, -120.2, -120.2, -120.1, -120.1, -120.2, -120.2, -120.2],
)
precip_data

In [None]:
# Resampling this would lead to inaccurate location data:
# lat and lon get averaged across different sites within each hour
precip_data.resample('1h', on='time').mean()


In [None]:
# In this case, use groupby with pd.Grouper
# pd.Grouper(key='time', freq='1h') bins rows into hourly groups, like resample does
precip_data.groupby(pd.Grouper(key='time', freq='1h')).sum()


In [None]:
# Groupby can handle multiple columns:
# group by hour AND by location so readings from different sites are never combined
precip_data.groupby([pd.Grouper(key='time', freq='1h'), 'lat', 'lon']).sum()

In [None]:
# Get coarse-grained data for each location
# (freq uses the lowercase '1h' alias, matching the date_range calls in this notebook)
precip_data.groupby([pd.Grouper(key='time', freq='1h'), 'lat', 'lon']).mean().reset_index()

### More flexible temporal matches

In [None]:
# Irregular observational data: sightings logged at arbitrary times.
# The rows are entered out of order and then sorted by time.
observations = pd.DataFrame(
    dict(
        time=pd.to_datetime(
            [
                '2025-04-14 08:03',
                '2025-04-14 09:21',
                '2025-04-14 11:00',
                '2025-04-14 09:45',
                '2025-04-14 13:27',
            ]
        ),
        animal=['fox', 'bear', 'wolf', 'bear', 'wolf'],
        location=['ridge', 'valley', 'creek', 'creek', 'ridge'],
    )
)
observations = observations.sort_values(by='time')

observations

In [None]:
# Rebuild the hourly environmental readings, this time left in time order.
env_data = pd.DataFrame(
    dict(
        time=pd.date_range(start='2025-04-14 08:00', periods=6, freq='1h'),
        temperature_C=[12.0, 12.5, 13.0, 13.5, 14.0, 14.2],
        humidity_pct=[85, 83, 80, 78, 76, 75],
    )
)
env_data

In [None]:
# merge_ordered keeps every unique timestamp from both tables
# (the shared 'time' column is the key; the join is an outer join)
pd.merge_ordered(observations, env_data, on='time', how='outer')

In [None]:
# Merge ordered incorrectly duplicates observations:
# forward-filling copies each sighting into the following hourly rows
pd.merge_ordered(observations, env_data, on='time', fill_method='ffill')

In [None]:
# Remind ourselves of what the originals look like
# display() renders both tables from a single cell
display(observations, env_data)


In [None]:
# Merge the left time points to the previous point from the right
# (merge_asof searches backward by default; both tables must be sorted by 'time')
pd.merge_asof(observations, env_data, on='time')


In [None]:
# Merge the left time points to the nearest point from the right
# direction='nearest' picks whichever right-hand timestamp is closest, earlier or later
pd.merge_asof(observations, env_data, on='time', direction='nearest')

In [None]:
# Merge the left time points to the next point from the right
# direction='forward' only looks at right-hand timestamps at or after each left timestamp
pd.merge_asof(observations, env_data, on='time', direction='forward')

In [None]:
# If they have different names
# Rename the right-hand key to 'Time', then name each side's key explicitly
# with left_on / right_on instead of a shared `on`.
env_data = env_data.rename(columns={'time': 'Time'})
pd.merge_asof(observations, env_data, left_on='time', right_on='Time')

In [None]:
# Reset: put the key column name back to 'time' so later cells can merge on it
env_data = env_data.rename(columns={'Time': 'time'})

### If we have spatial information too

In [None]:
# Attach a lat/lon position to each sighting (values are assigned row by row, in the current row order)
observations = observations.assign(
    lat=[38.2] * 5,
    lon=[-120.1, -120.1, -120.2, -120.2, -120.2],
)

In [None]:
# Attach a lat/lon position to each environmental reading
env_data = env_data.assign(
    lat=[38.2] * 6,
    lon=[-120.1, -120.1, -120.2, -120.2, -120.2, -120.1],
)

In [None]:
# Display the sightings table, which now includes the lat/lon columns
observations

In [None]:
# Display the environmental readings, which now include the lat/lon columns
env_data

In [None]:
# Merge each location on the time column
# Only 'time' is used as the key here; both tables also carry 'lat' and 'lon',
# but location is not taken into account when matching.
pd.merge_asof(observations, env_data, on='time')

In [None]:
# Match on time within each location: `by` requires lat and lon to be equal
# before the as-of search on 'time' is applied
pd.merge_asof(observations, env_data, on='time', by=['lat', 'lon'])

Matching within each location can change how far back in time `merge_asof` has to reach to find a match.

In [None]:
# Set a tolerance for how close the points have to be to match
# Sightings with no same-location reading in the previous 30 minutes get NaN
pd.merge_asof(observations, env_data, on='time', by=['lat', 'lon'],
              tolerance=pd.Timedelta('30min'))

## Activity

You're analyzing bird behavior with GPS trackers and sound-activated recorders. The GPS trackers log positions every 5 minutes. The sound recorders log audio events (calls, songs, other birds) at irregular times. 

For each bird location, merge the most recent sound event recorded in that area.

In [None]:
# GPS fixes for one bird, logged every 5 minutes at a single position.
bird_gps = pd.DataFrame(
    dict(
        time=pd.to_datetime(
            ["2023-04-13 08:00", "2023-04-13 08:05", "2023-04-13 08:10"]
        ),
        bird_id=[101] * 3,
        lat=[40.5] * 3,
        lon=[-123.8] * 3,
    )
)

# Sound-recorder events, logged at irregular times.
sound_events = pd.DataFrame(
    dict(
        time=pd.to_datetime(
            ["2023-04-13 07:59", "2023-04-13 08:03", "2023-04-13 08:12"]
        ),
        event_type=["call", "song", "flight"],
    )
)