In [1]:
import pandas as pd
import numpy as np
import geopy.distance
from tqdm import tqdm
from functools import partial
import plotly.express as px
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error, explained_variance_score
import h3
from IPython.display import clear_output
from datetime import datetime
import plotly.io as pio
pio.renderers.default = "notebook_connected"

In [2]:
data = pd.read_csv("data/tag_tracks.csv").rename({
    "Ptt": "ptt",
    "Latitude": "lat",
    "Longitude": "lon",
    "Dates - Date Key → Date": "date",
    "Dates - Date Key → Year": "year",
    "Dates - Date Key → Month": "month",
    "Dates - Date Key → Day": "day",
}, axis=1)
print(data.shape)
data.head()

(7532, 7)


Unnamed: 0,ptt,lat,lon,date,year,month,day
0,129843,54.13176,-166.922615,2013-12-19,2013,12,19
1,129843,54.258072,-166.884086,2013-12-20,2013,12,20
2,129843,54.312433,-166.910525,2013-12-21,2013,12,21
3,129843,54.35828,-166.817057,2013-12-22,2013,12,22
4,129843,54.389694,-166.676901,2013-12-23,2013,12,23


In [3]:
def find_neighbors(h3_index, threshold_km, neighbors_index):
    h3_coords = h3.h3_to_geo(h3_index)
    checked = set()
    neighbors = set()
    distance = 1
    found_neighbors = True
    while found_neighbors:
        found_neighbors = False
        candidates = h3.k_ring(h3_index, distance)
        new_candidates = set(candidates) - checked
        for candidate in new_candidates:
            if geopy.distance.geodesic(h3_coords, h3.h3_to_geo(candidate)).km <= threshold_km:
                neighbors.add(candidate)
                found_neighbors = True
            checked.add(candidate)
        distance += 1
    neighbors_index[h3_index] = neighbors



RESOLUTION = 4
MAX_KM = 100

selected_h3 = set([
    h3.geo_to_h3(lat, lon, RESOLUTION)
    for lat, lon in zip(data['lat'], data['lon'])
])

neighbors_index = {}
for h3_index in tqdm(selected_h3):
    find_neighbors(h3_index, MAX_KM, neighbors_index)

100%|██████████| 738/738 [00:02<00:00, 259.84it/s]


In [25]:
def get_heading(lat1, lon1, lat2, lon2):
    x = lon2 - lon1
    y = lat2 - lat1
    if x == 0 and y == 0:
        return np.nan
    angle = np.arctan2(y, x)
    if angle < 0:
        angle += 2 * np.pi
    return angle

df = pd.DataFrame(
    [[x, y] for x in np.arange(-1, 1.1, 0.1)
    for y in np.arange(-1, 1.1, 0.1)],
    columns=["lon", "lat"]
)
df['size'] = np.sqrt(df['lon'] ** 2 + df['lat'] ** 2)
df['lon'] = df['lon'] / df['size']
df['lat'] = df['lat'] / df['size']
df['heading'] = df.apply(lambda x: get_heading(0, 0, x['lat'], x['lon']), axis=1)
px.scatter(df, x="lon", y="lat", color="heading")

In [4]:
def get_direction(lat1, lon1, lat2, lon2):
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    size = np.linalg.norm([dlat, dlon])
    dlat = dlat / size if size > 0 else np.nan 
    dlon = dlon / size if size > 0 else np.nan
    return dlat, dlon

In [27]:
def create_pairs(data):
    pairs = []
    for ptt in tqdm(data["ptt"].unique()):
        rows = [row for _, row in data[data["ptt"] == ptt].sort_values("date", ascending=True).iterrows()]
        for start, end in zip(rows[:-1], rows[1:]):
            start_h3 = h3.geo_to_h3(start["lat"], start["lon"], RESOLUTION)
            end_h3 = h3.geo_to_h3(end["lat"], end["lon"], RESOLUTION)
            start_lat, start_lon = h3.h3_to_geo(start_h3)
            end_lat, end_lon = h3.h3_to_geo(end_h3)
            heading = get_heading(start_lat, start_lon, end_lat, end_lon)
            new_row = {
                "ptt": ptt,
                "start_lat": start_lat,
                "start_lon": start_lon,
                "end_lat": end_lat,
                "end_lon": end_lon,
                "start_date": start["date"],
                "end_date": end["date"],
                "heading": heading,
                "start_h3": start_h3,
                "end_h3": end_h3,
                "start_month": start["month"],
                "start_day": start["day"],
                "end_month": end["month"],
                "end_day": end["day"],
                "remained": heading is np.nan,
            }
            pairs.append(new_row)
    return pd.DataFrame(pairs)

pairs = create_pairs(data)
print(pairs.shape)
pairs.head()

100%|██████████| 111/111 [00:00<00:00, 232.82it/s]

(7421, 15)





Unnamed: 0,ptt,start_lat,start_lon,end_lat,end_lon,start_date,end_date,heading,start_h3,end_h3,start_month,start_day,end_month,end_day,remained
0,129843,53.98098,-166.800355,54.360925,-166.742418,2013-12-19,2013-12-20,1.419474,8422d0bffffffff,8422d03ffffffff,12,19,12,20,False
1,129843,54.360925,-166.742418,54.360925,-166.742418,2013-12-20,2013-12-21,,8422d03ffffffff,8422d03ffffffff,12,20,12,21,True
2,129843,54.360925,-166.742418,54.360925,-166.742418,2013-12-21,2013-12-22,,8422d03ffffffff,8422d03ffffffff,12,21,12,22,True
3,129843,54.360925,-166.742418,54.360925,-166.742418,2013-12-22,2013-12-23,,8422d03ffffffff,8422d03ffffffff,12,22,12,23,True
4,129843,54.360925,-166.742418,54.360925,-166.742418,2013-12-23,2013-12-24,,8422d03ffffffff,8422d03ffffffff,12,23,12,24,True


In [31]:
def plot_it(data, lat, lon, color):
    fig = px.scatter_geo(
        data, lat=lat, lon=lon, color=color,
    )
    fig.update_layout(autosize=True, height=600, geo=dict(center=dict(lat=58, lon=-150), projection_scale=6))
    return fig

plot_it(pairs, "start_lat", "start_lon", "ptt")

In [32]:
plot_it(pairs[pairs['ptt'] == "129843"], "start_lat", "start_lon", "heading")

In [70]:
def diff_heading(heading1, heading2):
    if heading1 < heading2:
        heading1, heading2 = heading2, heading1

    diff = heading1 - heading2
    return diff if diff <= np.pi else 2 * np.pi - diff

def squared_error_func(headings, heading):
    return np.mean([diff_heading(h, heading) ** 2 for h in headings])

def find_average_heading(headings, error_func=squared_error_func, tolerance=0.001):
    step_size = np.pi / 8
    direction = 1
    proposed_heading = 0
    error = error_func(headings, proposed_heading)
    while step_size >= tolerance:
        proposed_heading = proposed_heading + step_size * direction
        if proposed_heading < 0:
            proposed_heading += 2 * np.pi
        elif proposed_heading > 2 * np.pi:
            proposed_heading -= 2 * np.pi
        new_error = error_func(headings, proposed_heading)
        if new_error > error:
            direction *= -1
            step_size /= 2
        error = new_error
    return proposed_heading



6.211088210148958

In [82]:
df = pairs[pairs['ptt'] == "129843"]
fdf = df[df['remained'] == False]
fdf['rolling_heading'] = fdf['heading'].rolling(5, center=True).agg(find_average_heading)
df = df.merge(fdf[['start_date', 'rolling_heading']], on="start_date")
plot_it(df, "start_lat", "start_lon", "rolling_heading")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [298]:
def group_headings(df, max_allowable_error, min_allowable_distance):
    groups = []
    group = None
    for _, row in df.sort_values('start_date', ascending=True).iterrows():
        if not group:
            # initialize
            group = {
                'rows': [row],
                'headings': [row['heading']] if not row['remained'] else [],
            }
            continue

        if row['remained']:
            group['rows'].append(row)
        else:
            headings = group['headings'] + [row['heading']]
            mean_heading = find_average_heading(headings)
            max_error = max([diff_heading(h, mean_heading) for h in headings])
            if max_error <= MAX_ALLOWABLE_ERROR:
                group['rows'].append(row)
                group['headings'] = headings
            else:
                groups.append(group)
                group = {
                    'rows': [row],
                    'headings': [row['heading']],
                }
    if group:
        groups.append(group)

    group_rows = []
    i = 0
    for group in groups:
        rows = group['rows']
        distance = geopy.distance.geodesic(
            (rows[0]['start_lat'], rows[0]['start_lon']), (rows[-1]['end_lat'], rows[-1]['end_lon'])
        ).km
        mean_heading = find_average_heading(group['headings'])
        if distance >= min_allowable_distance:
            for j, row in enumerate(rows):
                new_row = {
                    "ptt": row["ptt"],
                    "start_date": row["start_date"],
                    "group": i,
                    "steps_in_group": j,
                    "momentum": True,
                    "mean_heading": mean_heading,
                }
                group_rows.append(new_row)
            i += 1
    if group_rows:
        df = df.merge(pd.DataFrame(group_rows), on=["ptt", "start_date"], how="left")
        df['group'] = df['group'].fillna(-1)
        df['momentum'] = df['momentum'].fillna(False)
    else:
        df['group'] = -1
        df['momentum'] = False

    rows = []
    last_ptt = None
    i = 1
    j = 0
    for _, row in df.sort_values(['ptt', 'start_date'], ascending=True).iterrows():
        if row['ptt'] != last_ptt:
            i = 1
            last_ptt = row['ptt']
        if not row['momentum']:
            if i == 1:
                j += 1
            row['steps_since_group'] = i
            row['drift_group'] = j
            i += 1
        else:
            row['steps_since_group'] = np.nan
            row['drift_group'] = np.nan
            i = 1
        rows.append(row)
    df = pd.DataFrame(rows)

    return df


df = group_headings(
    pairs[pairs['ptt'] == "172904"],
    np.pi/4,
    150
)
plot_it(
    df,
    "start_lat",
    "start_lon",
    "group",
)

In [275]:
df = pairs[pairs['ptt'] == "172904"]
df = df.sort_values('start_date').reset_index(drop=True).reset_index()
plot_it(
    df,
    "start_lat",
    "start_lon",
    "heading",
)

In [301]:
grouped_pairs = []
for ptt in tqdm(pairs['ptt'].unique()):
    df = group_headings(
        pairs[pairs['ptt'] == ptt],
        np.pi/4,
        150
    )
    grouped_pairs.append(df)
    clear_output()
grouped_pairs = pd.concat(grouped_pairs)
plot_it(
    grouped_pairs[grouped_pairs['momentum']],
    "start_lat",
    "start_lon",
    "ptt",
)

100%|██████████| 111/111 [00:05<00:00, 21.39it/s]


In [277]:
plot_it(
    grouped_pairs[~grouped_pairs['momentum']],
    "start_lat",
    "start_lon",
    "ptt",
)

In [278]:
def create_options_from_pairs(pairs):
    rows = []
    choice_id = -1
    for _, row in tqdm(pairs.iterrows()):
        choice_id += 1
        start = h3.geo_to_h3(row['start_lat'], row['start_lon'], RESOLUTION)
        end = h3.geo_to_h3(row['end_lat'], row['end_lon'], RESOLUTION)
        date = datetime.strptime(row['end_date'], '%Y-%m-%d')
        
        start_lat, start_lon = h3.h3_to_geo(start)
        end_lat, end_lon = h3.h3_to_geo(end)
        heading = get_heading(start_lat, start_lon, end_lat, end_lon)
        rows.append({
            'ptt': row['ptt'],
            'h3_index': end, 
            'chosen': 1,
            'date': date,
            'year': date.year,
            'month': date.month,
            'day': date.day,
            'choice_id': choice_id,
            'remain_now': start == end,
            'heading': heading,
            'momentum': row['momentum'],
            'group': row['group'],
            'drift_group': row['drift_group'],
            'mean_heading': row['mean_heading'],
            'steps_in_group': row['steps_in_group'],
            'steps_since_group': row['steps_since_group'],
        })
        for neighbor in neighbors_index[start]:
            end_lat, end_lon = h3.h3_to_geo(neighbor)
            heading = get_heading(start_lat, start_lon, end_lat, end_lon)
            if neighbor != end:
                rows.append({
                    'ptt': row['ptt'],
                    'h3_index': neighbor, 
                    'chosen': 0,
                    'date': date,
                    'year': date.year,
                    'month': date.month,
                    'day': date.day,
                    'choice_id': choice_id,
                    'remain_now': start == neighbor,
                    'heading': heading,
                    'momentum': row['momentum'],
                    'group': row['group'],
                    'drift_group': row['drift_group'],
                    'mean_heading': row['mean_heading'],
                    'steps_in_group': row['steps_in_group'],
                    'steps_since_group': row['steps_since_group'],
                })
    return pd.DataFrame(rows)

options = create_options_from_pairs(grouped_pairs)
print(options.shape)
options.head()

0it [00:00, ?it/s]

7421it [00:02, 2841.98it/s]


(144171, 16)


Unnamed: 0,ptt,h3_index,chosen,date,year,month,day,choice_id,remain_now,heading,momentum,group,drift_group,mean_heading,steps_in_group,steps_since_group
0,129843,8422d03ffffffff,1,2013-12-20,2013,12,20,0,False,1.419474,False,-1.0,1.0,,,1.0
1,129843,8422d57ffffffff,0,2013-12-20,2013,12,20,0,False,3.35811,False,-1.0,1.0,,,1.0
2,129843,8422d53ffffffff,0,2013-12-20,2013,12,20,0,False,3.361094,False,-1.0,1.0,,,1.0
3,129843,8422d19ffffffff,0,2013-12-20,2013,12,20,0,False,3.033398,False,-1.0,1.0,,,1.0
4,129843,8422d51ffffffff,0,2013-12-20,2013,12,20,0,False,3.81725,False,-1.0,1.0,,,1.0


In [279]:
training_ptt = set(
    np.random.choice(data['ptt'].unique(), round(data['ptt'].unique().shape[0] * 0.7), replace=False)
)
testing_ptt = set(data['ptt'].unique()) - training_ptt
print(len(training_ptt), len(testing_ptt))

78 33


In [280]:
train = options[options['ptt'].isin(training_ptt)]
test = options[options['ptt'].isin(testing_ptt)]

In [281]:
def spatial_key_to_index(spatial_key):
    return hex(spatial_key)[2:]

elevation = pd.read_csv("data/bathymetry.csv")
elevation['h3_index'] = elevation['h3_index'].astype(np.int64).astype(str)
elevation['h3_index'] = elevation.apply(lambda row: spatial_key_to_index(np.int64(row['h3_index'])), axis=1)
elevation.head()

Unnamed: 0,h3_index,elevation
0,8402105ffffffff,-450.0104
1,840210dffffffff,-1184.517832
2,8402121ffffffff,-499.541915
3,8402123ffffffff,-270.425039
4,8402125ffffffff,-1022.894455


In [282]:
surface_temps = pd.read_csv("data/surface_temps.csv").rename({
    "H3 Key 4": "h3_index",
    "Dates - Date Key → Month": "month",
    "Dates - Date Key → Day": "day",
    "Temperature C": "temp"
}, axis=1)[['h3_index', 'month', 'day', 'temp']]
surface_temps['h3_index'] = surface_temps['h3_index'].astype(np.int64).astype(str)
surface_temps['h3_index'] = surface_temps.apply(lambda row: spatial_key_to_index(np.int64(row['h3_index'])), axis=1)
surface_temps.head()

Unnamed: 0,h3_index,month,day,temp
0,8402101ffffffff,1,15,-0.2685
1,8402103ffffffff,1,15,-0.2685
2,8402105ffffffff,1,15,-0.2685
3,8402107ffffffff,1,15,-0.2685
4,8402109ffffffff,1,15,-0.2685


In [283]:
def add_features(base_data):
    base_data = base_data[[
        'ptt', 'h3_index', 'chosen', 'date', 'year', 'month', 'day', 'choice_id', 'remain_now', 'heading', 'momentum', 'group', 'drift_group', 'mean_heading', 'steps_in_group', 'steps_since_group'
    ]]

    base_data = base_data.merge(elevation, on='h3_index')
    base_data = base_data.merge(surface_temps[['h3_index', 'month', 'temp']], on=['h3_index', 'month'])

    base_data.loc[base_data['remain_now'], 'heading'] = 0

    base_data['lat'] = base_data.apply(lambda row: h3.h3_to_geo(row['h3_index'])[0], axis=1)
    base_data['lon'] = base_data.apply(lambda row: h3.h3_to_geo(row['h3_index'])[1], axis=1)
    return base_data

train = add_features(train)
test = add_features(test)
train.head()

Unnamed: 0,ptt,h3_index,chosen,date,year,month,day,choice_id,remain_now,heading,momentum,group,drift_group,mean_heading,steps_in_group,steps_since_group,elevation,temp,lat,lon
0,133395,840c9a9ffffffff,1,2014-08-05,2014,8,5,112,False,2.779215,False,-1.0,1.0,,,1.0,-3411.865351,6.813444,57.139992,-175.275782
1,133395,840c9a9ffffffff,1,2014-08-06,2014,8,6,113,True,0.0,False,-1.0,1.0,,,2.0,-3411.865351,6.813444,57.139992,-175.275782
2,133395,840c9a9ffffffff,1,2014-08-07,2014,8,7,114,True,0.0,False,-1.0,1.0,,,3.0,-3411.865351,6.813444,57.139992,-175.275782
3,133395,840c9a9ffffffff,0,2014-08-08,2014,8,8,115,True,0.0,False,-1.0,1.0,,,4.0,-3411.865351,6.813444,57.139992,-175.275782
4,133395,840c9a9ffffffff,0,2014-08-09,2014,8,9,116,False,0.234937,False,-1.0,1.0,,,5.0,-3411.865351,6.813444,57.139992,-175.275782


In [284]:
momentum_train = train[train['momentum']]
momentum_test = test[test['momentum']]

momentum_features = ['elevation', 'temp', 'remain_now', 'heading', 'mean_heading']

X = momentum_train[momentum_features]
y = momentum_train['chosen']

models = {
    "Random Forest": RandomForestRegressor(
        random_state=42, n_jobs=3
    )
}
param_grids = {
    "Random Forest": {"n_estimators": [10, 20, 50, 100], "min_samples_leaf": [100, 200]}
}
cv = KFold(n_splits=5, shuffle=True, random_state=42)

results = []
for name, model in models.items():
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[name],
        return_train_score=True,
        cv=cv,
        refit=True,
    ).fit(X, y)
    result = {"model": name, "cv_results": pd.DataFrame(grid_search.cv_results_), "best_estimator": grid_search.best_estimator_}
    results.append(result)

In [285]:
momentum_model = results[0]['best_estimator']

In [286]:
momentum_train["pred"] = results[0]["best_estimator"].predict(momentum_train[momentum_features])
momentum_train["error"] = momentum_train["chosen"] - momentum_train["pred"]
print(mean_squared_error(momentum_train["chosen"], momentum_train["pred"]))
print(explained_variance_score(momentum_train["chosen"], momentum_train["pred"]))

0.0317594510867331
0.34507382302863177




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [287]:
momentum_test["pred"] = results[0]["best_estimator"].predict(momentum_test[momentum_features])
momentum_test["error"] = momentum_test["chosen"] - momentum_test["pred"]
print(mean_squared_error(momentum_test["chosen"], momentum_test["pred"]))
print(explained_variance_score(momentum_test["chosen"], momentum_test["pred"]))

0.03469019548108828
0.27581044071177785




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [288]:
full_features = elevation
full_features = full_features.merge(surface_temps[['h3_index', 'month', 'temp']], on=['h3_index'])
full_features.head()

Unnamed: 0,h3_index,elevation,month,temp
0,8402105ffffffff,-450.0104,1,-0.2685
1,8402105ffffffff,-450.0104,2,-0.2685
2,8402105ffffffff,-450.0104,3,-0.2685
3,8402105ffffffff,-450.0104,4,-0.2685
4,8402105ffffffff,-450.0104,5,-0.2685


In [289]:
ptts = np.random.choice(data['ptt'].unique(), 50, replace=False)

In [290]:
def create_random_momentum_track(lat, lon, date, mean_heading, model, steps):
    path = [(lat, lon, date)]
    for _ in range(steps):
        h3_index = h3.geo_to_h3(lat, lon, RESOLUTION)
        if h3_index not in neighbors_index:
            find_neighbors(h3_index, MAX_KM, neighbors_index)
        raw_possibilities = list(neighbors_index[h3_index])
        possibilities = pd.DataFrame(raw_possibilities, columns=['h3_index'])
        possibilities['month'] = date.month
        possibilities = possibilities.merge(full_features, on=['h3_index', 'month'])
        if possibilities.shape[0] == 0:
            break

        possibilities['remain_now'] = possibilities['h3_index'] == h3_index
        possibilities['heading'] = possibilities.apply(
            lambda row: get_heading(lat, lon, h3.h3_to_geo(row['h3_index'])[0], h3.h3_to_geo(row['h3_index'])[1]), 
            axis=1
        ).fillna(0)
        possibilities['mean_heading'] = mean_heading
    
        X = possibilities[momentum_features]
        probs = model.predict(X)
        probs = probs / probs.sum()
        raw_possibilities = list(possibilities['h3_index'])
        
        selection = np.random.choice(range(len(raw_possibilities)), p=probs)
        lat, lon = h3.h3_to_geo(raw_possibilities[selection])
        date = date + pd.Timedelta(days=1)
        path.append((lat, lon, date))
    return pd.DataFrame(path, columns=['lat', 'lon', 'date'])

paths = []
for ptt in tqdm(ptts):
    df = grouped_pairs[grouped_pairs['ptt'] == ptt].sort_values("start_date", ascending=True)
    for group in df['group'].unique():
        if group == -1:
            continue

        group_df = df[df['group'] == group].iloc[[0]]
        lat = group_df['start_lat'].values[0]
        lon = group_df['start_lon'].values[0]
        date = datetime.strptime(group_df['start_date'].values[0], '%Y-%m-%d')
        mean_heading = group_df['mean_heading'].values[0]
        path = create_random_momentum_track(
            lat, lon, date, mean_heading,
            momentum_model,
            grouped_pairs[(grouped_pairs['ptt'] == ptt) & (grouped_pairs['group'] == group)].shape[0]
        )
        path['case'] = 'fake'
        path['ptt'] = ptt

        old_path = grouped_pairs[(grouped_pairs['ptt'] == ptt) & (grouped_pairs['group'] == group)]
        old_path['date'] = pd.to_datetime(old_path['start_date'])
        old_path['h3_index'] = old_path.apply(lambda row: h3.geo_to_h3(row['start_lat'], row['start_lon'], RESOLUTION), axis=1)
        old_path['lat'] = old_path.apply(lambda row: h3.h3_to_geo(row['h3_index'])[0], axis=1)
        old_path['lon'] = old_path.apply(lambda row: h3.h3_to_geo(row['h3_index'])[1], axis=1)
        old_path['case'] = 'real'
        old_path['ptt'] = ptt
        del old_path['h3_index']
        paths.append(pd.concat([path, old_path]))
        clear_output()

100%|██████████| 50/50 [00:28<00:00,  1.75it/s]


In [291]:
df = pd.concat(paths)
plot_it(
    df[df['case'] == 'fake'],
    "lat",
    "lon",
    "ptt"
)

In [292]:
df = pd.concat(paths)
plot_it(
    df[df['case'] == 'real'],
    "lat",
    "lon",
    "ptt"
)

In [293]:
drift_train = train[~train['momentum']]
drift_test = test[~test['momentum']]

drift_features = ['elevation', 'temp', 'remain_now']

X = momentum_train[drift_features]
y = momentum_train['chosen']

models = {
    "Random Forest": RandomForestRegressor(
        random_state=42, n_jobs=3
    )
}
param_grids = {
    "Random Forest": {"n_estimators": [10, 20, 50, 100], "min_samples_leaf": [100, 200]}
}
cv = KFold(n_splits=5, shuffle=True, random_state=42)

results = []
for name, model in models.items():
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[name],
        return_train_score=True,
        cv=cv,
        refit=True,
    ).fit(X, y)
    result = {"model": name, "cv_results": pd.DataFrame(grid_search.cv_results_), "best_estimator": grid_search.best_estimator_}
    results.append(result)

In [294]:
drift_train["pred"] = results[0]["best_estimator"].predict(drift_train[drift_features])
drift_train["error"] = drift_train["chosen"] - drift_train["pred"]
print(mean_squared_error(drift_train["chosen"], drift_train["pred"]))
print(explained_variance_score(drift_train["chosen"], drift_train["pred"]))

0.027108391647037385
0.4565034840625115




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [295]:
drift_test["pred"] = results[0]["best_estimator"].predict(drift_test[drift_features])
drift_test["error"] = drift_test["chosen"] - drift_test["pred"]
print(mean_squared_error(drift_test["chosen"], drift_test["pred"]))
print(explained_variance_score(drift_test["chosen"], drift_test["pred"]))

0.02458048086498183
0.5058878396763522




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [307]:
df = train.groupby(['drift_group', 'ptt']).agg({'steps_since_group': 'max'})
print(df['steps_since_group'].describe())
px.histogram(df, x="steps_since_group")

count    134.000000
mean      26.850746
std       30.034944
min        1.000000
25%        7.000000
50%       22.500000
75%       33.750000
max      259.000000
Name: steps_since_group, dtype: float64


In [308]:
df = train.groupby(['group', 'ptt']).agg({'steps_in_group': 'max'})
print(df['steps_in_group'].describe())
px.histogram(df, x="steps_in_group")

count    108.000000
mean      13.518519
std        9.393258
min        0.000000
25%        7.000000
50%       11.000000
75%       16.000000
max       53.000000
Name: steps_in_group, dtype: float64


In [310]:
train.groupby('momentum').count()[['ptt']]

Unnamed: 0_level_0,ptt
momentum,Unnamed: 1_level_1
False,68121
True,30682


- Change heading (time, el, temp)
- Choose a heading (from drift to heading) (random)
- Leave a heading (from heading to drift) (random)