In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import BallTree
from geopy import distance

import functools
import time

RAD_OF_EARTH_IN_M = 6_371_000

# Timing Decorator
def timer(func):
    """Decorator that reports how long each call to ``func`` took.

    The wrapped callable forwards all positional and keyword arguments,
    prints one line with the function name and the elapsed wall-clock
    seconds (four decimals), and returns whatever ``func`` returned.
    """
    @functools.wraps(func)
    def _timed(*args, **kwargs):
        began = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - began
        print(f"Finished {func.__name__!r} in {elapsed:.4f} secs")
        return result
    return _timed

def generate_balltree(df):
    '''
        Build a BallTree from the radian coordinates held in df.

        Reads the 'Latitude_rad' and 'Longitude_rad' columns (in that order)
        and indexes them with the haversine metric.
    '''
    # Earlier alternative, kept for reference: a geopy callable metric on the
    # degree columns, i.e.
    #   metric=lambda u, v: distance.distance(u, v).miles
    coords_rad = df[['Latitude_rad', 'Longitude_rad']].values
    return BallTree(coords_rad, metric='haversine')


@timer
def find_and_merge_matches(
    tree: BallTree,
    merge_into_df: pd.DataFrame,
    merge_from_df: pd.DataFrame,
    coord_cols=('latitude_rad', 'longitude_rad'),
):
    '''
        For every row of merge_into_df, find its single nearest neighbour in
        tree and attach the corresponding row of merge_from_df.

        Parameters:
            tree          -- BallTree queried with k=1. Position i in the tree
                             is used as positional row i of merge_from_df, so
                             the tree is expected to have been built from
                             merge_from_df in its current row order.
            merge_into_df -- frame holding the query points.
            merge_from_df -- frame whose rows are looked up by tree position.
            coord_cols    -- names of the columns of merge_into_df passed to
                             tree.query, in the order the tree expects.
                             Defaults to the previously hard-coded
                             ('latitude_rad', 'longitude_rad').

        Returns:
            A new DataFrame with a fresh 0..n-1 index containing, side by
            side, the columns of merge_into_df, the columns of the matched
            merge_from_df rows, and a 'dist' column holding the query distance
            multiplied by RAD_OF_EARTH_IN_M.
    '''
    query_points = merge_into_df[list(coord_cols)].values
    distances, indices = tree.query(query_points, k=1)

    # query() returns (n_rows, k) arrays; with k=1 the only column is the
    # nearest neighbour, so slice it out instead of mapping over rows.
    nearest_idx = indices[:, 0]
    nearest_dist = distances[:, 0]

    nearest_to_merge = merge_from_df.iloc[nearest_idx].reset_index(drop=True)

    # All three pieces share a default RangeIndex so concat aligns them
    # row by row.
    return pd.concat(
        [
            merge_into_df.reset_index(drop=True),
            nearest_to_merge,
            # Assumes the tree uses the haversine metric (distances in
            # radians on the unit sphere), so scaling by the Earth radius
            # yields metres.
            pd.Series(nearest_dist * RAD_OF_EARTH_IN_M, name='dist'),
        ],
        axis=1,
    )

In [2]:
present_df = pd.read_csv('joro_observations.csv')
climate_df = pd.read_csv("asia_climate.csv")

# Give every coordinate column of both tables a radian-valued companion
# named "<column>_rad".
for frame, coord_names in (
    (present_df, ("latitude", "longitude")),
    (climate_df, ("Latitude", "Longitude")),
):
    for name in coord_names:
        frame[f'{name}_rad'] = np.deg2rad(frame[name].values)

print(present_df.head())
print(climate_df.head())

    latitude   longitude  latitude_rad  longitude_rad
0  36.742970  126.583464      0.641286       2.209298
1  36.408825  126.853895      0.635454       2.214018
2  34.690083  135.195511      0.605456       2.359607
3  34.685381  135.124958      0.605374       2.358375
4  34.668723  135.145654      0.605083       2.358737
   Longitude   Latitude  Annual_mean_temp  MeanDiurnalRange  Isothermality  \
0  52.354167  52.104167          4.850833         10.619667      23.523983   
1  52.395833  52.104167          4.814500         10.557000      23.451662   
2  52.437500  52.104167          4.785500         10.526333      23.435598   
3  52.479167  52.104167          4.821167         10.511667      23.377962   
4  52.520833  52.104167          4.833833         10.503000      23.398235   

   TempSeasonality  MaxTempWarmMo  MinTempColdMo  TempAnnRange  \
0      1305.529785      28.648001     -16.496000     45.144001   
1      1303.395264      28.516001     -16.500000     45.015999   
2      13

In [3]:
# Spatial index over the climate rows' radian coordinates; built once and
# reused for the nearest-neighbour lookup.
tree = generate_balltree(df=climate_df)

In [10]:
# Attach to every observation the climate row nearest to it.
joroUpdated = find_and_merge_matches(tree, present_df, climate_df)

# Remove every coordinate column (degrees and radians, from both source
# tables) and the match distance, leaving only the climate variables.
joroUpdated = joroUpdated.drop(columns=[
    'Latitude_rad', 'Longitude_rad', 'latitude_rad', 'longitude_rad',
    'Latitude', 'Longitude', 'latitude', 'longitude',
    'dist',
])

# Label these rows as class 1 in the combined dataset.
joroUpdated['class'] = 1

joroUpdated.head()


Finished 'find_and_merge_matches' in 0.2089 secs


Unnamed: 0,Annual_mean_temp,MeanDiurnalRange,Isothermality,TempSeasonality,MaxTempWarmMo,MinTempColdMo,TempAnnRange,MeanTempWetQtr,MeanTempDryQtr,MeanTempWarmQtr,MeanTempColdQtr,AnnPercip,PercipWetMo,PercipDryMo,PercipSeasonality,PrecipWetQtr,PrecipDryQtr,PrecipWrmQtr,PrecipColdQtr,class
0,10.486,9.234667,26.11909,977.9761,27.532,-7.824,35.356,21.88133,-1.892,22.15267,-1.892,1281,298,28,83.33064,703,93,696,93,1
1,11.59383,10.061,28.02195,973.2212,28.9,-7.004,35.904,23.23733,-0.714,23.23733,-0.714,1277,296,27,82.58533,697,94,697,94,1
2,15.76607,7.979762,26.54023,814.3849,31.47619,1.409524,30.06667,22.35794,6.014286,25.88492,6.014286,1328,214,37,49.53687,511,139,429,139,1
3,15.146,8.420667,27.26194,832.6071,31.272,0.384,30.888,21.99533,5.146,25.47533,5.146,1348,216,35,50.62211,519,134,447,134,1
4,15.3115,8.241,27.01967,823.9673,31.248,0.748,30.5,22.04267,5.428,25.54,5.428,1345,216,36,50.29088,518,136,442,136,1


In [5]:
import random

# Fixed seed so that Restart & Run All reproduces the same background sample.
BACKGROUND_SEED = 42
# Number of background rows to draw -- presumably chosen to match the number
# of class-1 rows; TODO confirm.
N_BACKGROUND = 2189
# Largest positional row index that may be drawn (inclusive) -- presumably
# len(climate_df) - 1; TODO confirm against the loaded climate table.
MAX_CLIMATE_ROW = 1602327

# Draw distinct row positions: sampling without replacement keeps the same
# climate row from entering the background set more than once.
index_list = random.Random(BACKGROUND_SEED).sample(
    range(MAX_CLIMATE_ROW + 1), k=N_BACKGROUND
)

In [23]:
# Take the sampled climate rows by position, strip their coordinate columns
# and label them as class 0.
background = (
    climate_df
    .iloc[index_list]
    .drop(columns=['Latitude_rad', 'Longitude_rad', 'Latitude', 'Longitude'])
    .assign(**{"class": 0})
)
background

Unnamed: 0,Annual_mean_temp,MeanDiurnalRange,Isothermality,TempSeasonality,MaxTempWarmMo,MinTempColdMo,TempAnnRange,MeanTempWetQtr,MeanTempDryQtr,MeanTempWarmQtr,MeanTempColdQtr,AnnPercip,PercipWetMo,PercipDryMo,PercipSeasonality,PrecipWetQtr,PrecipDryQtr,PrecipWrmQtr,PrecipColdQtr,class
750847,-8.399834,11.517670,30.18573,958.8948,10.760,-27.396,38.156,2.956000,-17.866000,3.089333,-20.249330,145,22,3,47.39283,56,18,54,23,0
1079535,27.100830,15.983670,41.49877,799.9277,44.616,6.100,38.516,33.838000,22.004000,35.729340,16.278670,99,29,1,95.15803,61,5,31,16,0
716897,13.489670,11.582000,28.46819,1105.5000,32.532,-8.152,40.684,26.440670,-0.945334,26.440670,-0.945334,521,165,3,119.07700,351,12,351,12,0
1225228,25.331000,9.804667,42.29066,415.1680,34.284,11.100,23.184,28.736670,20.002670,28.736670,19.183330,1674,347,5,93.63717,919,33,919,34,0
974329,25.393830,16.654330,41.78207,852.5541,43.708,3.848,39.860,33.140670,19.993330,34.606670,14.089330,184,52,2,98.80203,114,10,110,25,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
440443,5.911333,11.870670,23.96952,1406.0250,29.648,-19.876,49.524,22.565330,-12.255330,22.565330,-12.255330,104,29,1,101.19810,69,4,69,4,0
484569,4.297167,14.508330,33.80635,1043.1000,26.108,-16.808,42.916,3.630667,15.652670,16.380670,-9.079333,447,66,16,41.84678,176,62,91,88,0
253933,0.775667,12.611330,23.56730,1546.6020,25.200,-28.312,53.512,18.310670,-15.506000,18.310670,-19.957330,91,24,0,94.44779,60,2,60,2,0
1252835,21.281250,5.837500,29.10020,552.4530,30.790,10.730,20.060,27.301670,16.560000,27.525000,14.343330,1286,218,27,66.49835,596,83,494,171,0


In [None]:
# Stack the class-1 rows on top of the class-0 rows and write the result
# without the index column.
df = pd.concat(objs=[joroUpdated, background], axis=0)
df.to_csv("combined_joro_v2.csv", index=False)