In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import BallTree
from geopy import distance

import functools
import time

# Earth radius in metres. find_and_merge_matches multiplies the distances
# returned by the tree query by this value to fill its 'dist' column.
RAD_OF_EARTH_IN_M = 6_371_000

# Timing Decorator
def timer(func):
    """Decorator that reports how long each call to ``func`` takes.

    The wrapper forwards all positional and keyword arguments unchanged,
    prints a single line with the function's name and the elapsed time in
    seconds (four decimal places) after the call returns, and then hands
    back the call's return value. If ``func`` raises, the exception
    propagates and nothing is printed.
    """
    @functools.wraps(func)
    def _timed(*args, **kwargs):
        began = time.perf_counter()
        result = func(*args, **kwargs)
        run_time = time.perf_counter() - began
        print(f"Finished {func.__name__!r} in {run_time:.4f} secs")
        return result
    return _timed

def generate_balltree(df, lat_col='Latitude_rad', lon_col='Longitude_rad'):
    '''
        Generate a BallTree over the coordinate columns of df.

        Parameters
        ----------
        df : DataFrame holding the points to index.
        lat_col, lon_col : names of the latitude / longitude columns.
            The defaults match the radian columns added to the climate frame.
            The haversine metric expects (latitude, longitude) pairs in
            radians, in that order, so the columns must already be converted.

        Returns
        -------
        BallTree built with metric='haversine'. Positions returned by its
        query() refer to the row order of df at the time of this call.
    '''
    coordinates = df[[lat_col, lon_col]].values
    return BallTree(coordinates, metric='haversine')


@timer
def find_and_merge_matches(tree: BallTree, merge_into_df: pd.DataFrame, merge_from_df: pd.DataFrame,
                           lat_col: str = 'latitude_rad', lon_col: str = 'longitude_rad'):
    '''
        For every row of merge_into_df, find its single nearest neighbour in
        tree and attach the matching row of merge_from_df.

        Parameters
        ----------
        tree : BallTree queried with k=1. The positions it returns are used
            to index merge_from_df with .iloc, so the tree must have been
            built from merge_from_df in the same row order.
        merge_into_df : frame whose lat_col / lon_col columns are the query
            points (same units and column order as the tree was built with).
        merge_from_df : frame the matched rows are taken from.
        lat_col, lon_col : names of the query coordinate columns in
            merge_into_df.

        Returns
        -------
        A combined DataFrame with a fresh 0..n-1 index: the columns of
        merge_into_df, then the columns of the matched merge_from_df row,
        then 'dist' (tree distance multiplied by RAD_OF_EARTH_IN_M; for a
        haversine tree that is the great-circle distance in metres).
    '''
    query_points = merge_into_df[[lat_col, lon_col]].values
    distances, indices = tree.query(query_points, k=1)

    # With k=1 each query row has exactly one neighbour, so the first column
    # of each result array is the whole answer.
    nearest_positions = indices[:, 0]
    nearest_distances = distances[:, 0]

    # Reset both indexes so concat aligns the frames row-by-row instead of
    # by their original labels.
    nearest_to_merge = merge_from_df.iloc[nearest_positions].reset_index(drop=True)
    new_df = pd.concat(
        [
            merge_into_df.reset_index(drop=True),
            nearest_to_merge,
            pd.Series(nearest_distances * RAD_OF_EARTH_IN_M, name='dist'),
        ],
        axis=1,
    )

    return new_df

In [2]:
present_df = pd.read_csv('joro_observations.csv')
climate_df = pd.read_csv("asia_climate.csv")

# Give every degree-valued coordinate column a radian twin named
# "<column>_rad". The two frames spell their coordinate columns with
# different capitalisation, so each frame is paired with its own names.
for frame, degree_columns in (
    (present_df, ("latitude", "longitude")),
    (climate_df, ("Latitude", "Longitude")),
):
    for column in degree_columns:
        rad = np.deg2rad(frame[column].values)
        frame[f'{column}_rad'] = rad

print(present_df.head())
print(climate_df.head())

    latitude   longitude  latitude_rad  longitude_rad
0  36.742970  126.583464      0.641286       2.209298
1  36.408825  126.853895      0.635454       2.214018
2  34.690083  135.195511      0.605456       2.359607
3  34.685381  135.124958      0.605374       2.358375
4  34.668723  135.145654      0.605083       2.358737
   Longitude   Latitude  Annual_mean_temp  MeanDiurnalRange  Isothermality  \
0  52.354167  52.104167          4.850833         10.619667      23.523983   
1  52.395833  52.104167          4.814500         10.557000      23.451662   
2  52.437500  52.104167          4.785500         10.526333      23.435598   
3  52.479167  52.104167          4.821167         10.511667      23.377962   
4  52.520833  52.104167          4.833833         10.503000      23.398235   

   TempSeasonality  MaxTempWarmMo  MinTempColdMo  TempAnnRange  \
0      1305.529785      28.648001     -16.496000     45.144001   
1      1303.395264      28.516001     -16.500000     45.015999   
2      13

In [3]:
# Build the nearest-neighbour search tree over the rows of climate_df
# (uses its Latitude_rad / Longitude_rad columns).
tree = generate_balltree(climate_df)

In [10]:
# For each row of present_df, attach the row of climate_df that is nearest
# to it according to the tree, then keep only the climate variables:
# all coordinate columns (degrees and radians, from both frames) and the
# match distance are dropped. The chain is parenthesized so it does not rely
# on backslash line continuations.
joroUpdated = (
    find_and_merge_matches(tree, present_df, climate_df)
    .drop(columns=[
        'Latitude_rad', 'Longitude_rad', 'latitude_rad', 'longitude_rad',
        'Latitude', 'Longitude', 'latitude', 'longitude', 'dist',
    ])
    # Label rows that come from the observation file with class 1.
    # 'class' is a Python keyword, hence the dict unpacking.
    .assign(**{'class': 1})
)

#joroUpdated.to_csv("combined_joro_v2.csv", index=False)
joroUpdated.head()


Finished 'find_and_merge_matches' in 0.2089 secs


Unnamed: 0,Annual_mean_temp,MeanDiurnalRange,Isothermality,TempSeasonality,MaxTempWarmMo,MinTempColdMo,TempAnnRange,MeanTempWetQtr,MeanTempDryQtr,MeanTempWarmQtr,MeanTempColdQtr,AnnPercip,PercipWetMo,PercipDryMo,PercipSeasonality,PrecipWetQtr,PrecipDryQtr,PrecipWrmQtr,PrecipColdQtr,class
0,10.486,9.234667,26.11909,977.9761,27.532,-7.824,35.356,21.88133,-1.892,22.15267,-1.892,1281,298,28,83.33064,703,93,696,93,1
1,11.59383,10.061,28.02195,973.2212,28.9,-7.004,35.904,23.23733,-0.714,23.23733,-0.714,1277,296,27,82.58533,697,94,697,94,1
2,15.76607,7.979762,26.54023,814.3849,31.47619,1.409524,30.06667,22.35794,6.014286,25.88492,6.014286,1328,214,37,49.53687,511,139,429,139,1
3,15.146,8.420667,27.26194,832.6071,31.272,0.384,30.888,21.99533,5.146,25.47533,5.146,1348,216,35,50.62211,519,134,447,134,1
4,15.3115,8.241,27.01967,823.9673,31.248,0.748,30.5,22.04267,5.428,25.54,5.428,1345,216,36,50.29088,518,136,442,136,1


In [24]:
import random

# Number of row positions to draw.
N_BACKGROUND_POINTS = 30_000

# Largest row position that may be drawn. random.randint includes BOTH end
# points, so this value must not exceed len(climate_df) - 1.
# NOTE(review): presumably this is len(climate_df) - 1 for the current
# asia_climate.csv -- confirm, and update it if the file changes.
MAX_CLIMATE_ROW = 1_602_327

# Fixed seed so that a fresh "Restart & Run All" draws the same positions and
# therefore writes the same combined CSV.
BACKGROUND_SEED = 42

# A dedicated generator keeps the seeding from affecting any other use of the
# global `random` state. Draws are independent, so a position can repeat.
_background_rng = random.Random(BACKGROUND_SEED)
index_list = [
    _background_rng.randint(0, MAX_CLIMATE_ROW)
    for _ in range(N_BACKGROUND_POINTS)
]

In [25]:
# Pick the climate rows at the sampled positions, strip every coordinate
# column so only the climate variables remain, and label the rows class 0.
background = (
    climate_df
    .loc[climate_df.index[index_list]]
    .drop(columns=['Latitude_rad', 'Longitude_rad', 'Latitude', 'Longitude'])
    # 'class' is a Python keyword, hence the dict unpacking.
    .assign(**{"class": 0})
)
background

Unnamed: 0,Annual_mean_temp,MeanDiurnalRange,Isothermality,TempSeasonality,MaxTempWarmMo,MinTempColdMo,TempAnnRange,MeanTempWetQtr,MeanTempDryQtr,MeanTempWarmQtr,MeanTempColdQtr,AnnPercip,PercipWetMo,PercipDryMo,PercipSeasonality,PrecipWetQtr,PrecipDryQtr,PrecipWrmQtr,PrecipColdQtr,class
484925,4.068334,12.502000,30.53439,1062.96100,23.720,-17.224,40.944000,16.48400,-9.525333,16.48400,-9.525333,178,39,1,93.32278,105,4,105,4,0
1280988,21.534330,7.369333,32.80508,565.41550,31.524,9.060,22.464000,24.89267,15.492670,27.59933,13.935330,1640,281,28,69.14117,746,111,709,143,0
149447,6.064500,11.889000,23.26068,1466.50800,32.452,-18.660,51.112000,-2.24600,-10.860000,23.57267,-12.178670,180,19,10,19.58374,56,35,45,44,0
1585677,24.811170,7.767000,75.96832,51.94984,30.028,19.804,10.224000,24.62733,24.809330,25.37067,24.050000,1994,262,72,37.55146,691,237,380,442,0
813735,-6.392167,13.768330,38.02567,846.74480,10.364,-25.844,36.208000,3.79000,-15.783330,3.79000,-16.823330,275,78,1,117.52650,192,3,192,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
831749,-8.208834,14.285670,35.42369,942.53750,11.556,-28.772,40.328000,3.44200,-18.311330,3.44200,-19.698670,73,23,0,114.76470,54,2,54,3,0
952797,1.360000,8.699333,28.30340,787.02410,15.988,-14.748,30.736000,10.35800,-8.442667,10.83133,-8.442667,395,96,2,106.65100,256,7,250,7,0
477421,6.005667,13.136670,27.31343,1302.48700,29.332,-18.764,48.096000,21.60600,-8.756001,21.60600,-10.394670,93,27,0,110.17270,67,2,67,2,0
1566809,27.015170,7.221667,81.03306,67.53962,31.820,22.908,8.912001,27.00733,26.060000,27.63333,26.060000,2962,309,178,15.58263,872,604,744,604,0


In [26]:
# Stack the class-1 rows (joroUpdated) on top of the class-0 rows (background).
# Each frame keeps its own row labels, so labels in df may repeat.
df = pd.concat([joroUpdated, background])
# index=False leaves those row labels out of the file.
df.to_csv("combined_joro_unbalanced.csv", index=False)