In [417]:
import math
import geopy.distance
from dis import dis
import math
import geopandas as gpd
import numpy
from shapely import wkt
from shapely import wkb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from geopy.geocoders import Nominatim

%matplotlib inline

train_data = pd.read_csv('data/stores_train.csv')
test_data = pd.read_csv('data/stores_test.csv')
busstops = pd.read_csv('data/busstops_norway.csv')
grunnkrets_age = pd.read_csv('data/grunnkrets_age_distribution.csv')
grunnkrets_households = pd.read_csv('data/grunnkrets_households_num_persons.csv')
grunnkrets_income = pd.read_csv('data/grunnkrets_income_households.csv')
grunnkrets_stripped = pd.read_csv('data/grunnkrets_norway_stripped.csv')
plaace_hierarchy = pd.read_csv('data/plaace_hierarchy.csv')

In [418]:
# Add municipality names
train_data = pd.merge(train_data, grunnkrets_stripped[['grunnkrets_id', 'municipality_name']], on='grunnkrets_id', how='left')
# we get a bunch of duplicates of store_ids...? Remove them.
train_data = train_data.drop_duplicates(subset=['store_id'], keep='first')

# For test
test_data = pd.merge(test_data, grunnkrets_stripped[['grunnkrets_id', 'municipality_name']], on='grunnkrets_id', how='left')
# we get a bunch of duplicates of store_ids...? Remove them.
test_data = test_data.drop_duplicates(subset=['store_id'], keep='first')

In [419]:
def dist_to_all_km_centroids(lat, lon, df):

    # coordinates in radians
    lat1 = lat*math.pi/180
    lon1 = lon*math.pi/180
    lat2 = df['lat']*math.pi/180 # go through whole lat column
    lon2 = df['lon']*math.pi/180 # go through whole lon column

    # store original coordinates in new dataframe
    distances = pd.DataFrame()
    distances['grunnkrets_id'] = df['grunnkrets_id'].copy()
    distances['lat'] = df['lat'].copy()
    distances['lon'] = df['lon'].copy()

    # calculate cartesian coordinates
    R = 6371 # Earth radius in km
    df['x'] = R*np.cos(lat2)*np.cos(lon2)
    df['y'] = R*np.cos(lat2)*np.sin(lon2)
    df['z'] = R*np.sin(lat2)
    x1 = R*np.cos(lat1)*np.cos(lon1)
    y1 = R*np.cos(lat1)*np.sin(lon1)
    z1 = R*np.sin(lat1)

    # calculate distance, store as new column in the distances dataframe
    distances['dist'] = np.sqrt(np.square(df['x']-x1)+np.square(df['y']-y1)+np.square(df['z']-z1))

    return distances['dist'].squeeze()

In [420]:
train_data['grunnkrets_id'].isna().sum()


0

In [421]:
train_data['municipality_name'].isna().sum()

30

In [422]:
test_data['grunnkrets_id'].isna().sum()

0

In [423]:
test_data['municipality_name'].isna().sum()

27

In [424]:
# No missing grunnkrets_id but we have missing municipality_name

In [480]:
# Get new dataframe with all grunnkrets_ids and municipality names

gks_unik = train_data["grunnkrets_id"].unique()
gks = pd.DataFrame(gks_unik, columns=['grunnkrets_id'])
gks = pd.merge(gks, grunnkrets_stripped[['grunnkrets_id', 'municipality_name', 'geometry']], on='grunnkrets_id', how='left')
gks = gks.drop_duplicates(subset=['grunnkrets_id'], keep='first').reset_index(drop=True)

gks = gks[gks['geometry'].notna()]


#gks = gks.drop_duplicates(subset=['grunnkrets_id'], keep='first').reset_index(drop=True)

#geoms = grunnkrets_stripped[["grunnkrets_id","geometry"]]


#gks = pd.merge(gks, geoms[['grunnkrets_id', 'geometry']], on='grunnkrets_id', how='left')
#gks = gks.drop_duplicates(subset=['grunnkrets_id'], keep='first').reset_index(drop=True)

# Set geometry
gks['geometry'] = gpd.GeoSeries.from_wkt(gks['geometry'])

# geoseries s with the geometries for each gk
s = gpd.GeoSeries(gks['geometry'])

# Calculate centroids and extract lat and lon
c = gpd.GeoSeries(s.centroid)
centroids = gpd.GeoDataFrame({'centroids': c}, geometry='centroids')


# print(centroids.iloc[:].values[0].x)
# print(centroids.index.to_list())
#lon_values = [centroids.iloc[i].values[0].x for i in centroids.index]

#centroids = centroids.dropna()

#lon_values = [centroids.iloc[i].values[0].x for i in range(len(centroids))]
#print(len(lon_values))


centroids['lon'] = centroids.geometry.apply(lambda x: x.x)
centroids['lat'] = centroids.geometry.apply(lambda x: x.y)

gks = gks.join(centroids)

#Add centroid lat and lon to the gks dataframe

#For each gk with no municipality name, check closest store outside that gk that has municipality name
gk_nans = gks[gks['municipality_name'].isnull()].reset_index()

print(gk_nans)

# for index in range(len(gk_nans)):
#     dist_to_other_gk = dist_to_all_km_centroids(gk_nans._get_value(index, 'lat'), gk_nans._get_value(index, 'lon'), gks)
#     dist_to_other_gk = dist_to_other_gk.to_frame()
#     dist_to_other_gk = dist_to_other_gk[dist_to_other_gk.dist == 0]
#     index_min = dist_to_other_gk[['dist']].idxmin()
#     closest_gk = dist_to_other_gk.get_value(index_min, 'grunnkrets_id') # extract the closest grunnkrets_id
#     closest_munic = gks.loc[gks['grunnkrets_id'] == closest_gk, 'municipality_name'] # extract municipality name of closest gk
#     gk_nans.at[index, 'municipality_name'] = closest_munic # set municipality name of gk to this in gk_nans
    
# # Merge gk_nans to gks
# gks.combine_first(gk_nans)





Empty DataFrame
Columns: [index, grunnkrets_id, municipality_name, geometry, centroids, lon, lat]
Index: []


In [None]:
print(gks)

       grunnkrets_id municipality_name  \
0            6020303           Drammen   
2            3010306              Oslo   
4            6050102         Ringerike   
6           18040102              Bodø   
8           16017414         Trondheim   
...              ...               ...   
25594        1052205         Sarpsborg   
25602        2192009             Bærum   
25668       11210203              Time   
25672        7010802            Horten   
25686       19021302            Tromsø   

                                                geometry  
0      POLYGON ((10.20462 59.74478, 10.20312 59.74273...  
2      POLYGON ((10.73037 59.91072, 10.72976 59.91023...  
4      POLYGON ((10.26540 60.16392, 10.26564 60.16396...  
6      POLYGON ((14.38001 67.28524, 14.37985 67.28542...  
8      POLYGON ((10.37097 63.35793, 10.37059 63.35643...  
...                                                  ...  
25594  MULTIPOLYGON (((11.07936 59.29347, 11.07951 59...  
25602  POLYGON ((10.467