### Importing libraries

In [2]:
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import folium
from folium import Choropleth, Circle, Marker
from folium.plugins import HeatMap, MarkerCluster

### Importing the cleaned train and test data

In [5]:
# Read both splits of the Colombian property listings from ./data/.
TRAIN_CSV = './data/properties_colombia_train.csv'
TEST_CSV = './data/properties_colombia_test.csv'
train, test = pd.read_csv(TRAIN_CSV), pd.read_csv(TEST_CSV)

In [6]:
# Number of missing values in the coordinate columns and in 'geometry'.
coordinate_columns = ['lat', 'lon', 'geometry']
train[coordinate_columns].isna().sum()

lat         49498
lon         49498
geometry        0
dtype: int64

### Calculating missing values of GPS coordinates with city names.

About 25% of the rows have no GPS coordinates. We can recover these rows, with reduced positional accuracy, by replacing the missing coordinates with the coordinates of the city (or most specific place name) each listing belongs to.

In [7]:
# Rows that still lack a latitude / longitude.
train.loc[:, ['lat', 'lon']].isna().sum()

lat    49498
lon    49498
dtype: int64

In [8]:
# One boolean mask per administrative level: True where that level is filled in.
mask_l2, mask_l3, mask_l4, mask_l5, mask_l6 = (
    train[level].notna() for level in ('l2', 'l3', 'l4', 'l5', 'l6')
)
# True where the coordinate is missing.
mask_lat = train['lat'].isna()
mask_lon = train['lon'].isna()

# Rows with at least one place name but neither latitude nor longitude:
# these are the rows whose position can be approximated from the place name.
has_place_name = mask_l2 | mask_l3 | mask_l4 | mask_l5 | mask_l6
lacks_coordinates = mask_lat & mask_lon
train.loc[has_place_name & lacks_coordinates, ['l2','l3','l4','l5','l6','lat','lon']].count()

l2     49498
l3     49174
l4      2476
l5      2256
l6       351
lat        0
lon        0
dtype: int64

We are going to retrieve the coordinates from this API: https://positionstack.com/documentation

First we build a list of all locations in a format suitable for a GET request to the API.

In [17]:
# Rows that have at least one place name but no GPS fix, restricted to the
# place-name columns ordered from most specific (l6) to least specific (l2).
# .copy() makes an independent frame, so adding the 'all' column below does not
# write into a slice of `train` (which triggers SettingWithCopyWarning).
location_cols = ['l6','l5','l4','l3','l2']
l_all = train.loc[(mask_l2 | mask_l3 | mask_l4 | mask_l5 | mask_l6) & (mask_lat & mask_lon), location_cols].copy()

# Build one comma-separated query string per row from the levels that are
# actually present, followed by the country. Joining only non-null values
# (instead of formatting NaN as the text 'nan' and then deleting every 'nan,'
# substring) keeps place names that merely end in "nan" intact.
l_all['all'] = l_all[location_cols].apply(
    lambda x: ','.join([*x.dropna().astype(str), 'Colombia']),
    axis=1,
)

# Geocode each distinct string only once.
lista_locaciones = l_all['all'].unique().tolist()
print(len(lista_locaciones))
lista_locaciones

272


['Bucaramanga,Santander,Colombia',
 'Medellín,Antioquia,Colombia',
 'Barranquilla,Atlántico,Colombia',
 'Montería,Córdoba,Colombia',
 'Villa Maria,Suba,Zona Noroccidental,Bogotá D.C,Cundinamarca,Colombia',
 'Villavicencio,Meta,Colombia',
 'Cartago,Valle del Cauca,Colombia',
 'Bosa,Zona Suroccidental,Bogotá D.C,Cundinamarca,Colombia',
 'Cali,Valle del Cauca,Colombia',
 'Manizales,Caldas,Colombia',
 'Pereira,Risaralda,Colombia',
 'Santa Marta,Magdalena,Colombia',
 'La Ceja,Antioquia,Colombia',
 'Las Palmas,Barranquilla,Atlántico,Colombia',
 'Guarne,Antioquia,Colombia',
 'Palestina,Caldas,Colombia',
 'San Francisco,Cundinamarca,Colombia',
 'Popayán,Cauca,Colombia',
 'Rionegro,Antioquia,Colombia',
 'Neiva,Huila,Colombia',
 'Chía,Cundinamarca,Colombia',
 'Suba,Zona Noroccidental,Bogotá D.C,Cundinamarca,Colombia',
 'Piedecuesta,Santander,Colombia',
 'Tuluá,Valle del Cauca,Colombia',
 'Cartagena,Bolívar,Colombia',
 'Bello,Antioquia,Colombia',
 'Floridablanca,Santander,Colombia',
 'Barrios Uni

Now we can request the coordinates of each location from the Google Maps Geocoding API: https://developers.google.com/maps/documentation/geocoding/requests-geocoding

In [35]:
import requests

def get_lat_lon_google(address, access_key=None, URL="https://maps.googleapis.com/maps/api/geocode/json"):
    """Geocode ``address`` with the Google Maps Geocoding API.

    Parameters
    ----------
    address : str
        Free-form address, e.g. ``'Medellín,Antioquia,Colombia'``.
    access_key : str, optional
        Google Maps API key. When omitted, the key is read from the
        ``GOOGLE_MAPS_API_KEY`` environment variable, so that no credential
        is stored in the notebook.
    URL : str, optional
        Geocoding endpoint.

    Returns
    -------
    tuple
        ``(lat, lng)`` of the first result returned by the API.

    Raises
    ------
    ValueError
        If no API key is available, or if the API reports a status other
        than ``OK`` or returns no results for ``address``.
    requests.HTTPError
        If the HTTP request itself fails (4xx / 5xx response).
    """
    import os  # local import keeps this definition self-contained

    if access_key is None:
        access_key = os.environ.get('GOOGLE_MAPS_API_KEY')
    if not access_key:
        raise ValueError(
            "No Google Maps API key: pass access_key or set the "
            "GOOGLE_MAPS_API_KEY environment variable."
        )

    PARAMS = {'key': access_key, 'address': address}
    # A timeout prevents one stalled connection from hanging the whole batch.
    r = requests.get(url=URL, params=PARAMS, timeout=10)
    r.raise_for_status()
    data = r.json()

    # The API answers HTTP 200 even for denied or empty lookups; the outcome
    # is carried in the 'status' field of the JSON body.
    results = data.get('results') or []
    if data.get('status') != 'OK' or not results:
        raise ValueError(
            f"Geocoding failed for {address!r}: status={data.get('status')!r} "
            f"{data.get('error_message', '')}".rstrip()
        )

    location = results[0]['geometry']['location']
    return location['lat'], location['lng']

# Geocode every distinct location string once, in list order.
coordinate_pairs = [get_lat_lon_google(address=place) for place in lista_locaciones]
latitudes = [pair[0] for pair in coordinate_pairs]
longitudes = [pair[1] for pair in coordinate_pairs]

# Lookup table: location string -> coordinates, persisted so the API does not
# have to be queried again.
locations_df_google = pd.DataFrame(
    {'l_all': lista_locaciones, 'lat': latitudes, 'lon': longitudes}
)
locations_df_google.to_csv('./data/gps/locations_all_df_google.csv', index=False)
locations_df_google


Unnamed: 0,l_all,lat,lon
0,"Bucaramanga,Santander,Colombia",7.119349,-73.122742
1,"Medellín,Antioquia,Colombia",6.247638,-75.565815
2,"Barranquilla,Atlántico,Colombia",11.004107,-74.806981
3,"Montería,Córdoba,Colombia",8.750983,-75.878535
4,"Villa Maria,Suba,Zona Noroccidental,Bogotá D.C...",4.741605,-74.102707
...,...,...,...
267,"El Rosal,Cundinamarca,Colombia",4.852250,-74.264824
268,"Prado,Tolima,Colombia",3.749730,-74.927732
269,"Zona Franca,Fontibón,Zona Occidental,Bogotá D....",5.026003,-74.030012
270,"20 De Julio,Antonio Nariño,Zona Sur,Bogotá D.C...",5.026003,-74.030012


In [36]:
# Sanity check: the 20 geocoded locations with the smallest longitude.
by_longitude = locations_df_google.sort_values(by='lon')
by_longitude.head(20)

Unnamed: 0,l_all,lat,lon
116,"San Andrés,San Andrés Providencia y Santa Cata...",12.576855,-81.705052
166,"Acandí,Chocó,Colombia",8.510109,-77.27898
150,"Cauca,Colombia",2.704981,-76.825965
271,"Valle del Cauca,Colombia",3.800889,-76.641271
52,"Pance,Cali,Valle del Cauca,Colombia",3.32834,-76.63865
17,"Popayán,Cauca,Colombia",2.444814,-76.614739
144,"La Cumbre,Valle del Cauca,Colombia",3.650783,-76.569995
151,"Santa Isabel,Cali,Valle del Cauca,Colombia",3.425098,-76.545572
35,"Jamundí,Valle del Cauca,Colombia",3.26177,-76.540327
164,"Ciudad Jardín,Cali,Valle del Cauca,Colombia",3.364874,-76.537289


In [37]:
# Base map, zoomed far out so that badly geocoded outliers are visible.
mapa = folium.Map(location=[6.243025, -75.577305], tiles='cartodbpositron', zoom_start=3)

# Heat layer over every geocoded location.
geocoded_points = locations_df_google[['lat', 'lon']]
HeatMap(data=geocoded_points, radius=10).add_to(mapa)

# One small orange circle per geocoded location.
for row_pos in range(len(locations_df_google)):
    place = locations_df_google.iloc[row_pos]
    circle = folium.Circle(
        location=[place['lat'], place['lon']],
        radius=3,
        color='orange',
    )
    circle.add_to(mapa)

mapa

# GEOGRAPHICAL DESCRIPTION

In [None]:
import geopandas as gpd
import matplotlib.pyplot as plt
import folium
from folium import Choropleth, Circle, Marker
from folium.plugins import HeatMap, MarkerCluster

In [52]:
# Rebinds `train` (first loaded from the raw CSV at the top of the notebook)
# to the file under ./data/cleaned/; every cell below works on this frame.
train = pd.read_csv('./data/cleaned/train_cleaned_imputed.csv')

### CHECKING PRICES CLASSIFICATION IN MAP
We do this on a random sample only, because plotting every row is computationally expensive.

In [47]:
# Fixed-seed random subset so the map is reproducible and quick to render.
sample = train.sample(n=10000, random_state=999)

map_prices = folium.Map(location=[6.243025, -75.577305], tiles='cartodbpositron', zoom_start=6)

# Heat layer showing where the sampled listings are concentrated.
sample_points = sample[['lat', 'lon']]
HeatMap(data=sample_points, radius=10).add_to(map_prices)

def color_producer(val):
    """Map a target value to a marker colour.

    Returns 'green' when ``val`` equals 0 and 'red' for any other value.
    """
    return 'green' if val == 0 else 'red'

# One small circle per sampled listing, coloured by its 'target' value.
for row_pos in range(len(sample)):
    listing = sample.iloc[row_pos]
    marker = folium.Circle(
        location=[listing['lat'], listing['lon']],
        radius=3,
        color=color_producer(listing['target']),
    )
    marker.add_to(map_prices)

In [48]:
# Render the sampled price map built in the previous cell.
map_prices

## QUESTION: IS DISTANCE TO SHORE IMPORTANT FOR PRICE?

https://gis.stackexchange.com/questions/426828/find-minimum-distance-to-us-coastline-in-python

http://www.naturalearthdata.com/downloads/10m-physical-vectors/

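For reference, the Haversine formula computes the great-circle distance between two points on a sphere (note that the code below instead measures planar distance after projecting the data to a metric coordinate reference system):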

https://www.geeksforgeeks.org/haversine-formula-to-find-distance-between-two-points-on-a-sphere/

First we download the coastline data for the whole world from http://www.naturalearthdata.com/downloads/10m-physical-vectors/.

We clip it with Colombia's country outline so we can focus only on Colombian shores.

Then, with `CalculateCoastDistance`, we iterate over all rows, compute the distance to the coast in kilometers, and save it in a DataFrame column and in a CSV file for further analysis.

It can take ~ 20-60 mins depending on CPU specs

### TRAIN SET

In [53]:
import geopandas as gpd
from shapely.geometry import Point

# Projected, metre-based CRS shared by the coastline and the query points.
# EPSG:3116 is MAGNA-SIRGAS / Colombia Bogota zone. Do not use EPSG:3087 here:
# it is NAD83(HARN) / Florida GDL Albers, defined for Florida, and distances
# measured in it over Colombia are distorted.
METRIC_CRS = 'EPSG:3116'


def min_distance(point, lines):
    """Return the smallest distance from ``point`` to any geometry in ``lines``.

    ``lines`` is a GeoDataFrame/GeoSeries in the same CRS as ``point``; the
    result is expressed in the units of that CRS.
    """
    return lines.distance(point).min()


# Country outline used to restrict the world coastline to Colombia.
# NOTE: `gpd.datasets` is deprecated since GeoPandas 0.14 and removed in 1.0;
# on newer versions the Natural Earth countries file must be read from disk.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
colombia = world[world['name']=='Colombia'].dissolve(by='name')
# NOTE(review): the 10m coastline is clipped with a low-resolution polygon, so
# coastline segments lying just outside that polygon may be dropped. TODO confirm.
coastline = gpd.clip(gpd.read_file('./coastline/ne_10m_coastline.shp'), colombia).to_crs(METRIC_CRS)


def CalculateCoastDistance(lat, lon):
    """Distance in kilometres from a WGS84 point to the clipped coastline.

    Parameters
    ----------
    lat, lon : float
        Latitude and longitude in degrees (EPSG:4326).

    Returns
    -------
    pandas.Series
        One-element Series (index 0) holding the minimum distance in km.
        The Series return type is kept for compatibility with existing callers.
    """
    # shapely points are (x, y), i.e. (lon, lat).
    points_df = gpd.GeoDataFrame({
        'geometry': [
            Point(lon, lat)]
            }, crs='EPSG:4326')
    # Reproject to the same metric CRS as `coastline` so distances are in metres.
    points_df = points_df.to_crs(METRIC_CRS)

    points_df['min_dist_to_coast'] = points_df.geometry.apply(min_distance, args=(coastline,))
    return points_df['min_dist_to_coast']/1000  # metres -> kilometres

  clipped.loc[


In [50]:
def _train_row_coast_distance(row):
    """Coast distance for one row, computed from its 'lat' and 'lon' values."""
    return CalculateCoastDistance(row['lat'], row['lon'])


# Slow: CalculateCoastDistance is called once per row of the training set.
train['coast_dist'] = train.apply(_train_row_coast_distance, axis=1)

In [51]:
# Persist the computed distances so the slow step above need not be repeated.
coast_dist_train = train['coast_dist']
coast_dist_train.to_csv('./data/distances_train.csv')

### TEST SET

In [54]:
# Rebinds `test` (first loaded from the raw CSV at the top of the notebook)
# to the file under ./data/cleaned/.
test = pd.read_csv('./data/cleaned/test_cleaned_imputed.csv')

In [None]:
# Same per-row coast-distance computation as for the training set (slow).
test['coast_dist'] = test.apply(
    lambda row: CalculateCoastDistance(row['lat'], row['lon']),
    axis=1,
)

In [None]:
# Persist the test-set distances alongside the training ones.
coast_dist_test = test['coast_dist']
coast_dist_test.to_csv('./data/distances_test.csv')