In [1]:
import geopandas as gpd
import numpy as np
import pandas as pd
import json
import pyproj
from shapely.geometry import Point, Polygon

In [2]:
filename = "client_data.json"

In [3]:
with open(filename, 'r') as file:
    client_data = json.load(file)

In [4]:
df_clients = pd.DataFrame(client_data)

In [5]:
df_clients.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 1000 non-null   object
 1   name               1000 non-null   object
 2   email              1000 non-null   object
 3   phone              1000 non-null   object
 4   address            1000 non-null   object
 5   city               1000 non-null   object
 6   neighborhood       1000 non-null   object
 7   country            1000 non-null   object
 8   date_of_birth      1000 non-null   object
 9   gender             1000 non-null   object
 10  occupation         1000 non-null   object
 11  company            1000 non-null   object
 12  website            1000 non-null   object
 13  registration_date  1000 non-null   object
 14  last_login         1000 non-null   object
 15  customer_since     1000 non-null   object
dtypes: object(16)
memory usage: 125.1+ KB


In [6]:
df_distritos = gpd.read_file('Distritos/Distritos.shp')

In [7]:
df_distritos.head()

Unnamed: 0,Shape_Leng,COD_DIS,COD_DIS_TX,NOMBRE,DISTRI_MAY,DISTRI_MT,geometry
0,0.0,1,1,Centro,CENTRO,CENTRO,"POLYGON ((441184.784 4473200.780, 441181.244 4..."
1,0.0,2,2,Arganzuela,ARGANZUELA,ARGANZUELA,"POLYGON ((440272.919 4472969.567, 440350.318 4..."
2,0.0,3,3,Retiro,RETIRO,RETIRO,"POLYGON ((443652.149 4473312.600, 443560.739 4..."
3,0.0,4,4,Salamanca,SALAMANCA,SALAMANCA,"POLYGON ((444075.178 4476611.827, 444057.808 4..."
4,0.0,5,5,Chamartín,CHAMARTIN,CHAMARTÍN,"POLYGON ((442904.114 4481525.212, 442901.020 4..."


In [58]:
df_distritos.crs

<Projected CRS: EPSG:25830>
Name: ETRS89 / UTM zone 30N
Axis Info [cartesian]:
- E[east]: Easting (metre)
- N[north]: Northing (metre)
Area of Use:
- name: Europe between 6°W and 0°W: Faroe Islands offshore; Ireland - offshore; Jan Mayen - offshore; Norway including Svalbard - offshore; Spain - onshore and offshore.
- bounds: (-6.0, 35.26, 0.01, 80.49)
Coordinate Operation:
- name: UTM zone 30N
- method: Transverse Mercator
Datum: European Terrestrial Reference System 1989 ensemble
- Ellipsoid: GRS 1980
- Prime Meridian: Greenwich

In [56]:
print(df_distritos.iloc[0,-1])

POLYGON ((441184.78419999965 4473200.7798999995, 441181.2441999996 4473184.229900001, 441178.8841000004 4473160.9098000005, 441140.8543999996 4473176.9202, 441126.35450000037 4473191.3303, 441121.81460000016 4473194.600400001, 441116.50459999964 4473195.610400001, 441104.6847000001 4473193.340500001, 441003.9352000002 4473158.5813, 440993.6052000001 4473163.6214000005, 440783.9763000002 4473093.142899999, 440778.21630000044 4473091.0030000005, 440656.12689999957 4473047.614, 440596.2271999996 4473026.5044, 440519.37760000024 4473001.3049, 440517.9288999997 4473000.7947, 440385.94820000045 4472954.316, 440350.3184000002 4472954.0163, 440272.91889999993 4472969.5669, 440050.9003999997 4473024.6888, 440000.9519999996 4473038.5801, 439890.40139999986 4473069.3301, 439794.2921000002 4473093.231000001, 439762.43230000045 4473099.351299999, 439646.44299999997 4473126.032299999, 439613.03330000024 4473137.672599999, 439581.69350000005 4473146.7828, 439442.74450000003 4473225.554099999, 439312.

In [60]:
original_crs = 'EPSG:25830'
target_crs = 'EPSG:4326'

In [62]:
df_distritos = df_distritos.to_crs(target_crs)

In [63]:
df_freq = (
    df_clients
    .groupby("neighborhood", as_index=False)
    .agg({"id": "count"})
    .rename(columns={
        "neighborhood": "NOMBRE", "id": "num_clientes"
    })
    .merge(df_distritos, on=["NOMBRE"])
)

In [64]:
df_freq["num_clientes"].sum()

1000

In [95]:
def generate_points_within_polygon(polygon, num_points):
    minx, miny, maxx, maxy = polygon.bounds
    points = []
    while len(points) < num_points:
        x = np.random.uniform(minx, maxx)
        y = np.random.uniform(miny, maxy)
        point = Point(x, y)
        if polygon.contains(point):
            points.append((y, x))
    return points

In [96]:
df_freq["points"] = df_freq.apply(lambda x: generate_points_within_polygon(x["geometry"], x["num_clientes"]), axis=1)

In [125]:
df_explode = df_freq.explode("points")
df_explode

Unnamed: 0,NOMBRE,num_clientes,Shape_Leng,COD_DIS,COD_DIS_TX,DISTRI_MAY,DISTRI_MT,geometry,points
0,Arganzuela,36,0.0,2,02,ARGANZUELA,ARGANZUELA,"POLYGON ((-3.70389 40.40520, -3.70297 40.40506...","(40.403096403315075, -3.6944630183803495)"
0,Arganzuela,36,0.0,2,02,ARGANZUELA,ARGANZUELA,"POLYGON ((-3.70389 40.40520, -3.70297 40.40506...","(40.39262784617288, -3.6866321231431445)"
0,Arganzuela,36,0.0,2,02,ARGANZUELA,ARGANZUELA,"POLYGON ((-3.70389 40.40520, -3.70297 40.40506...","(40.39739032269834, -3.6785676909074594)"
0,Arganzuela,36,0.0,2,02,ARGANZUELA,ARGANZUELA,"POLYGON ((-3.70389 40.40520, -3.70297 40.40506...","(40.39623069062191, -3.687501478420444)"
0,Arganzuela,36,0.0,2,02,ARGANZUELA,ARGANZUELA,"POLYGON ((-3.70389 40.40520, -3.70297 40.40506...","(40.40016093985922, -3.7112325145509453)"
...,...,...,...,...,...,...,...,...,...
20,Villaverde,50,0.0,17,17,VILLAVERDE,VILLAVERDE,"POLYGON ((-3.70366 40.36356, -3.70324 40.36356...","(40.34763586084796, -3.673041875362317)"
20,Villaverde,50,0.0,17,17,VILLAVERDE,VILLAVERDE,"POLYGON ((-3.70366 40.36356, -3.70324 40.36356...","(40.32630844405838, -3.6812612545515133)"
20,Villaverde,50,0.0,17,17,VILLAVERDE,VILLAVERDE,"POLYGON ((-3.70366 40.36356, -3.70324 40.36356...","(40.33338336342195, -3.6963161688237127)"
20,Villaverde,50,0.0,17,17,VILLAVERDE,VILLAVERDE,"POLYGON ((-3.70366 40.36356, -3.70324 40.36356...","(40.33644238195856, -3.668359544181401)"


In [98]:
from geopy.geocoders import Nominatim
from shapely.geometry import Point


In [146]:
# Initialize a geocoder (Nominatim in this case)
geolocator = Nominatim(user_agent="geo_reverse")

# Function to get address from coordinates
def get_address(point):
    #x,y = point
    #print(point)
    address = geolocator.reverse(point, timeout=10).raw["address"]  # Reverse geocoding with latitude and longitude
    print(address)
    return (
        address.get("road", ""), address.get("house_number", ""), address.get("suburb", ""), address.get("state", ""), address.get("postcode", ""), address.get("country", "")
    )

In [138]:
df_freq.iloc[0,-1][0]

(40.403096403315075, -3.6944630183803495)

In [139]:

point = df_freq.iloc[0,-1][0]
point

(40.403096403315075, -3.6944630183803495)

In [130]:
loc = geolocator.reverse(point, timeout=10) # Lat, Long to reverse geocode # (40.7484284, -73.9856546198733)

In [145]:
loc.raw["address"].get("road", "")

'Calle de Palos de la Frontera'

In [147]:
get_address(point)

{'house_number': '29', 'road': 'Calle de Palos de la Frontera', 'quarter': 'Palos de Moguer', 'suburb': 'Arganzuela', 'city': 'Madrid', 'state': 'Comunidad de Madrid', 'ISO3166-2-lvl4': 'ES-MD', 'postcode': '28045', 'country': 'España', 'country_code': 'es'}


('Calle de Palos de la Frontera',
 '29',
 'Arganzuela',
 'Comunidad de Madrid',
 '28045',
 'España')

In [None]:
df_explode

In [148]:
df_explode.apply(lambda x: get_address(x["points"]), axis=1)

{'house_number': '29', 'road': 'Calle de Palos de la Frontera', 'quarter': 'Palos de Moguer', 'suburb': 'Arganzuela', 'city': 'Madrid', 'state': 'Comunidad de Madrid', 'ISO3166-2-lvl4': 'ES-MD', 'postcode': '28045', 'country': 'España', 'country_code': 'es'}
{'road': 'Calle de la Antracita', 'quarter': 'Legazpi', 'suburb': 'Arganzuela', 'city': 'Madrid', 'state': 'Comunidad de Madrid', 'ISO3166-2-lvl4': 'ES-MD', 'postcode': '28045', 'country': 'España', 'country_code': 'es'}
{'road': 'Calle del Almez', 'quarter': 'Atocha', 'suburb': 'Arganzuela', 'city': 'Madrid', 'state': 'Comunidad de Madrid', 'ISO3166-2-lvl4': 'ES-MD', 'postcode': '28045', 'country': 'España', 'country_code': 'es'}
{'road': 'Calle Nebulosas', 'quarter': 'Delicias', 'suburb': 'Arganzuela', 'city': 'Madrid', 'state': 'Comunidad de Madrid', 'ISO3166-2-lvl4': 'ES-MD', 'postcode': '28045', 'country': 'España', 'country_code': 'es'}
{'house_number': '11', 'road': 'Paseo de Yeserías', 'quarter': 'Las Acacias', 'suburb': 'A

KeyboardInterrupt: 