In [1]:
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim
from functools import lru_cache
import warnings

# Notebook-wide configuration.
# Every warning category is suppressed from here on, so deprecation notices
# from pandas/geopy will not show up in cell outputs.
warnings.filterwarnings('ignore')
# Use the full option name: the abbreviated 'display.max_column' is resolved by
# pattern matching and stops working if pandas ever adds another option that
# the abbreviation also matches.
pd.set_option('display.max_columns', 100)

In [2]:
# Read the pre-processed listings table and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='../dataset/processed_data.csv')
data.head(n=5)

Unnamed: 0,HouseType,Price,Size,Bedrooms,Bathrooms,Level,Furnished,Area,City,ElectricityMeter,BuiltinKitchenAppliances,WaterMeter,Elevator,Balcony,NaturalGas,Landline,Security,CentralAC,CoveredParking,PetsAllowed,PrivateGarden,MaidsRoom,Pool
0,Apartment,35000,165,3,3,0,1,Madinaty,Cairo,1,1,1,0,1,1,0,1,1,0,0,1,0,0
1,Apartment,77000,225,3,3,2,1,Uptown Cairo,Cairo,0,0,0,0,1,0,0,1,0,1,1,0,1,0
2,Apartment,30000,280,4,3,1,0,Nasr City,Cairo,1,0,1,1,1,1,1,1,0,0,0,0,0,0
3,Apartment,1500,120,3,2,2,1,Nasr City,Cairo,1,1,1,0,1,1,0,0,1,0,0,0,0,0
4,Apartment,160000,14,2,3,1,1,Katameya Heights,Cairo,1,0,1,0,1,0,0,1,1,0,1,0,0,0


In [3]:
# Single Nominatim client reused for every lookup made by get_coord_lat_lon.
# NOTE(review): Nominatim's usage policy reportedly expects an
# application-specific user agent — confirm this generic value is acceptable.
geolocator = Nominatim(user_agent="Python3.11.3")

@lru_cache(maxsize=None)
def get_coord_lat_lon(full_addr: str):
    """Geocode an address string into a ``(latitude, longitude)`` tuple.

    The address is handed to the module-level ``geolocator`` exactly as given;
    no clean-up (such as stripping bracketed text) is applied. Results are
    memoized per distinct address string, so asking for the same address again
    does not issue another geocoder request. Exceptions raised by the geocoder
    propagate to the caller and are not cached.

    Args:
        full_addr: Free-form address, e.g. ``"Nasr City, Cairo"``.

    Returns:
        ``(latitude, longitude)`` taken from the geocoder's match, or
        ``(np.nan, np.nan)`` when the geocoder returns no match.
    """
    pt = geolocator.geocode(full_addr)
    return (pt.latitude, pt.longitude) if pt else (np.nan, np.nan)

In [4]:
# Create a lookup table to save time: each distinct area is geocoded once per
# city instead of once per listing.
def _build_region_coords(regions, city):
    """Map every area name in ``regions`` to ``[latitude, longitude]``.

    ``get_coord_lat_lon`` returns ``(latitude, longitude)``, so index 0 of each
    stored pair is the latitude and index 1 the longitude. Areas for which it
    returns NaNs are stored as ``[nan, nan]``.

    Args:
        regions: Iterable of area names belonging to ``city``.
        city: City name appended to each area to form the geocoder query.

    Returns:
        dict of area name -> ``[latitude, longitude]``.
    """
    coords = {}
    for region in regions:
        latitude, longitude = get_coord_lat_lon(f'{region}, {city}')
        coords[region] = [latitude, longitude]
    return coords


cairoRegions = data.query('City == "Cairo"')['Area'].unique().tolist()
gizaRegions = data.query('City == "Giza"')['Area'].unique().tolist()

cairoCoords = _build_region_coords(cairoRegions, 'Cairo')
gizaCoords = _build_region_coords(gizaRegions, 'Giza')

In [5]:
cairoCoords

{'Madinaty': [30.093319, 31.63791604173324],
 'Uptown Cairo': [30.02362815, 31.304425051498075],
 'Nasr City': [30.0521177, 31.3422045],
 'Katameya Heights': [29.99539705, 31.40535759085526],
 'Fifth Square': [30.051085999999998, 31.537079494992852],
 'EL Shouyfat': [nan, nan],
 'Zahraa Al Maadi': [29.9647127, 31.2825293],
 'Heliopolis': [30.1005985, 31.3329136],
 'Rehab City': [30.063584, 31.4889939],
 'Villette': [nan, nan],
 'Zamalek': [30.055459550000002, 31.219631655798278],
 'Cairo Festival City': [30.026992800000002, 31.40906325945414],
 'Lake View': [30.0959915, 31.642971],
 'Eastown': [30.012330300000002, 31.51555386421819],
 'Mountain View Hyde Park': [29.98737805, 31.5554474],
 'Porto New Cairo': [30.0641413, 31.42413429130798],
 'El Banafseg 12': [30.0665034, 31.4872965],
 '1st Settlement': [30.0632671, 31.4544799],
 'Gardenia': [30.14120045, 31.649164616027797],
 'Village Gate': [14.550879, 121.1362882],
 'Downtown Cairo': [30.0887612, 31.2898195],
 'Lake view Residence': 

In [6]:
def _region_coords(region):
    """Return the stored ``[latitude, longitude]`` pair for ``region``, or None.

    The Cairo table is consulted before the Giza table, so an area name present
    in both resolves to its Cairo entry. Only tables that this notebook actually
    builds are consulted; the earlier Alexandria branches referenced
    ``alexRegions`` / ``alexCoords``, which are never defined here and raised
    ``NameError`` for any area missing from the Cairo and Giza tables.
    """
    for coords_by_region in (cairoCoords, gizaCoords):
        if region in coords_by_region:
            return coords_by_region[region]
    return None


# Latitude Function
def region_geo_coder_latitude(region):
    """Look up the latitude stored for an area name.

    Args:
        region: Area name as it appears in the lookup tables.

    Returns:
        The latitude (index 0 of the stored pair), or ``np.nan`` when the area
        is in neither lookup table.
    """
    coords = _region_coords(region)
    return coords[0] if coords is not None else np.nan


# Longitude Function
def region_geo_coder_longtitude(region):
    """Look up the longitude stored for an area name.

    Args:
        region: Area name as it appears in the lookup tables.

    Returns:
        The longitude (index 1 of the stored pair), or ``np.nan`` when the area
        is in neither lookup table.
    """
    coords = _region_coords(region)
    return coords[1] if coords is not None else np.nan


# Derive the coordinate columns from each listing's area name: 'Lat' first,
# then 'Long', each produced by its own lookup function.
for column_name, coordinate_lookup in (
    ('Lat', region_geo_coder_latitude),
    ('Long', region_geo_coder_longtitude),
):
    data[column_name] = data['Area'].apply(coordinate_lookup)

In [7]:
# Per-column count of missing values, to see how many rows lack coordinates.
data.isna().sum(axis=0)

HouseType                     0
Price                         0
Size                          0
Bedrooms                      0
Bathrooms                     0
Level                         0
Furnished                     0
Area                          0
City                          0
ElectricityMeter              0
BuiltinKitchenAppliances      0
WaterMeter                    0
Elevator                      0
Balcony                       0
NaturalGas                    0
Landline                      0
Security                      0
CentralAC                     0
CoveredParking                0
PetsAllowed                   0
PrivateGarden                 0
MaidsRoom                     0
Pool                          0
Lat                         655
Long                        655
dtype: int64

In [8]:
# Discard every row that has a missing value in any column.
data = data.dropna(axis=0, how='any')

In [9]:
# Write the geocoded table to disk without the DataFrame index column.
data.to_csv(path_or_buf='../dataset/geocoded_data.csv', index=False)