### Import required libraries

In [None]:
import sqlite3 as db
import os
import re
import xml.etree.ElementTree as et
import pandas as pd

### Load database with names of companies and addresses

In [None]:
# Read the whole `dataset1` table into a DataFrame. The connection is released
# in `finally` so it does not stay open when the query fails (e.g. the table
# does not exist yet).
conn = db.connect('database.db')
try:
    df = pd.read_sql_query('select * from dataset1', conn)
finally:
    conn.close()

## Geocoding with Nominatim: geocoding in subsets of 100 companies to handle large quantities of data

In [None]:
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent="myApp")

# The public Nominatim service allows at most one request per second, so every
# lookup goes through a rate limiter. Service errors are retried by the limiter
# and then re-raised, so the loop below can report them instead of hiding them.
geocode = RateLimiter(
    geolocator.geocode,
    min_delay_seconds=1,
    swallow_exceptions=False,
)

# Define the starting row and subset size
start_row = 0
subset_size = 100
total_rows = len(df)

# Loop through the DataFrame in subsets; each subset is written to the database
# as soon as it is finished, so earlier subsets are kept if a later one fails.
for start in range(start_row, total_rows, subset_size):
    end = min(start + subset_size, total_rows)
    df_subset = df.iloc[start:end].copy()

    # Add new columns for latitude, longitude, and full address. They start as
    # None and stay None for every address that cannot be geocoded.
    df_subset['location_lat'] = None
    df_subset['location_long'] = None
    df_subset['location_address'] = None

    for i in df_subset.index:
        address = df_subset.loc[i, 'Address']
        try:
            location = geocode(address)
        except Exception as e:
            # Best effort: report the failure and continue with the next row.
            # `Exception` (not a bare `except`) keeps Ctrl+C working.
            print(f"Error geocoding address {address}: {e}")
            continue

        if location:
            df_subset.loc[i, 'location_lat'] = location.latitude
            df_subset.loc[i, 'location_long'] = location.longitude
            df_subset.loc[i, 'location_address'] = location.address

    # Append this subset to the results table; always release the connection.
    con = db.connect('database.db')
    try:
        df_subset.to_sql("dataset1_locations", con, if_exists="append", index=False)
        con.commit()
    finally:
        con.close()

    print(f"Subset of rows {start} to {end} has been covered.")

## Now geocode addresses not found with Nominatim by using the Google Maps API

### First, data is loaded from the dataset and the companies for which coordinates were not found are selected

In [None]:
con = db.connect('database.db')
try:
    # Load the table data into a pandas DataFrame. This is a read-only query,
    # so there is nothing to commit afterwards.
    df = pd.read_sql_query('SELECT * FROM dataset1_locations', con)
finally:
    # Close the connection even if the query fails
    con.close()

In [None]:
# Select the rows whose 'location_address' is still missing, i.e. the
# companies for which no coordinates were found in the previous step
missing_location = df['location_address'].isna()
df_with_nan_location = df[missing_location]
df_with_nan_location

In [None]:
import googlemaps
from datetime import datetime

# Initialize the Google Maps client with your API key. The key is read from the
# GOOGLE_MAPS_API_KEY environment variable so it does not have to be stored in
# the notebook; the placeholder is only used when the variable is not set.
gmaps = googlemaps.Client(
    key=os.environ.get('GOOGLE_MAPS_API_KEY', 'Your_Google_Maps_API_Key')
)

# Function to geocode an address and return location details using Google Maps API
def geocode_address_google(address, client=None):
    """Geocode *address* with the Google Maps API and return its location details.

    Args:
        address: Free-form address string to look up.
        client: Optional object exposing ``geocode(address)``. When omitted,
            the module-level ``gmaps`` client is used.

    Returns:
        A four-element ``pd.Series`` holding latitude, longitude, formatted
        address and postal code of the first result. The postal code is
        ``None`` when the first result has no ``postal_code`` component. All
        four values are ``None`` when the address is blank or not a string,
        when the lookup returns no results, or when the lookup raises.
    """
    # Missing values (None / NaN) and blank strings cannot be geocoded, so
    # skip the API request for them.
    if not isinstance(address, str) or not address.strip():
        print(f"No address to geocode: {address!r}")
        return pd.Series([None, None, None, None])

    if client is None:
        client = gmaps

    try:
        # Geocode the address
        geocode_result = client.geocode(address)

        if not geocode_result:
            print(f"No results found for address: {address}")
            return pd.Series([None, None, None, None])

        # Only the first (best) match is used
        best_match = geocode_result[0]

        # Extract latitude and longitude
        location = best_match['geometry']['location']
        latitude = location['lat']
        longitude = location['lng']

        # Extract formatted address
        full_address = best_match['formatted_address']

        # Extract postal code: first component tagged 'postal_code', if any
        postal_code = next(
            (
                component['long_name']
                for component in best_match['address_components']
                if 'postal_code' in component['types']
            ),
            None,
        )

        return pd.Series([latitude, longitude, full_address, postal_code])

    except Exception as e:
        # Best effort: one failing address must not abort the whole run
        print(f"Error geocoding address {address}: {e}")
        return pd.Series([None, None, None, None])

# Ensure you are working with a copy if necessary
df_google = df_with_nan_location.copy()

# Apply the geocoding function to each address in the DataFrame. The results
# are collected into an explicit four-column DataFrame: with zero rows to
# geocode, `Series.apply` would return an empty Series and the multi-column
# assignment would raise "Columns must be same length as key".
geo_columns = ['location_lat', 'location_long', 'location_address', 'zip']
geocoded = pd.DataFrame(
    [list(geocode_address_google(address)) for address in df_google['Address']],
    columns=geo_columns,
    index=df_google.index,
)
df_google[geo_columns] = geocoded

## Finally, concatenate both dataframes

In [None]:
# `df` still contains the rows whose 'location_address' is missing, and
# `df_google` is a re-geocoded copy of exactly those rows. Concatenating `df`
# as a whole would therefore list each of them twice, so only the rows that
# already have a location are taken from `df`.
resolved_df = df[df['location_address'].notna()]
merged_df = pd.concat([resolved_df, df_google])

# Keep only rows that have a value in 'Postal_code'.
# NOTE(review): 'Postal_code' is presumably a column of the source table, while
# postal codes returned by Google are stored in 'zip' - confirm this is the
# intended filter column.
df_cleaned = merged_df[pd.notna(merged_df['Postal_code'])]