In [3]:
# Install packages
%pip install kaggle boto3 python-dotenv
# Imports
import pandas as pd
import os
import kaggle
import zipfile
from mappings import state_province_to_country, country_full_names, country_code_mapping
import boto3
from dotenv import load_dotenv

# Load environment variables
load_dotenv()
API_ACCESS_KEY=os.getenv("API_ACCESS_KEY")
API_SECRET_ACCESS_KEY=os.getenv("API_SECRET_ACCESS_KEY")

# Download and extract dataset
!kaggle datasets download -d camnugent/ufo-sightings-around-the-world

with zipfile.ZipFile('ufo-sightings-around-the-world.zip', 'r') as file:
    file.extractall()

ufo = pd.read_csv('ufo_sighting_data.csv')

ufo['Date_time'] = pd.to_datetime(ufo['Date_time'], format='%m/%d/%Y %H:%M', errors='coerce')
# Drop rows missing latitude or longitude values or date
ufo.dropna(subset=['Date_time', 'latitude', 'longitude'], inplace=True)

missing_country = ufo['country'].isnull().sum()
print(f"Number of missing values in 'country': {missing_country}")
# There are 9670 missing values in the country column.  
total_non_null = ufo['country'].count()
print(f"Total number of non-null vlaues in 'country': {total_non_null}")

# When I explored the data, I found a lot of country names missing in cases where the state or province is present. 
# Let's use state/province to fill in missing fields in the country column.
def fill_countries_from_state(ufo):
    for index, row in ufo.iterrows():
       if pd.isnull(row['country']):
        state_province = row['state/province']
        country = state_province_to_country.get(state_province, None)
        if country:
            ufo.at[index, 'country'] = country
    return ufo

# Check if there are any remaining missing values in the "country" column
ufo=fill_countries_from_state(ufo)
missing_country = ufo['country'].isnull().sum()
print(f"Number of missing values in 'country' after imputation: {missing_country}")


# Initialize AWS Location Service 
client = boto3.client('location',
                      region_name='us-east-1',
                      aws_access_key_id=API_ACCESS_KEY,
                      aws_secret_access_key=API_SECRET_ACCESS_KEY)

def get_full_country_name(lat, lon):
    try:
        print(f"Processing lat: {lat}, lon: {lon}")
        lat_float = float(lat)
        lon_float = float(lon)
        response = client.search_place_index_for_position(
            IndexName='UFO-Sightings-Place-Index',
            Position=[lon_float, lat_float]
        )
        for result in response.get('Results', []):
            country = result['Place'].get('Country')
            if country:
                full_country_name = country_full_names.get(country.lower(), country.lower())
                return full_country_name
        return None
    except Exception as e:
        print(f"Error for lat: {lat}, lon: {lon}. Error: {e}")
        return None
            
# Fill in missing "country" values based on latitude and longitude
ufo.loc[ufo['country'].isnull(), 'country'] = ufo[ufo['country'].isnull()].apply(
    lambda row: get_full_country_name(row['latitude'], row['longitude']), axis=1)

# Check if there are any remaining missing values in the "country" column
ufo['country'] = ufo['country'].replace(country_code_mapping)  
ufo['country'] = ufo['country'].replace(country_full_names)  
ufo['UFO_shape'].fillna('unknown', inplace=True) 
ufo.to_csv('ufo_sightings_filled.csv', index=False)

Note: you may need to restart the kernel to use updated packages.
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/urllib3/connectionpool.py", line 714, in urlopen
    httplib_response = self._make_request(
                       ^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/urllib3/connectionpool.py", line 403, in _make_request
    self._validate_conn(conn)
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/urllib3/connectionpool.py", line 1053, in _validate_conn
    conn.connect()
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/urllib3/connection.py", line 419, in connect
    self.sock = ssl_wrap_socket(
                ^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/urllib3/util/ssl_.py", line 449, in ssl_wrap_socket


  ufo = pd.read_csv('ufo_sighting_data.csv')


Number of missing values in 'country': 9533
Total number of non-null vlaues in 'country': 70105
Number of missing values in 'country' after imputation: 3328
Processing lat: 32.364167, lon: -64.678611
Processing lat: -38.662334, lon: 178.017649
Processing lat: 55.183333, lon: 59.65
Processing lat: -33.093373, lon: 151.588982
Processing lat: 53.070884, lon: -2.736506
Processing lat: 55.288131, lon: 23.957728
Processing lat: 59.900209, lon: 5.282347
Processing lat: 64.066667, lon: -21.95
Processing lat: 27.477936, lon: -99.549573
Processing lat: -27.597553, lon: 152.7455
Processing lat: 46.62794, lon: 14.30899
Processing lat: -27.088079, lon: 152.928057
Processing lat: 33.590355, lon: 130.401716
Processing lat: 14.058324, lon: 108.277199
Processing lat: -33.137551, lon: 81.826172
Processing lat: 39.92077, lon: 32.85411
Processing lat: 51.312374, lon: 3.132768
Processing lat: 31.84694, lon: 120.872862
Processing lat: -37.653829, lon: 145.517006
Processing lat: 22.396428, lon: 114.109497
Pr