In [2]:
## Which state has the most sightings, overall and over time?
## What is the most frequently reported shape in each state?
## How do sightings change over time? Are there trends?
## How dense are sightings around airports and Air Force bases (AFB)?

In [3]:
import os
import pandas as pd
import json
import requests
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as st
from geopy.geocoders import Nominatim
from shapely.geometry import Point

In [4]:
# Read the sightings table and the airport table from the resources folder.
# low_memory=False makes pandas parse each file in one pass instead of chunks.
ufo_df, airport_df = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ('resources/ufo.csv', 'resources/airports.csv')
)

In [5]:
# Report how many rows and columns the sightings table has.
nRow = ufo_df.shape[0]
nCol = ufo_df.shape[1]
print(f'This csv has {nRow} rows and {nCol} columns')

This csv has 80332 rows and 11 columns


In [6]:
# Break the combined 'datetime' string into separate 'date' and 'time'
# columns, place them first in the frame, and discard the original column.

date_time_parts = ufo_df['datetime'].str.split(' ', expand=True)
ufo_df[['date', 'time']] = date_time_parts

# Re-insert each new column at its target position: 'date' at 0, 'time' at 1.
for position, label in enumerate(('date', 'time')):
    ufo_df.insert(position, label, ufo_df.pop(label))

# The combined column is no longer needed once it has been split.
del ufo_df['datetime']

ufo_df.head()

Unnamed: 0,date,time,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
0,11/11/1906,0:00,wien (austria),,,other,10800,3 h,The oldest professional photo of a UFO object ...,12/23/2002,48.208174,16.373819
1,3655,,kirksville (near),mo,us,disk,120,minutes,Historical sighting (1903 - 1913) Northern Mis...,9/15/2005,40.1947222,-92.583056
2,6/1/1910,15:00,wills point,tx,us,cigar,120,2 minutes,Cigar shaped object moving from West to East,4/16/2005,32.7091667,-96.008056
3,4/5/1916,13:00,france (above; from aircraft),,,cigar,60,about 1 min.,((NUFORC Note: Possible hoax. PD)) Saw 3 ci...,3/9/2004,46.227638,2.213749
4,6/11/1920,21:00,cicero,in,us,unknown,60,1 minute,((NUFORC Note: Probable hoax. Note date. PD...,5/12/2009,40.1238889,-86.013333


In [7]:
# Give the duration column an identifier-friendly name, then turn the
# duration and coordinate columns (read as object) into float64.
# errors='coerce' replaces any value that cannot be parsed with NaN.

ufo_df.rename(columns={'duration (seconds)': 'duration_sec'}, inplace=True)

for numeric_col in ('duration_sec', 'latitude', 'longitude'):
    ufo_df[numeric_col] = pd.to_numeric(ufo_df[numeric_col], errors='coerce')

In [8]:
# Check the column dtypes after coercing duration_sec, latitude and longitude
ufo_df.dtypes

date                     object
time                     object
city                     object
state                    object
country                  object
shape                    object
duration_sec            float64
duration (hours/min)     object
comments                 object
date posted              object
latitude                float64
longitude               float64
dtype: object

In [9]:
# Count sightings per reported shape, most common first
ufo_df['shape'].value_counts()

light        16565
triangle      7865
circle        7608
fireball      6208
other         5649
unknown       5584
sphere        5387
disk          5213
oval          3733
formation     2457
cigar         2057
changing      1962
flash         1328
rectangle     1297
cylinder      1283
diamond       1178
chevron        952
egg            759
teardrop       750
cone           316
cross          233
delta            7
round            2
crescent         2
dome             1
changed          1
pyramid          1
flare            1
hexagon          1
Name: shape, dtype: int64

In [10]:
# Count sightings per value of the 'state' column, most common first
ufo_df['state'].value_counts()

ca    9655
wa    4268
fl    4200
tx    3677
ny    3219
      ... 
nf      25
nt      20
pe      17
yt      13
yk       7
Name: state, Length: 67, dtype: int64

In [11]:
# Count sightings per value of the 'city' column, most common first
ufo_df['city'].value_counts()

seattle                  525
phoenix                  454
portland                 374
las vegas                368
los angeles              353
                        ... 
100 mile (canada)          1
leamington (canada)        1
lousa (portugal)           1
san bernardino county      1
inverloch (australia)      1
Name: city, Length: 19900, dtype: int64

In [12]:
# List the airport table's columns and their dtypes
airport_df.dtypes

id                     int64
ident                 object
type                  object
name                  object
latitude_deg         float64
longitude_deg        float64
coordinates           object
elevation_ft         float64
continent             object
country_name          object
iso_country           object
region_name           object
iso_region            object
local_region          object
municipality          object
scheduled_service      int64
gps_code              object
iata_code             object
local_code            object
home_link             object
wikipedia_link        object
keywords              object
score                  int64
last_updated          object
dtype: object

In [13]:
# Use the same coordinate column names ('lat' / 'lon') in both frames.
ufo_df = ufo_df.rename({'latitude': 'lat', 'longitude': 'lon'}, axis='columns')
airport_df = airport_df.rename({'latitude_deg': 'lat', 'longitude_deg': 'lon'}, axis='columns')
# Print non-null counts and dtypes for each frame so that missing values and
# non-numeric columns are visible.
ufo_df.info()
print('\n XXXXXXXXXXXXXXXXXXXXXXX\n')
airport_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80332 entries, 0 to 80331
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   date                  80332 non-null  object 
 1   time                  79638 non-null  object 
 2   city                  80332 non-null  object 
 3   state                 74535 non-null  object 
 4   country               70662 non-null  object 
 5   shape                 78400 non-null  object 
 6   duration_sec          80329 non-null  float64
 7   duration (hours/min)  80332 non-null  object 
 8   comments              80317 non-null  object 
 9   date posted           80332 non-null  object 
 10  lat                   80331 non-null  float64
 11  lon                   80332 non-null  float64
dtypes: float64(3), object(9)
memory usage: 7.4+ MB

 XXXXXXXXXXXXXXXXXXXXXXX

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29084 entries, 0 to 29083
Data columns (total 24

In [14]:
def dist(lat1, long1, lat2, long2):
    """
    Haversine (great-circle) distance in kilometres between two points
    given as latitude/longitude in decimal degrees.

    The arguments are passed straight to NumPy functions, so scalars and
    array-likes are both accepted.
    """
    # Work in radians from here on.
    phi1, lam1, phi2, lam2 = (np.radians(deg) for deg in (lat1, long1, lat2, long2))
    delta_lam = lam2 - lam1
    delta_phi = phi2 - phi1
    # Haversine term for the central angle between the two points.
    hav = np.sin(delta_phi/2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lam/2)**2
    central_angle = 2 * np.arcsin(np.sqrt(hav))
    # Scale by the Earth's radius, 6371 km.
    return 6371* central_angle

In [15]:
def find_nearest(lat, long):
    """
    Return the name of the airport in `airport_df` closest to a point.

    Parameters
    ----------
    lat, long : float
        Latitude and longitude of the point, in decimal degrees.

    Returns
    -------
    The 'name' value of the airport row with the smallest `dist` to the
    point, or NaN when the point has a missing coordinate or no airport
    has usable coordinates.
    """
    # A missing coordinate makes every distance NaN, and idxmin() on an
    # all-NaN series cannot produce a valid row label.
    if pd.isna(lat) or pd.isna(long):
        return np.nan
    # dist() is built from NumPy ufuncs, so the whole airport table can be
    # evaluated in one vectorised call instead of a Python call per airport.
    distances = dist(lat, long, airport_df['lat'], airport_df['lon'])
    if not distances.notna().any():
        return np.nan
    # idxmin() skips airports whose own coordinates are missing.
    return airport_df.loc[distances.idxmin(), 'name']

In [None]:
# Look up the closest airport for every sighting (one find_nearest call per
# row) and store its name in a new 'name' column.
nearest_airport = ufo_df.apply(
    lambda sighting: find_nearest(sighting['lat'], sighting['lon']),
    axis='columns',
)
ufo_df['name'] = nearest_airport
ufo_df.head()

In [None]:
# Attach the coordinates of each sighting's nearest airport.
#
# NOTE(review): airport names are presumably not unique in airports.csv --
# confirm. A plain merge on 'name' emits one row per same-named airport, which
# multiplies sightings. Each sighting is therefore tagged with a temporary id
# and only the closest same-named candidate is kept afterwards.
# Airports without a name are excluded because pandas matches NaN keys to NaN
# keys, which would pair unnamed airports with sightings that have no match.
airport_coords = airport_df[['name', 'lat', 'lon']].dropna(subset=['name'])
merged = pd.merge(
    ufo_df.assign(_sighting=np.arange(len(ufo_df))),
    airport_coords,
    on='name',
    how='left',
)
# Both frames have 'lat'/'lon', so the merge suffixes them with _x (sighting)
# and _y (airport); give them the m_* / h_* names instead.
merged = merged.rename(columns = {'lat_x':'m_lat','lon_x':'m_lon','lat_y':'h_lat','lon_y':'h_lon'})
# The nearest airport overall is also the nearest among its namesakes, so
# sorting by distance within each sighting and keeping the first row selects it.
# Rows with a NaN distance sort last.
merged['_km'] = dist(merged['m_lat'], merged['m_lon'], merged['h_lat'], merged['h_lon'])
ufo_df = (
    merged
    .sort_values(['_sighting', '_km'])
    .drop_duplicates(subset='_sighting')
    .drop(columns=['_sighting', '_km'])
    .reset_index(drop=True)
)
ufo_df.head()