<a href="https://colab.research.google.com/github/michal-g/Notebooks-to-Packages/blob/main/predicting-ufo-sightings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

We start by scraping the datasets from the sightings reports website. The reports portal contains a link to a table of sightings for each historical month.

In [None]:
import itertools
import re
import requests
from bs4 import BeautifulSoup


# Scrape the sighting reports from nuforc.org: the event index page links to
# one page per month, and the <td> cells of each table row on those pages are
# read in groups of len(col_labels), one sighting per group.
base_url = 'https://nuforc.org/webreports'
request_timeout = 30  # seconds; without a timeout a stalled server hangs the cell

grab = requests.get('/'.join([base_url, 'ndxevent.html']),
                    timeout=request_timeout)
# Fail loudly on an HTTP error instead of parsing the error page as data.
grab.raise_for_status()
soup = BeautifulSoup(grab.text, 'html.parser')

sightings = []
col_labels = ['Date', 'City', 'Region', 'Country', 'Shape', 'Duration',
              'Summary', 'Posted', 'Images']

# Raw string so the pattern is not subject to Python string-escape processing;
# it selects links whose text contains two digits followed by "/2000".
month_link_text = re.compile(r"[0-9]{2}/2000")

for link in soup('a', string=month_link_text):
    data = link.get('href')
    if data is None:
        # An anchor without a target cannot be followed.
        continue

    grab_date = requests.get('/'.join([base_url, data]),
                             timeout=request_timeout)
    grab_date.raise_for_status()
    date_soup = BeautifulSoup(grab_date.text, 'html.parser')

    for row in date_soup('tr'):
        cols = row.find_all('td')

        # Rows without any <td> cell yield an empty range and add nothing.
        # A trailing group shorter than col_labels gives a record holding
        # only the leading labels.
        for start in range(0, len(cols), len(col_labels)):
            group = cols[start:start + len(col_labels)]
            sightings.append({lbl: col.string
                              for lbl, col in zip(col_labels, group)})


In [None]:
import pandas as pd


# Two-letter postal codes of the 50 US states plus DC; sightings whose Region
# is not one of these are dropped below.
valid_states = {
    'AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI',
    'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN',
    'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH',
    'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',
    'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY'
    }

sights_df = pd.DataFrame(sightings)

# Keep only sightings reported from a recognised US state.
is_us_state = ((sights_df.Country == 'USA')
               & sights_df.Region.isin(valid_states))

# Take an explicit copy: assigning a column to the result of a .loc selection
# otherwise raises SettingWithCopyWarning on pandas versions without
# copy-on-write.
sights_df = sights_df.loc[is_us_state, :].copy()

# The scraped Date field is split on whitespace and only its first token
# (parsed as month/day/two-digit-year) is kept; anything after it is dropped.
sights_df['Date'] = pd.to_datetime([dt.split()[0] for dt in sights_df['Date']],
                                   format='%m/%d/%y')

print(sights_df)


In [None]:
import plotly.express as px


# Number of sightings per state across the whole filtered table.
counts = sights_df.groupby('Region').size()
state_codes = list(map(str, counts.index))

# Colour each state from white (no sightings) to red (the busiest state).
fig = px.choropleth(
    locations=state_codes,
    locationmode="USA-states",
    scope="usa",
    color=counts.values,
    range_color=[0, counts.max()],
    color_continuous_scale=['white', 'red'],
)
fig.show()


In [None]:
import imageio
from IPython.display import Image
from pathlib import Path


# Render one US choropleth per day of sightings, save each as a PNG, and
# stitch the frames into an animated GIF shown as the cell output.
counts = sights_df.groupby(['Date', 'Region']).size()
plt_files = list()

# Create the output directory from Python rather than a shell escape so the
# cell also works outside IPython and on systems without `mkdir -p`.
plot_dir = Path("map-plots")
plot_dir.mkdir(parents=True, exist_ok=True)

for dt, dt_counts in counts.groupby('Date'):
    # Spelled-out ISO date format; the '%F' shorthand is a C-library
    # extension that is not available on every platform.
    date_lbl = dt.strftime('%Y-%m-%d')
    state_locs = [str(x) for x in dt_counts.index.get_level_values('Region')]

    # A fixed colour range keeps the scale identical across all frames.
    fig = px.choropleth(locations=state_locs, locationmode="USA-states",
                        title=date_lbl, scope='usa',
                        color=dt_counts.values, range_color=[0, 100],
                        color_continuous_scale=['white', 'black'])

    plt_file = plot_dir / f"counts_{date_lbl}.png"
    fig.write_image(plt_file, format='png')
    # Read the frame back as an image array for the GIF writer.
    plt_files.append(imageio.v2.imread(plt_file))

gif_file = plot_dir / "counts.gif"
imageio.mimsave(gif_file, plt_files, duration=0.03)
Image(filename=str(gif_file))


In [None]:
# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plt
# Default size, in inches (width, height), for figures created from here on.
plt.rcParams["figure.figsize"] = (14, 9)

import numpy as np

from skits.preprocessing import ReversibleImputer
from skits.pipeline import ForecasterPipeline
from skits.feature_extraction import (AutoregressiveTransformer,
                                      SeasonalTransformer)

from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.ensemble import (RandomForestClassifier,
                              GradientBoostingClassifier,
                              RandomForestRegressor)

from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit


# Forecasting pipeline: scale the input, derive autoregressive and seasonal
# features, impute and rescale them, then fit a linear regressor.
feature_union = FeatureUnion([
    ('ar_features', AutoregressiveTransformer(num_lags=1)),
    ('seasonal_features', SeasonalTransformer(seasonal_period=1)),
])
pipeline_steps = [
    ('pre_scaler', StandardScaler()),
    ('features', feature_union),
    ('post_feature_imputer', ReversibleImputer()),
    ('post_feature_scaler', StandardScaler()),
    ('regressor', LinearRegression(fit_intercept=True)),
]
pipeline = ForecasterPipeline(pipeline_steps)

# Daily sighting counts for California, evaluated with 4 time-ordered splits.
tscv = TimeSeriesSplit(n_splits=4)
ca_counts = counts.loc[(slice(None), 'CA')]
ca_dates = ca_counts.index.get_level_values('Date').values.reshape(-1, 1)
ca_values = ca_counts.values

real_values = []
pred_values = []

for train_index, test_index in tscv.split(ca_counts):
    # Refit on this split's training portion, then predict its test portion.
    pipeline.fit(ca_dates[train_index], ca_values[train_index])
    test_dates = ca_dates[test_index]
    test_counts = ca_values[test_index]
    preds = pipeline.predict(test_dates, to_scale=True)

    real_values.extend(test_counts.flatten().tolist())
    pred_values.extend(preds.flatten().tolist())

    # Observed counts in black, predictions in red, on the shared axes.
    plt.plot(test_dates, test_counts, color='black')
    plt.plot(test_dates, preds, color='red')

# Root-mean-square error pooled over the test portions of all splits.
errors = np.array(real_values) - np.array(pred_values)
rmse_val = (errors ** 2).mean() ** 0.5
print(f"RMSE: {rmse_val:.3f}")
