In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [55]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# UFO Sightings Dataset

In [3]:
ufo = pd.read_csv('./ufo_sightings_large.csv')
print(ufo.shape)
ufo.head(3)

(4935, 11)


Unnamed: 0,date,city,state,country,type,seconds,length_of_time,desc,recorded,lat,long
0,11/3/2011 19:21,woodville,wi,us,unknown,1209600.0,2 weeks,Red blinking objects similar to airplanes or s...,12/12/2011,44.9530556,-92.291111
1,10/3/2004 19:05,cleveland,oh,us,circle,30.0,30sec.,Many fighter jets flying towards UFO,10/27/2004,41.4994444,-81.695556
2,9/25/2009 21:00,coon rapids,mn,us,cigar,0.0,,Green&#44 red&#44 and blue pulses of light tha...,12/12/2009,45.12,-93.2875


In [4]:
ufo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4935 entries, 0 to 4934
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   date            4935 non-null   object 
 1   city            4926 non-null   object 
 2   state           4516 non-null   object 
 3   country         4255 non-null   object 
 4   type            4776 non-null   object 
 5   seconds         4935 non-null   float64
 6   length_of_time  4792 non-null   object 
 7   desc            4932 non-null   object 
 8   recorded        4935 non-null   object 
 9   lat             4935 non-null   object 
 10  long            4935 non-null   float64
dtypes: float64(2), object(9)
memory usage: 424.2+ KB


## Checking column types

In [6]:
# change the type of seconds to float
ufo['seconds'] = ufo['seconds'].astype('float')

# change the date column to type datetime
ufo['date'] = pd.to_datetime(ufo['date'])

# check the column types
print(ufo[['seconds', 'date']].dtypes)

seconds           float64
date       datetime64[ns]
dtype: object


In [8]:
ufo.head(3)

Unnamed: 0,date,city,state,country,type,seconds,length_of_time,desc,recorded,lat,long
0,2011-11-03 19:21:00,woodville,wi,us,unknown,1209600.0,2 weeks,Red blinking objects similar to airplanes or s...,12/12/2011,44.9530556,-92.291111
1,2004-10-03 19:05:00,cleveland,oh,us,circle,30.0,30sec.,Many fighter jets flying towards UFO,10/27/2004,41.4994444,-81.695556
2,2009-09-25 21:00:00,coon rapids,mn,us,cigar,0.0,,Green&#44 red&#44 and blue pulses of light tha...,12/12/2009,45.12,-93.2875


## Dropping missing data

In [12]:
# check how many values are missing in length_of_time,
# state, and type columns
print(ufo[['length_of_time', 'state', 'type']].isna().sum())

# keep only rows where said columns are not null
ufo_no_missing = ufo[ufo['length_of_time'].notna() &
                     ufo['state'].notna() &
                     ufo['type'].notna()]

# print out the shape of new dataset
print(ufo_no_missing.shape)

length_of_time    143
state             419
type              159
dtype: int64
(4283, 11)


# UFO Sample Data

In [15]:
ufo_sample = pd.read_csv('./ufo_sample.csv')
print(ufo_sample.shape)
ufo.head(3)

(1866, 11)


Unnamed: 0,date,city,state,country,type,seconds,length_of_time,desc,recorded,lat,long
0,2011-11-03 19:21:00,woodville,wi,us,unknown,1209600.0,2 weeks,Red blinking objects similar to airplanes or s...,12/12/2011,44.9530556,-92.291111
1,2004-10-03 19:05:00,cleveland,oh,us,circle,30.0,30sec.,Many fighter jets flying towards UFO,10/27/2004,41.4994444,-81.695556
2,2009-09-25 21:00:00,coon rapids,mn,us,cigar,0.0,,Green&#44 red&#44 and blue pulses of light tha...,12/12/2009,45.12,-93.2875


In [16]:
ufo_sample_ = ufo_sample.copy()

In [19]:
def return_minutes(time_string):
    # use \d+ to grab digits
    pattern = re.compile(r'\d+')

    # use match on the patter and column
    num = re.match(pattern, time_string)
    if num is not None:
        return int(num.group(0))


# apply the extraction to the length_of_time column
ufo_sample_['minutes'] = ufo_sample_['length_of_time'].apply(lambda row: return_minutes(row))

ufo_sample_[['length_of_time', 'minutes']].head()

Unnamed: 0,length_of_time,minutes
0,about 5 minutes,
1,10 minutes,10.0
2,2 minutes,2.0
3,2 minutes,2.0
4,5 minutes,5.0


## Identifying features for standardization

In [21]:
# check the variance of the seconds and minutes columns
print(ufo_sample_[['seconds', 'minutes']].var())

# log normalize the seconds column
ufo_sample_['seconds_log'] = np.log(ufo_sample_['seconds'])

# print out the variance of just the seconds_log column
print(ufo_sample_['seconds_log'].var())

seconds    424087.417474
minutes       117.546372
dtype: float64
1.1223923881183004


In [22]:
ufo_sample_.shape

(1866, 13)

## Encoding categorical variables

In [24]:
# use pandas to encode us values as 1 and others as 0
ufo_sample_['country_enc'] = ufo_sample_['country'].apply(lambda x: 1 if x=='us' else 0)

# print the number of unique type values
print(len(ufo_sample_['type'].unique()))

# create a one-hot encoded set of type values
type_set = pd.get_dummies(ufo_sample_['type'])

# concatenate this set back to ufo_sample_
ufo_sample_ = pd.concat([ufo_sample_, type_set], axis=1)

21


In [25]:
ufo_sample_.shape

(1866, 35)

## Features from dates

In [27]:
ufo_sample_['date'] = pd.to_datetime(ufo_sample_['date'])

In [33]:
# look at the first 5 rows of the date column
print(ufo_sample_['date'].head())

# extract month from date
ufo_sample_['month'] = ufo_sample_['date'].apply(lambda row: row.month)

# extract year from date
ufo_sample_['year'] = ufo_sample_['date'].apply(lambda row: row.year)

# take a look at the head of all three columns
print(ufo_sample_[['date', 'month', 'year']].head())

0   2002-11-21 05:45:00
1   2012-06-16 23:00:00
2   2013-06-09 00:00:00
3   2013-04-26 23:27:00
4   2013-09-13 20:30:00
Name: date, dtype: datetime64[ns]
                 date  month  year
0 2002-11-21 05:45:00     11  2002
1 2012-06-16 23:00:00      6  2012
2 2013-06-09 00:00:00      6  2013
3 2013-04-26 23:27:00      4  2013
4 2013-09-13 20:30:00      9  2013


## Text vectorization

In [39]:
# take a look at the head of the desc field
print(ufo_sample_['desc'].head())

# create a tfidf vectorizer
vec = TfidfVectorizer()

desc_tfidf = vec.fit_transform(ufo_sample_['desc'])

#look at the number of columns this creates
print(desc_tfidf.shape)

0    It was a large&#44 triangular shaped flying ob...
1    Dancing lights that would fly around and then ...
2    Brilliant orange light or chinese lantern at o...
3    Bright red light moving north to north west fr...
4    North-east moving south-west. First 7 or so li...
Name: desc, dtype: object
(1866, 3422)


## Selecting the ideal dataset

In [44]:
def return_weights(vocab, original_vocab, vector, vector_index, top_n):
    zipped = dict(zip(vector[vector_index].indices,
                      vector[vector_index].data))
    # transform zipped dict into a series
    zipped_series = pd.Series({vocab[i]:zipped[i] for i in vector[vector_index].indices})
    
    # sort the series to pull out the top n weighted words
    zipped_index = zipped_series.sort_values(ascending=False)[:top_n].index
    
    return [original_vocab[i] for i in zipped_index]

def words_to_filter(vocab, original_vocab, vector, top_n):
    filter_list = []
    for i in range(0, vector.shape[0]):
    
    # Here we'll call return_weights and extend the list we're creating
        filtered = return_weights(vocab, original_vocab, vector, i, top_n)
        filter_list.extend(filtered)
    
    # return the list in a set so we don't get duplicate word indices
    return set(filter_list)

In [45]:
vocab_ufo = pd.read_csv('./vocab_ufo.csv', index_col=0)
vocab_ufo = vocab_ufo.to_dict()['0']

In [47]:
# check the correlation between seconds, seconds_log, minutes
print(ufo_sample_[['seconds', 'seconds_log', 'minutes']].corr())

# make a list of features to drop
to_drop = ['city', 'country', 'lat', 'long', 'state', # location-related besides country_enc
           'seconds', 'minutes', 'length_of_time',    # keep seconds_log
           'date', 'recorded',                        # month, year kept
           'desc']                                    # vectorized

ufo_dropped = ufo_sample_.drop(to_drop, axis=1)
print(ufo_dropped.shape)

# also filter some words out of the text vector
filtered_words = words_to_filter(vocab_ufo, 
                                 vec.vocabulary_,
                                 desc_tfidf,
                                 top_n=4)

              seconds  seconds_log   minutes
seconds      1.000000     0.853371  0.980341
seconds_log  0.853371     1.000000  0.824493
minutes      0.980341     0.824493  1.000000
(1866, 26)


## Modeling the UFO dataset

In [58]:
X = ufo_dropped.drop(['type', 'country_enc'], axis=1) # ignore type for now
y = ufo_dropped['country_enc']

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    stratify=y,
                                                    random_state=42)

# fit to a knn
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

print(f'test acc: {knn.score(X_test, y_test): .1%}')

test acc:  87.6%


In [60]:
# now, use the tfidf to predict the type of sighting

# use the list of filtered words to filter the text vector
filtered_text = desc_tfidf[:, list(filtered_words)]

X_train, X_test, y_train, y_test = train_test_split(filtered_text.toarray(), y,
                                                    stratify=y,
                                                    random_state=42)

# fit a naive bayes
nb = GaussianNB()
nb.fit(X_train, y_train)

print(f'test acc: {nb.score(X_test, y_test): .1%}')

test acc:  73.2%


The model, in this case, performs poorly. This may be a case where iteration would be necessary to figure out what subset of text improves the model, and if perhaps any of the other features are useful in predicting `type`.