## EDA on the UFO dataset

In [3]:
import pandas as pd
import os
from pathlib import Path

In [5]:
ufo = pd.read_csv(Path.joinpath(Path.cwd().parent, 'data', 'ufo.csv'))
ufo = pd.read_csv('../data/ufo.csv')
ufo.head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [6]:
# missing data?
ufo.isnull().sum()

City                  47
Colors Reported    63509
Shape Reported      8402
State                  0
Time                   0
dtype: int64

In [7]:
# how much?
ufo.shape

(80543, 5)

In [8]:
# columns names?
ufo.columns

Index(['City', 'Colors Reported', 'Shape Reported', 'State', 'Time'], dtype='object')

In [9]:
# let's rename those cols
ufo.rename(columns={'Colors Reported':'colors_reported', 
                    'Shape Reported':'shape_reported'}, inplace=True)
ufo.head()

Unnamed: 0,City,colors_reported,shape_reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


## Using loc to select rows and columns

In [11]:
# select one column all rows
ufo.loc[:, 'City'].head()

0                  Ithaca
1             Willingboro
2                 Holyoke
3                 Abilene
4    New York Worlds Fair
Name: City, dtype: object

In [12]:
# select 2 columns
ufo.loc[:, ['City', 'State']].head()

Unnamed: 0,City,State
0,Ithaca,NY
1,Willingboro,NJ
2,Holyoke,CO
3,Abilene,KS
4,New York Worlds Fair,NY


In [14]:
# select certain rows
ufo.loc[5:10, ['City', 'State']].head()

Unnamed: 0,City,State
5,Valley City,ND
6,Crater Lake,CA
7,Alma,MI
8,Eklutna,AK
9,Hubbard,OR


In [15]:
# .iloc method
ufo.iloc[5:10, 0:3].head()

Unnamed: 0,City,colors_reported,shape_reported
5,Valley City,,DISK
6,Crater Lake,,CIRCLE
7,Alma,,DISK
8,Eklutna,,CIGAR
9,Hubbard,,CYLINDER


## Date time features with pandas

In [17]:
# we have a date column
ufo.head(2)

Unnamed: 0,City,colors_reported,shape_reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00


In [19]:
# it's not actually a date format
type(ufo['Time'])
ufo['Time'].dtype

dtype('O')

In [20]:
# convert a string to a datetime
ufo['newtime'] = pd.to_datetime(ufo['Time'])

In [23]:
# compare
ufo[['Time', 'newtime']].head()
print(ufo['newtime'].dtype)

datetime64[ns]


In [24]:
# pandas can create a bunch of cool new variables from dates!!
ufo.head(3)

Unnamed: 0,City,colors_reported,shape_reported,State,Time,newtime
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00,1930-06-01 22:00:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00,1930-06-30 20:00:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00,1931-02-15 14:00:00


In [26]:
# year
ufo['Year']=ufo['newtime'].dt.year
ufo.head(3)

Unnamed: 0,City,colors_reported,shape_reported,State,Time,newtime,Year
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00,1930-06-01 22:00:00,1930
1,Willingboro,,OTHER,NJ,6/30/1930 20:00,1930-06-30 20:00:00,1930
2,Holyoke,,OVAL,CO,2/15/1931 14:00,1931-02-15 14:00:00,1931


In [34]:
# let's get some year stats
ufo.groupby(['Year'])['shape_reported'].count().max()

6679

In [35]:
# a few other vars
ufo['Month']=ufo['newtime'].dt.month
ufo.head()

Unnamed: 0,City,colors_reported,shape_reported,State,Time,newtime,Year,Month
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00,1930-06-01 22:00:00,1930,6
1,Willingboro,,OTHER,NJ,6/30/1930 20:00,1930-06-30 20:00:00,1930,6
2,Holyoke,,OVAL,CO,2/15/1931 14:00,1931-02-15 14:00:00,1931,2
3,Abilene,,DISK,KS,6/1/1931 13:00,1931-06-01 13:00:00,1931,6
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00,1933-04-18 19:00:00,1933,4


In [38]:
ufo['Day']=ufo['newtime'].dt.day
ufo['weekday']=ufo['newtime'].dt.weekday
ufo['Hour']=ufo['newtime'].dt.hour
ufo.head()

Unnamed: 0,City,colors_reported,shape_reported,State,Time,newtime,Year,Month,Day,weekday,Hour
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00,1930-06-01 22:00:00,1930,6,1,6,22
1,Willingboro,,OTHER,NJ,6/30/1930 20:00,1930-06-30 20:00:00,1930,6,30,0,20
2,Holyoke,,OVAL,CO,2/15/1931 14:00,1931-02-15 14:00:00,1931,2,15,6,14
3,Abilene,,DISK,KS,6/1/1931 13:00,1931-06-01 13:00:00,1931,6,1,0,13
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00,1933-04-18 19:00:00,1933,4,18,1,19


In [44]:
# what is the most common hour for seeing ufos?
ufo.groupby('Hour')['Time'].count().sort_index(ascending=False)

Hour
23     7821
22    10993
21    11837
20     8636
19     5976
18     3913
17     2512
16     1592
15     1397
14     1289
13     1311
12     1403
11     1146
10     1229
9      1031
8       825
7       913
6      1190
5      1657
4      1534
3      1981
2      2281
1      3146
0      4930
Name: Time, dtype: int64