## Pandas for Exploratory Data Analysis

National UFO Reporting Center data:  
* main page: http://www.nuforc.org/webreports.html  
* file: ufo.csv

In [1]:
import pandas as pd
import os
from pathlib import Path

# Dataset of UFO sightings

In [2]:
# Find a path to our data dictionary and file
home = Path.cwd()
datadir = Path.joinpath(home.parent, 'data')
path2file = Path.joinpath(datadir, 'ufo.csv')

In [3]:
# Read in the dataframe
ufo = pd.read_csv(path2file)

In [4]:
# read ufo.csv into a DataFrame called 'ufo'
ufo_data_url = r'https://raw.githubusercontent.com/justmarkham/DAT7/master/data/ufo.csv'
ufo = pd.read_csv(ufo_data_url)

In [5]:
# check the shape of the DataFrame
ufo.shape

(80543, 5)

In [6]:
# what are the three most common colors reported?
ufo['Colors Reported'].value_counts()[0:2]

ORANGE    5216
RED       4809
Name: Colors Reported, dtype: int64

In [7]:
# rename any columns with spaces so that they don't contain spaces
ufo.rename(columns={'Shape Reported':'Shape_Reported'}, inplace=True)
ufo.columns

Index(['City', 'Colors Reported', 'Shape_Reported', 'State', 'Time'], dtype='object')

In [8]:
ufo.columns = [col.replace(' ','_') for col in ufo.columns]
ufo.columns

Index(['City', 'Colors_Reported', 'Shape_Reported', 'State', 'Time'], dtype='object')

In [9]:
# for reports in VA, what's the most common city?
ufo[ufo['State'] == 'VA'].City.value_counts()[0:1]

Virginia Beach    110
Name: City, dtype: int64

In [10]:
# print a DataFrame containing only reports from Arlington, VA
ufo[(ufo['City'] == 'Arlington') & (ufo['State'] == 'VA')]

Unnamed: 0,City,Colors_Reported,Shape_Reported,State,Time
202,Arlington,GREEN,OVAL,VA,7/13/1952 21:00
6300,Arlington,,CHEVRON,VA,5/5/1990 21:40
10278,Arlington,,DISK,VA,5/27/1997 15:30
14527,Arlington,,OTHER,VA,9/10/1999 21:41
17984,Arlington,RED,DISK,VA,11/19/2000 22:00
21201,Arlington,GREEN,FIREBALL,VA,1/7/2002 17:45
22633,Arlington,,LIGHT,VA,7/26/2002 1:15
22780,Arlington,,LIGHT,VA,8/7/2002 21:00
25066,Arlington,,CIGAR,VA,6/1/2003 22:34
27398,Arlington,,VARIOUS,VA,12/13/2003 2:00


In [11]:
# count the number of missing values in each column
ufo.isnull().sum()

City                  47
Colors_Reported    63509
Shape_Reported      8402
State                  0
Time                   0
dtype: int64

In [12]:
# how many rows remain if you drop all rows with any missing values?
len(ufo.dropna())

15510

### Selecting Multiple Columns and Filtering Rows

#### select multiple columns

In [13]:
my_cols = ['City', 'State']     # create a list of column names...

In [14]:
ufo[my_cols]                    # ...and use that list to select columns

Unnamed: 0,City,State
0,Ithaca,NY
1,Willingboro,NJ
2,Holyoke,CO
3,Abilene,KS
4,New York Worlds Fair,NY
5,Valley City,ND
6,Crater Lake,CA
7,Alma,MI
8,Eklutna,AK
9,Hubbard,OR


In [15]:
ufo[['City', 'State']]          # or, combine into a single step

Unnamed: 0,City,State
0,Ithaca,NY
1,Willingboro,NJ
2,Holyoke,CO
3,Abilene,KS
4,New York Worlds Fair,NY
5,Valley City,ND
6,Crater Lake,CA
7,Alma,MI
8,Eklutna,AK
9,Hubbard,OR


#### use loc to select columns by name

In [16]:
ufo.loc[:, 'City']              # colon means "all rows", then select one column

0                      Ithaca
1                 Willingboro
2                     Holyoke
3                     Abilene
4        New York Worlds Fair
5                 Valley City
6                 Crater Lake
7                        Alma
8                     Eklutna
9                     Hubbard
10                    Fontana
11                   Waterloo
12                     Belton
13                     Keokuk
14                  Ludington
15                Forest Home
16                Los Angeles
17                  Hapeville
18                     Oneida
19                 Bering Sea
20                   Nebraska
21                        NaN
22                        NaN
23                  Owensboro
24                 Wilderness
25                  San Diego
26                 Wilderness
27                     Clovis
28                 Los Alamos
29               Ft. Duschene
                 ...         
80513              Manahawkin
80514             New Bedford
80515     

In [17]:
ufo.loc[:, ['City', 'State']]   # select two columns

Unnamed: 0,City,State
0,Ithaca,NY
1,Willingboro,NJ
2,Holyoke,CO
3,Abilene,KS
4,New York Worlds Fair,NY
5,Valley City,ND
6,Crater Lake,CA
7,Alma,MI
8,Eklutna,AK
9,Hubbard,OR


In [18]:
ufo.loc[:, 'City':'State']      # select a range of columns

Unnamed: 0,City,Colors_Reported,Shape_Reported,State
0,Ithaca,,TRIANGLE,NY
1,Willingboro,,OTHER,NJ
2,Holyoke,,OVAL,CO
3,Abilene,,DISK,KS
4,New York Worlds Fair,,LIGHT,NY
5,Valley City,,DISK,ND
6,Crater Lake,,CIRCLE,CA
7,Alma,,DISK,MI
8,Eklutna,,CIGAR,AK
9,Hubbard,,CYLINDER,OR


#### loc can also filter rows by "name" (the index)

In [19]:
ufo.loc[0, :]                   # row 0, all columns

City                       Ithaca
Colors_Reported               NaN
Shape_Reported           TRIANGLE
State                          NY
Time               6/1/1930 22:00
Name: 0, dtype: object

In [20]:
ufo.loc[0:2, :]                 # rows 0/1/2, all columns

Unnamed: 0,City,Colors_Reported,Shape_Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00


In [21]:
ufo.loc[0:2, 'City':'State']    # rows 0/1/2, range of columns

Unnamed: 0,City,Colors_Reported,Shape_Reported,State
0,Ithaca,,TRIANGLE,NY
1,Willingboro,,OTHER,NJ
2,Holyoke,,OVAL,CO


#### use iloc to filter rows and select columns by integer position

In [22]:
ufo.iloc[:, [0, 3]]             # all rows, columns in position 0/3

Unnamed: 0,City,State
0,Ithaca,NY
1,Willingboro,NJ
2,Holyoke,CO
3,Abilene,KS
4,New York Worlds Fair,NY
5,Valley City,ND
6,Crater Lake,CA
7,Alma,MI
8,Eklutna,AK
9,Hubbard,OR


In [23]:
ufo.iloc[:, 0:4]                # all rows, columns in position 0/1/2/3

Unnamed: 0,City,Colors_Reported,Shape_Reported,State
0,Ithaca,,TRIANGLE,NY
1,Willingboro,,OTHER,NJ
2,Holyoke,,OVAL,CO
3,Abilene,,DISK,KS
4,New York Worlds Fair,,LIGHT,NY
5,Valley City,,DISK,ND
6,Crater Lake,,CIRCLE,CA
7,Alma,,DISK,MI
8,Eklutna,,CIGAR,AK
9,Hubbard,,CYLINDER,OR


In [24]:
ufo.iloc[0:3, :]                # rows in position 0/1/2, all columns

Unnamed: 0,City,Colors_Reported,Shape_Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00


### Other Commonly Used Features

In [25]:
# replace all instances of a value in a column (must match entire value)
ufo.State.replace('Fl', 'FL', inplace=True)

In [26]:
# string methods are accessed via 'str'
ufo.State.str.upper()                               # converts to uppercase
ufo.head(2)

Unnamed: 0,City,Colors_Reported,Shape_Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00


In [35]:
ufo.Colors_Reported.str.contains('RED', na='False').sample(5) # checks for a substring

45491    False
33450    False
72880    False
69987    False
5124     False
Name: Colors_Reported, dtype: object

In [36]:
# convert a string to the datetime format
ufo['Time'] = pd.to_datetime(ufo.Time)
ufo.head(2)

Unnamed: 0,Time,City,Colors_Reported,Shape_Reported,State
0,1930-06-01 22:00:00,Ithaca,,TRIANGLE,NY
1,1930-06-30 20:00:00,Willingboro,,OTHER,NJ


In [47]:
ufo['Year']=ufo['Time'].dt.year 
ufo['Month']=ufo['Time'].dt.month 
ufo['Day']=ufo['Time'].dt.day 
ufo['Hour']=ufo['Time'].dt.hour 
ufo['Weekday']=ufo['Time'].dt.weekday 
ufo.head()
# datetime format exposes convenient attributes

Unnamed: 0,Time,City,Colors_Reported,Shape_Reported,State,Year,Day,Month,Weekday,Hour
0,1930-06-01 22:00:00,Ithaca,,TRIANGLE,NY,1930,1,6,6,22
1,1930-06-30 20:00:00,Willingboro,,OTHER,NJ,1930,30,6,0,20
2,1931-02-15 14:00:00,Holyoke,,OVAL,CO,1931,15,2,6,14
3,1931-06-01 13:00:00,Abilene,,DISK,KS,1931,1,6,0,13
4,1933-04-18 19:00:00,New York Worlds Fair,,LIGHT,NY,1933,18,4,1,19


In [39]:
(ufo.Time.max() - ufo.Time.min()).days  # also allows you to do datetime "math"

30776

In [31]:
# setting and then removing an index
ufo.set_index('Time', inplace=True)
ufo.head(2)

Unnamed: 0_level_0,City,Colors_Reported,Shape_Reported,State
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1930-06-01 22:00:00,Ithaca,,TRIANGLE,NY
1930-06-30 20:00:00,Willingboro,,OTHER,NJ


In [32]:
ufo.reset_index(inplace=True)