## The grandfather of data viz - matplotlib & seaborn
### How to build a line chart using time series data

In [1]:
import pandas as pd
from pathlib import Path
import os
import numpy

In [2]:
# new visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [87]:
# read in the data
path2data = Path.joinpath(Path.cwd().parent,'data','ufo.csv')
df = pd.read_csv(path2data)
df.head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [88]:
# check out time variable
df['Time'].describe()

count              80543
unique             68901
top       7/4/2014 22:00
freq                  45
Name: Time, dtype: object

In [89]:
df['Time'].dtype

dtype('O')

In [90]:
# convert from string to datetime format
df['newtime'] = pd.to_datetime(df['Time'])
print(df['newtime'].dtype)

datetime64[ns]


In [91]:
df[['Time','newtime']].head()

Unnamed: 0,Time,newtime
0,6/1/1930 22:00,1930-06-01 22:00:00
1,6/30/1930 20:00,1930-06-30 20:00:00
2,2/15/1931 14:00,1931-02-15 14:00:00
3,6/1/1931 13:00,1931-06-01 13:00:00
4,4/18/1933 19:00,1933-04-18 19:00:00


In [92]:
# create new variables
df['Date'] = df['newtime'].dt.date
df.head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time,newtime,Date
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00,1930-06-01 22:00:00,1930-06-01
1,Willingboro,,OTHER,NJ,6/30/1930 20:00,1930-06-30 20:00:00,1930-06-30
2,Holyoke,,OVAL,CO,2/15/1931 14:00,1931-02-15 14:00:00,1931-02-15
3,Abilene,,DISK,KS,6/1/1931 13:00,1931-06-01 13:00:00,1931-06-01
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00,1933-04-18 19:00:00,1933-04-18


In [93]:
# create a few more
df['Year'] = df['newtime'].dt.year
df.head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time,newtime,Date,Year
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00,1930-06-01 22:00:00,1930-06-01,1930
1,Willingboro,,OTHER,NJ,6/30/1930 20:00,1930-06-30 20:00:00,1930-06-30,1930
2,Holyoke,,OVAL,CO,2/15/1931 14:00,1931-02-15 14:00:00,1931-02-15,1931
3,Abilene,,DISK,KS,6/1/1931 13:00,1931-06-01 13:00:00,1931-06-01,1931
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00,1933-04-18 19:00:00,1933-04-18,1933


In [94]:
# create more
df['Month'] = df['newtime'].dt.month
df['Day'] = df['newtime'].dt.day
df['Hour'] = df['newtime'].dt.hour
df['Weekday'] = df['newtime'].dt.weekday
df.head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time,newtime,Date,Year,Month,Day,Hour,Weekday
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00,1930-06-01 22:00:00,1930-06-01,1930,6,1,22,6
1,Willingboro,,OTHER,NJ,6/30/1930 20:00,1930-06-30 20:00:00,1930-06-30,1930,6,30,20,0
2,Holyoke,,OVAL,CO,2/15/1931 14:00,1931-02-15 14:00:00,1931-02-15,1931,2,15,14,6
3,Abilene,,DISK,KS,6/1/1931 13:00,1931-06-01 13:00:00,1931-06-01,1931,6,1,13,0
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00,1933-04-18 19:00:00,1933-04-18,1933,4,18,19,1


In [95]:
# also allows date time math
# what is the extent from first to last UFO
print(df['newtime'].max())
print(df['newtime'].min())
print('difference:',df['Year'].max()-df['Year'].min())

2014-09-05 05:30:00
1930-06-01 22:00:00
difference: 84


# Create a weekday variable using .map

In [96]:
df['weekday_name'] = df['Weekday'].map({0:'Sunday',
                                         1:'Monday',
                                         2:'Tuesday',
                                         3:'Wednesday',
                                         4:'Thursday',
                                         5:'Friday',
                                         6:'Saturday'})

In [97]:
df.head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time,newtime,Date,Year,Month,Day,Hour,Weekday,weekday_name
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00,1930-06-01 22:00:00,1930-06-01,1930,6,1,22,6,Saturday
1,Willingboro,,OTHER,NJ,6/30/1930 20:00,1930-06-30 20:00:00,1930-06-30,1930,6,30,20,0,Sunday
2,Holyoke,,OVAL,CO,2/15/1931 14:00,1931-02-15 14:00:00,1931-02-15,1931,2,15,14,6,Saturday
3,Abilene,,DISK,KS,6/1/1931 13:00,1931-06-01 13:00:00,1931-06-01,1931,6,1,13,0,Sunday
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00,1933-04-18 19:00:00,1933-04-18,1933,4,18,19,1,Monday


In [98]:
df['Date'].value_counts(ascending=False).head(5)

2014-07-04    258
2010-07-04    202
2012-07-04    188
1999-11-16    187
2013-07-04    177
Name: Date, dtype: int64

In [99]:
# to create a linechart, we want a "counter" column
df['sightings'] = 1
df.head(3)

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time,newtime,Date,Year,Month,Day,Hour,Weekday,weekday_name,sightings
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00,1930-06-01 22:00:00,1930-06-01,1930,6,1,22,6,Saturday,1
1,Willingboro,,OTHER,NJ,6/30/1930 20:00,1930-06-30 20:00:00,1930-06-30,1930,6,30,20,0,Sunday,1
2,Holyoke,,OVAL,CO,2/15/1931 14:00,1931-02-15 14:00:00,1931-02-15,1931,2,15,14,6,Saturday,1


In [100]:
# now add a group by statement to cluster data around dates
dates_df = df.groupby('Date')['sightings'].sum()
df.drop('sightings', axis = 1)

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time,newtime,Date,Year,Month,Day,Hour,Weekday,weekday_name
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00,1930-06-01 22:00:00,1930-06-01,1930,6,1,22,6,Saturday
1,Willingboro,,OTHER,NJ,6/30/1930 20:00,1930-06-30 20:00:00,1930-06-30,1930,6,30,20,0,Sunday
2,Holyoke,,OVAL,CO,2/15/1931 14:00,1931-02-15 14:00:00,1931-02-15,1931,2,15,14,6,Saturday
3,Abilene,,DISK,KS,6/1/1931 13:00,1931-06-01 13:00:00,1931-06-01,1931,6,1,13,0,Sunday
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00,1933-04-18 19:00:00,1933-04-18,1933,4,18,19,1,Monday
5,Valley City,,DISK,ND,9/15/1934 15:30,1934-09-15 15:30:00,1934-09-15,1934,9,15,15,5,Friday
6,Crater Lake,,CIRCLE,CA,6/15/1935 0:00,1935-06-15 00:00:00,1935-06-15,1935,6,15,0,5,Friday
7,Alma,,DISK,MI,7/15/1936 0:00,1936-07-15 00:00:00,1936-07-15,1936,7,15,0,2,Tuesday
8,Eklutna,,CIGAR,AK,10/15/1936 17:00,1936-10-15 17:00:00,1936-10-15,1936,10,15,17,3,Wednesday
9,Hubbard,,CYLINDER,OR,6/15/1937 0:00,1937-06-15 00:00:00,1937-06-15,1937,6,15,0,1,Monday


In [101]:
# now 'drop' the sightings variable, it's not needed
dates_df.sort_values(ascending=False).head()

Date
2014-07-04    258
2010-07-04    202
2012-07-04    188
1999-11-16    187
2013-07-04    177
Name: sightings, dtype: int64

In [102]:
# conert to a dataframe
dates_df = pd.DataFrame(dates_df)
dates_df = dates_df.reset_index()
dates_df.head(5)
# dates_df.sort_values('sightings',ascending=False).head(3)

Unnamed: 0,Date,sightings
0,1930-06-01,1
1,1930-06-30,1
2,1931-02-15,1
3,1931-06-01,1
4,1933-04-18,1


In [107]:
df.drop('sightings',axis=1,inplace=True)

In [108]:
# let's merge that back into the original dataframe
df2 = pd.merge(df,dates_df, on = 'Date', how = 'left')
df2.sample(5)

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time,newtime,Date,Year,Month,Day,Hour,Weekday,weekday_name,sightings
76544,Piscataway,,OVAL,NJ,3/18/2014 10:10,2014-03-18 10:10:00,2014-03-18,2014,3,18,10,1,Monday,12
33521,Bayside,,TRIANGLE,NY,8/10/2005 1:00,2005-08-10 01:00:00,2005-08-10,2005,8,10,1,2,Tuesday,12
40085,Bridgeport,,CIRCLE,CT,5/26/2007 0:00,2007-05-26 00:00:00,2007-05-26,2007,5,26,0,5,Friday,12
69079,Chesapeake,,CIRCLE,VA,3/29/2013 21:20,2013-03-29 21:20:00,2013-03-29,2013,3,29,21,4,Thursday,17
77372,Purcellville,,OTHER,VA,5/1/2014 20:30,2014-05-01 20:30:00,2014-05-01,2014,5,1,20,3,Wednesday,38


In [117]:
# Trend over years
# !conda install -y -c anaconda seaborn=0.9.0


# import seaborn as sns
sns.set(rc={'figure.figsize':(18,6)})
sns.lineplot(x = 'Year', y = 'sightings', data = df2)

AttributeError: module 'seaborn' has no attribute 'lineplot'

In [None]:
!conda install -y -c anaconda seaborn=0.9.0

Collecting package metadata (repodata.json): done
Solving environment: \ 

In [118]:
# Trend over months
sns.lineplot(x='Month',y='sightings',data=df2);

AttributeError: module 'seaborn' has no attribute 'lineplot'

# Multiple lines on same plot

In [119]:
# create some new columns for specific colors
import numpy as np
# np.where
df['Colors Reported'].value_counts().head()

ORANGE    5216
RED       4809
GREEN     1897
BLUE      1855
YELLOW     842
Name: Colors Reported, dtype: int64

In [122]:
# Create new color variables
df['Orange'] = np.where(df['Colors Reported']=='ORANGE',1,0)
df['Red'] = np.where(df['Colors Reported']=='RED',1,0)
df['Yellow'] = np.where(df['Colors Reported']=='YELLOW',1,0)
df['Green'] = np.where(df['Colors Reported']=='GREEN',1,0)


In [None]:
df['color'] = 'other'
df,loc[df['Colors Reported']=='ORANGE','color']='orange'
df,loc[df['Colors Reported']=='RED','color']='red'
df,loc[df['Colors Reported']=='GREEN','color']='green'

In [None]:
df['sighting']=1
dates = df.groupby(['Year','color'])['sighting'].sum()
dates.head(3)
