# Pandas module: Series

In [4]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame   #we will use Series/DataFrame quite often, so load these two as well

In [5]:
#A Series is a one-dimensional array (much like a NumPy array)
#in which every value carries a label; the labels form the index.
obj = Series(data=[3, 6, 9, 12])

#no index was given, so the values are labelled 0, 1, 2, 3 (left column of the output)
obj

0     3
1     6
2     9
3    12
dtype: int64

In [6]:
obj.values  #the underlying data as a NumPy array, without the index labels

array([ 3,  6,  9, 12], dtype=int64)

In [9]:
obj.index  #the index (labels) of the Series; here the default range 0..3

RangeIndex(start=0, stop=4, step=1)

In [10]:
#Build a Series with an explicit index:
#each value is labelled with a country name instead of a position.
ww2_cas = Series(
    [8700000, 4300000, 3000000, 2100000, 400000],
    index=['USSR', 'Germany', 'China', 'Japan', 'USA'],
)

ww2_cas

USSR       8700000
Germany    4300000
China      3000000
Japan      2100000
USA         400000
dtype: int64

In [11]:
#An index label can be used to look up its value, much like a dictionary key.
#Casualties for the USA:
ww2_cas['USA']

400000

In [12]:
#NumPy-style array operations work on a Series and keep the index labels.
#Boolean mask: keep only the countries with more than 4 million casualties.

ww2_cas[ww2_cas > 4000000]

USSR       8700000
Germany    4300000
dtype: int64

In [13]:
#`in` tests membership against the index labels of the Series

'USSR' in ww2_cas

True

In [None]:
#A Series behaves much like a dictionary (label -> value), so it can be converted into one

In [14]:
#to_dict() maps each index label to its value
ww2_dict = ww2_cas.to_dict()
ww2_dict

{'USSR': 8700000,
 'Germany': 4300000,
 'China': 3000000,
 'Japan': 2100000,
 'USA': 400000}

In [16]:
#The conversion is reversible: feed the dictionary into Series()
#and its keys become the index, its values become the data.
ww2_series = Series(ww2_dict)
ww2_series

USSR       8700000
Germany    4300000
China      3000000
Japan      2100000
USA         400000
dtype: int64

In [17]:
#labels to use as an index below; 'Argentina' is deliberately not in ww2_dict
countries = ['China', 'Germany', 'Japan', 'USA', 'USSR', 'Argentina']

In [19]:
#Passing index= together with a dict selects and orders the entries by those labels.
obj2 = Series(ww2_dict, index = countries)
#'Argentina' is not a key of ww2_dict, so its value is NaN (missing);
#note that the dtype is now float64 instead of int64.
obj2

China        3000000.0
Germany      4300000.0
Japan        2100000.0
USA           400000.0
USSR         8700000.0
Argentina          NaN
dtype: float64

In [20]:
#pandas has built-in functions to find missing (NaN / null) values;
#isnull returns True wherever a value is missing
pd.isnull(obj2)

China        False
Germany      False
Japan        False
USA          False
USSR         False
Argentina     True
dtype: bool

In [21]:
#The opposite check also exists: notnull returns True wherever a value is present
pd.notnull(obj2)

China         True
Germany       True
Japan         True
USA           True
USSR          True
Argentina    False
dtype: bool

In [22]:
ww2_series  #redisplayed so it can be compared with obj2

USSR       8700000
Germany    4300000
China      3000000
Japan      2100000
USA         400000
dtype: int64

In [None]:
#Adding two Series together aligns their values by index label automatically

In [23]:
obj2 #ww2_series and obj2 have the same index labels, except for the extra 'Argentina' in obj2

China        3000000.0
Germany      4300000.0
Japan        2100000.0
USA           400000.0
USSR         8700000.0
Argentina          NaN
dtype: float64

In [24]:
#Values are added label by label, not by position.
#'Argentina' has no value to add (it is NaN in obj2), so its result is NaN.
ww2_series + obj2

Argentina           NaN
China         6000000.0
Germany       8600000.0
Japan         4200000.0
USA            800000.0
USSR         17400000.0
dtype: float64

In [25]:
#A Series can be given a name; it is shown in the footer of the output

obj2.name = "World War 2 Casualties"
obj2

China        3000000.0
Germany      4300000.0
Japan        2100000.0
USA           400000.0
USSR         8700000.0
Argentina          NaN
Name: World War 2 Casualties, dtype: float64

In [26]:
#The index itself can be named as well.
#Similar to Excel: this puts a header on the column of row labels.
obj2.index.name = 'Countries'
obj2

Countries
China        3000000.0
Germany      4300000.0
Japan        2100000.0
USA           400000.0
USSR         8700000.0
Argentina          NaN
Name: World War 2 Casualties, dtype: float64

# DataFrames

In [28]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

In [29]:
import webbrowser #standard-library module that opens a URL in a web browser
website = 'http://en.wikipedia.org/wiki/NFL_win-loss_records'
#this only opens the page so the table can be copied by hand; nothing is downloaded into Python
webbrowser.open(website)

True

In [31]:
#pandas can parse tabular text straight from the system clipboard.
#NOTE: this cell depends on what is currently in the clipboard -- copy the header and
#the first five rows of the table from the page opened above before running it,
#otherwise the outputs that follow will not be reproduced.
nfl_frame = pd.read_clipboard()

In [32]:
nfl_frame  #display what was read from the clipboard

Unnamed: 0,Rank,Team,GP,Won,Lost,Tied,Pct.,First NFL Season,Division
0,1,Dallas Cowboys,898,512,380,6,0.573,1960,NFC East
1,2,Chicago Bears,1386,761,583,42,0.564,1920,NFC North
2,3,Green Bay Packers,1352,743,571,38,0.564,1921,NFC North
3,4,New England Patriots[b],900,500,391,9,0.561,1960,AFC East
4,5,Miami Dolphins,816,452,360,4,0.556,1966,AFC East


In [None]:
#A DataFrame is like an Excel spreadsheet:
#the column names run across the top,
#and the row index runs down the left side (here generated automatically by pandas)

In [34]:
#Grab the column names:
#.columns returns an Index holding the name of every column
nfl_frame.columns

Index(['Rank', 'Team', 'GP', 'Won', 'Lost', 'Tied', 'Pct.', 'First NFL Season',
       'Division'],
      dtype='object')

In [35]:
#Two ways to grab a specific column (the result is a Series).
#1) attribute access, usable when the column name is a single word:
nfl_frame.Team

0             Dallas Cowboys
1              Chicago Bears
2          Green Bay Packers
3    New England Patriots[b]
4             Miami Dolphins
Name: Team, dtype: object

In [36]:
#2) bracket access with the column name as a string; needed when the name is more
#than one word (use the exact names shown by nfl_frame.columns)
nfl_frame['First NFL Season']

0    1960
1    1920
2    1921
3    1960
4    1966
Name: First NFL Season, dtype: int64

In [37]:
#Grab multiple columns by passing a list of column names in the columns argument.
#This creates a NEW DataFrame with the requested columns; nfl_frame itself is unchanged.
DataFrame(nfl_frame, columns=['Team','First NFL Season','GP'])

Unnamed: 0,Team,First NFL Season,GP
0,Dallas Cowboys,1960,898
1,Chicago Bears,1920,1386
2,Green Bay Packers,1921,1352
3,New England Patriots[b],1960,900
4,Miami Dolphins,1966,816


In [38]:
#What if we ask for a column that doesn't exist?
#No error is raised: the 'Stadium' column is created and filled with null values
DataFrame(nfl_frame, columns=['Team','First NFL Season','GP','Stadium'])

Unnamed: 0,Team,First NFL Season,GP,Stadium
0,Dallas Cowboys,1960,898,
1,Chicago Bears,1920,1386,
2,Green Bay Packers,1921,1352,
3,New England Patriots[b],1960,900,
4,Miami Dolphins,1966,816,


In [39]:
#Next: grabbing rows. Redisplay the full frame for reference.
nfl_frame

Unnamed: 0,Rank,Team,GP,Won,Lost,Tied,Pct.,First NFL Season,Division
0,1,Dallas Cowboys,898,512,380,6,0.573,1960,NFC East
1,2,Chicago Bears,1386,761,583,42,0.564,1920,NFC North
2,3,Green Bay Packers,1352,743,571,38,0.564,1921,NFC North
3,4,New England Patriots[b],900,500,391,9,0.561,1960,AFC East
4,5,Miami Dolphins,816,452,360,4,0.556,1966,AFC East


In [41]:
#head(n) returns the first n rows; called without an argument it returns the first 5

nfl_frame.head(3)

Unnamed: 0,Rank,Team,GP,Won,Lost,Tied,Pct.,First NFL Season,Division
0,1,Dallas Cowboys,898,512,380,6,0.573,1960,NFC East
1,2,Chicago Bears,1386,761,583,42,0.564,1920,NFC North
2,3,Green Bay Packers,1352,743,571,38,0.564,1921,NFC North


In [42]:
#tail(n) returns the last n rows; called without an argument it returns the last 5
nfl_frame.tail(3)

Unnamed: 0,Rank,Team,GP,Won,Lost,Tied,Pct.,First NFL Season,Division
2,3,Green Bay Packers,1352,743,571,38,0.564,1921,NFC North
3,4,New England Patriots[b],900,500,391,9,0.561,1960,AFC East
4,5,Miami Dolphins,816,452,360,4,0.556,1966,AFC East


In [44]:
#Retrieve a single row by its index label;
#the result is a Series holding every column's value for that row.
nfl_frame.loc[3]   #older pandas tutorials use .ix here; .ix was deprecated and later removed from pandas

Rank                                      4
Team                New England Patriots[b]
GP                                      900
Won                                     500
Lost                                    391
Tied                                      9
Pct.                                  0.561
First NFL Season                       1960
Division                           AFC East
Name: 3, dtype: object

In [45]:
#Assigning a single value to a column fills every row with it
#(the 'Stadium' column is created here because it did not exist yet)
nfl_frame['Stadium'] = "Levi's Stadium"
nfl_frame

Unnamed: 0,Rank,Team,GP,Won,Lost,Tied,Pct.,First NFL Season,Division,Stadium
0,1,Dallas Cowboys,898,512,380,6,0.573,1960,NFC East,Levi's Stadium
1,2,Chicago Bears,1386,761,583,42,0.564,1920,NFC North,Levi's Stadium
2,3,Green Bay Packers,1352,743,571,38,0.564,1921,NFC North,Levi's Stadium
3,4,New England Patriots[b],900,500,391,9,0.561,1960,AFC East,Levi's Stadium
4,5,Miami Dolphins,816,452,360,4,0.556,1966,AFC East,Levi's Stadium


In [46]:
#Assign an array of numbers to a column, one value per row (0, 1, 2, ...).
#The array must have exactly as many elements as the frame has rows, so its length
#is taken from the frame instead of being hard-coded; a mismatch raises a ValueError.
nfl_frame['Stadium'] = np.arange(len(nfl_frame))

In [47]:
nfl_frame  #the Stadium column now holds one number per row

Unnamed: 0,Rank,Team,GP,Won,Lost,Tied,Pct.,First NFL Season,Division,Stadium
0,1,Dallas Cowboys,898,512,380,6,0.573,1960,NFC East,0
1,2,Chicago Bears,1386,761,583,42,0.564,1920,NFC North,1
2,3,Green Bay Packers,1352,743,571,38,0.564,1921,NFC North,2
3,4,New England Patriots[b],900,500,391,9,0.561,1960,AFC East,3
4,5,Miami Dolphins,816,452,360,4,0.556,1966,AFC East,4


In [50]:
#A Series can be added to a DataFrame as a column.
#Its index names the target rows: label 4 -> Levi's, label 0 -> AT&T.
stadiums = Series(
    data=["Levi's Stadium", "AT&T Stadium"],
    index=[4, 0],
)
stadiums

4    Levi's Stadium
0      AT&T Stadium
dtype: object

In [51]:
#The assignment aligns on index labels: rows 0 and 4 receive their stadium,
#and every row that has no entry in `stadiums` gets a missing value.
nfl_frame['Stadium'] = stadiums
nfl_frame

Unnamed: 0,Rank,Team,GP,Won,Lost,Tied,Pct.,First NFL Season,Division,Stadium
0,1,Dallas Cowboys,898,512,380,6,0.573,1960,NFC East,AT&T Stadium
1,2,Chicago Bears,1386,761,583,42,0.564,1920,NFC North,
2,3,Green Bay Packers,1352,743,571,38,0.564,1921,NFC North,
3,4,New England Patriots[b],900,500,391,9,0.561,1960,AFC East,
4,5,Miami Dolphins,816,452,360,4,0.556,1966,AFC East,Levi's Stadium


In [52]:
#Delete a column with `del`; this modifies nfl_frame itself
del nfl_frame['Stadium']
nfl_frame

Unnamed: 0,Rank,Team,GP,Won,Lost,Tied,Pct.,First NFL Season,Division
0,1,Dallas Cowboys,898,512,380,6,0.573,1960,NFC East
1,2,Chicago Bears,1386,761,583,42,0.564,1920,NFC North
2,3,Green Bay Packers,1352,743,571,38,0.564,1921,NFC North
3,4,New England Patriots[b],900,500,391,9,0.561,1960,AFC East
4,5,Miami Dolphins,816,452,360,4,0.556,1966,AFC East


In [53]:
#A dictionary of equal-length lists, used below to build a DataFrame
data = {
    'City': ['SF', 'LA', 'NYC'],
    'Population': [837000, 3880000, 8400000],
}
data

{'City': ['SF', 'LA', 'NYC'], 'Population': [837000, 3880000, 8400000]}

In [54]:
#A DataFrame can be constructed from a dictionary of lists:
#the keys become the column names and each list becomes that column's values.
city_frame = DataFrame(data)
city_frame

Unnamed: 0,City,Population
0,SF,837000
1,LA,3880000
2,NYC,8400000


In [None]:
#Check out the pandas.DataFrame documentation for more information.

# Index Objects

In [55]:
import numpy as np
from pandas import Series,DataFrame
import pandas as pd

In [56]:
my_ser = Series([1,2,3,4],index=['A','B','C','D']) #set the index explicitly to A, B, C, D
my_ser

A    1
B    2
C    3
D    4
dtype: int64

In [57]:
my_index = my_ser.index  #recall: the .index attribute holds all the labels as an Index object
my_index

Index(['A', 'B', 'C', 'D'], dtype='object')

In [58]:
my_index[2]  #an Index can be indexed by position: the label at position 2

'C'

In [59]:
my_index[2:] #slicing returns a new Index: the labels from position 2 onward (position 2 included)

Index(['C', 'D'], dtype='object')

In [60]:
my_index[0]  #the first label, which the next cell tries (and fails) to overwrite

'A'

In [61]:
#Indexes are immutable: assigning to one of their items raises a TypeError.
#The error is caught and printed so the notebook still runs from top to bottom.
try:
    my_index[0] = 'Z'
except TypeError as err:
    print("TypeError:", err)

#To change the labels, the entire index has to be replaced at once.
#This keeps the labels of a DataFrame or Series stable.

TypeError: Index does not support mutable operations

# Reindex