# Pandas Series and DataFrame Objects

## Pandas Series

Series are similar to NumPy arrays, except that we can give them a named (label-based) or datetime index.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# sample objects (list, numpy array, dictionary) that the following cells convert to a series
labels = ['a','b','c'] # used as index labels below
my_list = [10,20,30]
arr = np.array([10,20,30]) # numpy array with the same values as my_list
d = {'a':10, 'b':20, 'c':100} # dictionary: the keys become the index of the series

In [3]:
# create a series from a list
pd.Series(my_list)

0    10
1    20
2    30
dtype: int64

In [4]:
# create a series from a list with index
pd.Series(my_list,index=labels)

a    10
b    20
c    30
dtype: int64

In [5]:
# create a series from a numpy array
pd.Series(arr,index=labels)

a    10
b    20
c    30
dtype: int64

In [6]:
# create a series from a dictionary
pd.Series(d)

a     10
b     20
c    100
dtype: int64

In [7]:
pd.Series(data=labels) # strings as data

0    a
1    b
2    c
dtype: object

In [8]:
# passing a list of built-in functions to a series
# a series can hold arbitrary Python objects; the dtype is then object
pd.Series([sum,print,len]) # extremely uncommon and rarely used

0      <built-in function sum>
1    <built-in function print>
2      <built-in function len>
dtype: object

In [9]:
# using an index with a pandas series
ser1 = pd.Series([1,2,3,4],index=['USA','CHINA','FRANCE','GERMANY'])

In [10]:
ser1

USA        1
CHINA      2
FRANCE     3
GERMANY    4
dtype: int64

In [11]:
ser2 = pd.Series([1,2,3,4],index=['USA','CHINA','ITALY','JAPAN'])

In [12]:
# selecting by index
ser1['USA']

1

In [13]:
ser2['JAPAN']

4

In [14]:
# trying to select a label that's not in the series raises a KeyError
# the error is caught and printed so the notebook still runs from top to bottom
try:
    ser1['CANADA']
except KeyError as err:
    print('KeyError:', err)

In [15]:
# adding series aligns them on their index labels
# labels found in both series are summed; labels found in only one give a null value (NaN)
# note the result dtype is float64, since NaN is a floating-point value
ser1 + ser2

CHINA      4.0
FRANCE     NaN
GERMANY    NaN
ITALY      NaN
JAPAN      NaN
USA        2.0
dtype: float64

## Pandas DataFrame

In [16]:
# numpy and pandas are already imported
from numpy.random import randn

In [17]:
# setting a seed to make sure we're getting the same random numbers
np.random.seed(101)

In [18]:
# 5x4 dataframe of random values
# index = row labels (A-E), columns = column labels (W-Z)
df = pd.DataFrame(
    data=randn(5, 4),
    index=['A', 'B', 'C', 'D', 'E'],
    columns=['W', 'X', 'Y', 'Z'],
)

In [19]:
df # each column that gets returned is a pandas series

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [20]:
type(df)

pandas.core.frame.DataFrame

In [21]:
# grabbing a column from a data frame
df['W'] # returns series

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [22]:
df.W # also works but not recommended

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [23]:
type(df['W'])

pandas.core.series.Series

In [24]:
# grabbing a list of columns
df[['W','Y']]

Unnamed: 0,W,Y
A,2.70685,0.907969
B,0.651118,-0.848077
C,-2.018168,0.528813
D,0.188695,-0.933237
E,0.190794,2.605967


In [25]:
# creating column by defining it like it already exists
df['new'] = df['W'] + df['Y']

In [26]:
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [27]:
# remove a column
# axis needs to be specified for column
df.drop('new',axis=1) # doesn't occur in place

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [28]:
# inplace=True modifies df itself rather than returning a new dataframe
# (alternative without inplace: df = df.drop('new',axis=1))
df.drop('new',axis=1,inplace=True) # inplace needs to be true

In [29]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [30]:
# df.shape returns a tuple with the zero index as the number of rows
df.shape # rows are zero axis, columns are one axis

(5, 4)

In [31]:
# selecting ROWS in a data frame
# (1) loc (location) method
# passing in the label of the index returns series
df.loc['C'] # returns a series

W   -2.018168
X    0.740122
Y    0.528813
Z   -0.589001
Name: C, dtype: float64

In [32]:
# (2) iloc (integer location) method
# grabbing a row by its 0-based position, regardless of the index label
df.iloc[2] # position 2 is the third row, 'C'; returns a series

W   -2.018168
X    0.740122
Y    0.528813
Z   -0.589001
Name: C, dtype: float64

In [33]:
# selecting subsets of rows and columns
# pass the row label and the column label to loc together:
# a single lookup instead of chaining df.loc['B']['Y']
df.loc['B','Y'] # returns single cell

-0.8480769834036315

In [34]:
# subset
df.loc[['A','B'],['W','Y']]

Unnamed: 0,W,Y
A,2.70685,0.907969
B,0.651118,-0.848077


In [37]:
# conditional selection using bracket notation
booldf = df > 0 # returns a dataframe with boolean values

In [38]:
booldf

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [40]:
# passing boolean df to original df
df[booldf] # df with value for TRUE and NaN for FALSE

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [41]:
# shortcut
df[df>0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [42]:
# comparing a single column to a value gives a boolean series (one True/False per row)
# this mask can be passed to df[...] to keep only the rows where it is True
df['W'] > 0 # returns series

A     True
B     True
C    False
D     True
E     True
Name: W, dtype: bool

In [45]:
# filtering out rows based on a column value
df[df['W']>0] # returns only rows where condition is met

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509
