Pandas is an open source library built on top of NumPy


# Series

In [7]:
# Series are very similar to a NumPy array.
# In fact, a Series is built on top of a NumPy array object.
# What differentiates a NumPy array from a Pandas Series is that a Series can have axis labels.

In [8]:
import numpy as np

In [9]:
import pandas as pd

In [10]:
# Plain Python containers used to build the Series examples below.
labels = ["a", "b", "c"]  # index labels (a list)
my_data = [10, 20, 30]  # values (a list)

# The same values wrapped in a NumPy array.
arr = np.array(my_data)

# A label -> value mapping (a dictionary).
d = {"a": 10, "b": 20, "c": 30}

In [11]:
pd.Series(data = my_data)

0    10
1    20
2    30
dtype: int64

In [12]:
pd.Series(data = my_data, index = labels)

a    10
b    20
c    30
dtype: int64

In [13]:
# We can also pass the data and the index positionally, without the keyword names
pd.Series(my_data, labels)

a    10
b    20
c    30
dtype: int64

In [14]:
# We can just pass in a NumPy array
pd.Series(arr)

0    10
1    20
2    30
dtype: int32

In [15]:
# We can also pass in a dictionary to a Pandas Series
# Note that it takes the keys of the dictionary and uses them as the index labels
pd.Series(d)

a    10
b    20
c    30
dtype: int64

In [16]:
# A Series can hold almost any type of Python object as its data points.
# Interestingly, it can even hold built-in functions.
pd.Series(data = [sum, print, len])

0      <built-in function sum>
1    <built-in function print>
2      <built-in function len>
dtype: object

In [17]:
# Keyword arguments make it explicit which list holds the values and which the labels.
ser1 = pd.Series(data=[1, 2, 3, 4], index=['USA', 'Germany', 'USSR', 'Japan'])

In [18]:
ser1

USA        1
Germany    2
USSR       3
Japan      4
dtype: int64

In [19]:
# Same layout as ser1, but with 'Italy' in place of 'USSR' so the two indexes only partly overlap.
ser2 = pd.Series(data=[1, 2, 5, 4], index=['USA', 'Germany', 'Italy', 'Japan'])

In [20]:
ser2

USA        1
Germany    2
Italy      5
Japan      4
dtype: int64

In [21]:
# Note that above in the series ser1 and ser2, the data type is int64 as it refers to data being integer.

In [22]:
ser1['USA']

1

In [23]:
ser3 = pd.Series(data = labels)

In [24]:
ser3

0    a
1    b
2    c
dtype: object

In [25]:
# Note that above in the ser3 series, the data type is object as it refers to data being string.

In [26]:
ser3[0]

'a'

In [27]:
# We can add two Pandas Series. The values are aligned by index label: labels present in both
# Series are added together, and labels present in only one of them get NaN. See the example below.

In [28]:
ser1

USA        1
Germany    2
USSR       3
Japan      4
dtype: int64

In [29]:
ser2

USA        1
Germany    2
Italy      5
Japan      4
dtype: int64

In [30]:
ser1 + ser2

Germany    4.0
Italy      NaN
Japan      8.0
USA        2.0
USSR       NaN
dtype: float64

In [31]:
# Italy and USSR each appear in only one of the two Series, so they are shown as NaN.

In [32]:
# Something to note here is that the result above has dtype float64 even though
# both inputs were int64. This is because NaN is a floating-point value and an
# int64 Series cannot hold it, so Pandas converts the result to floats. When
# every label matches (no NaN is produced), adding integer Series keeps int64.

# DataFrames

DataFrames are going to be our main tool when working with Pandas.

In [33]:
import numpy as np
import pandas as pd

In [34]:
from numpy.random import randn

In [35]:
np.random.seed(101)

In [36]:
# 5x4 frame of random normal samples: rows labelled A-E, columns labelled W-Z.
df = pd.DataFrame(
    data=randn(5, 4),
    index=list('ABCDE'),
    columns=list('WXYZ'),
)

In [37]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [38]:
# Each of the columns above is actually a Pandas Series. The columns W, X, Y and Z are all Series objects.

In [39]:
df['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [40]:
# Notice that now it actually looks like a Pandas series.
# We can even check it by using
type(df['W'])

pandas.core.series.Series

In [41]:
type(df)

pandas.core.frame.DataFrame

In [42]:
# There are two different ways that we can grab a column from a DataFrame. 
# The main way, and the way we should always do it, is using the square bracket notation
# and passing the column name.

# However, if we are familiar with SQL, where a column is often selected by writing
# the table name, then a dot, then the column name, the same notation works
# on a DataFrame too. See the example below.

In [43]:
df.W

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [44]:
# Since it can be confused with the attributes and methods of a DataFrame object,
# this latter (dot) notation is not recommended.

In [45]:
# If we want to get multiple columns using the square bracket notation,
# we could simply give a list of column names that we want to get.
df[['X', 'Y']]

Unnamed: 0,X,Y
A,0.628133,0.907969
B,-0.319318,-0.848077
C,0.740122,0.528813
D,-0.758872,-0.933237
E,1.978757,2.605967


In [46]:
# Note that when we grab multiple columns, it returns a DataFrame object.
# Not a Pandas Series!
type(df[['X', 'Y']])

pandas.core.frame.DataFrame

In [47]:
# Creating new columns
df['new'] = df['W'] + df['Y']

In [48]:
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [50]:
# For removing columns
df.drop('new')

KeyError: "['new'] not found in axis"

In [51]:
# This is a really important error.
# See that the default value for axis in a DataFrame object is 0.
# This means that it tries to drop ROWS not columns.
# And since there isn't a row with the name 'new', we get an error.
# To specify that we want to drop a column, we need to give the 
# axis parameter the value of 1!
df.drop('new', axis=1)

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [52]:
# If we want to drop a row (here, row C), we can just go with
# the default value of the axis, which is 0.
# So we don't need to specify anything.
df.drop('C')

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [53]:
# An important thing to note is that we haven't actually changed
# the DataFrame object df yet. Check:
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [54]:
# To change it
df = df.drop('new', axis=1)

In [55]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [56]:
# Alternatively, we can ask the drop method itself to modify df by using its inplace parameter

In [57]:
# Adding the 'new' column again
df['new'] = df['W'] + df['Y']

In [58]:
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [59]:
# If we want to delete the 'new' column from our DataFrame for good,
# we need to pass inplace=True.

df.drop('new', axis=1, inplace=True)

In [60]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [61]:
# The reason why we need to specify axis=1 whenever we want to drop a column
# and that we don't need to specify axis=0 whenever we want to drop a row
# is actually coming from the mathematical matrix notation.

# When we say a matrix is 5 by 4, or 5x4, or [5,4] it means that the matrix
# has 5 rows and 4 columns. And now considering the notation [5,4] as a python
# list, the first number has the index of 0 and the second number has the 
# index of 1. Hence, the notation...

In [62]:
# Selecting ROWS

In [63]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [64]:
# There are 2 ways to grab a row. One is label based and the other is
# integer-position based.

# Note that .loc and .iloc are indexers rather than ordinary methods, so
# we use square brackets [] with them instead of parentheses (). Be aware!

In [65]:
# Label based method.
df.loc['D']

W    0.188695
X   -0.758872
Y   -0.933237
Z    0.955057
Name: D, dtype: float64

In [66]:
# Note that not only the columns but also the rows too are series in a DataFrame.

type(df.loc['D'])

pandas.core.series.Series

In [67]:
# Integer-position based method.
df.iloc[2] # The row C

W   -2.018168
X    0.740122
Y    0.528813
Z   -0.589001
Name: C, dtype: float64

In [68]:
# Selecting rows and columns.

df.loc['B','Y']

-0.8480769834036315

In [69]:
# Check
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [70]:
df.loc[['A','B'],['W','Y']]

Unnamed: 0,W,Y
A,2.70685,0.907969
B,0.651118,-0.848077


# Conditional Selections

In [71]:
df > 0

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [72]:
booldf = df > 0

In [73]:
booldf

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [74]:
df[booldf]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [75]:
# The usual notation is to skip the first two steps and write it like this
df[df > 0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [76]:
df['W'] > 0

A     True
B     True
C    False
D     True
E     True
Name: W, dtype: bool

In [77]:
# We can now use the series above to filter out rows
# based off of columns values.

df[df['W'] > 0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [78]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [79]:
df[df['X'] < 0]

Unnamed: 0,W,X,Y,Z
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057


In [80]:
# Consider this case:
# When we want to take out the rows in which the column X
# has negative values, and then work with the remaining DataFrame
# after that operation is done, we could do the following.

newdf = df[df['X'] > 0]

In [81]:
# And now we can work with the remaining DataFrame
newdf

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
C,-2.018168,0.740122,0.528813,-0.589001
E,0.190794,1.978757,2.605967,0.683509


In [82]:
# Such as
newdf['X']

A    0.628133
C    0.740122
E    1.978757
Name: X, dtype: float64

In [83]:
# However, we don't actually need all these middle steps.
# Filtering with a condition still gives us a DataFrame, so we could
# chain another selection onto it, like df[df['X'] > 0]['X'].
# The recommended way is to do both selections in a single .loc call
# (row condition first, then the column label). It returns the same
# Series and avoids chained indexing.

df.loc[df['X'] > 0, 'X']

A    0.628133
C    0.740122
E    1.978757
Name: X, dtype: float64

# Using more than one conditional statements

In [84]:
df[(df['W']>0) and (df['Y']>1)]  # Error

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [85]:
# We see the error message of "The truth value of a Series is ambiguous."
# What that is saying is that Python's normal 'and' operator cannot
# actually take into account a Series of boolean values compared to
# another Series of boolean values.

# An 'and' operator can only take into account single booleans at a time.

In [86]:
# To be able to use more than one conditional statements, we use the 
# notation '&' instead of writing 'and'.
df[(df['W']>0) & (df['Y']>1)]

Unnamed: 0,W,X,Y,Z
E,0.190794,1.978757,2.605967,0.683509


In [87]:
# So when we are combining Series of boolean values element-wise,
# we cannot use Python's normal 'and' operator.
# We have to use the ampersand operator: & (and likewise | instead of 'or')

# Resetting the index or setting it to something else

In [88]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [89]:
df.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,A,2.70685,0.628133,0.907969,0.503826
1,B,0.651118,-0.319318,-0.848077,0.605965
2,C,-2.018168,0.740122,0.528813,-0.589001
3,D,0.188695,-0.758872,-0.933237,0.955057
4,E,0.190794,1.978757,2.605967,0.683509


In [90]:
# Note that this method doesn't occur in place unless we specify it
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [91]:
# New labels, one for each of the five rows of df.
newind = ['IST', 'ANK', 'IZM', 'MUN', 'BER']

In [92]:
newind

['IST', 'ANK', 'IZM', 'MUN', 'BER']

In [93]:
df['States'] = newind

In [94]:
df

Unnamed: 0,W,X,Y,Z,States
A,2.70685,0.628133,0.907969,0.503826,IST
B,0.651118,-0.319318,-0.848077,0.605965,ANK
C,-2.018168,0.740122,0.528813,-0.589001,IZM
D,0.188695,-0.758872,-0.933237,0.955057,MUN
E,0.190794,1.978757,2.605967,0.683509,BER


In [95]:
# If we have a column in our DataFrame that we want to use as the index, we can use set_index
df.set_index('States')

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
IST,2.70685,0.628133,0.907969,0.503826
ANK,0.651118,-0.319318,-0.848077,0.605965
IZM,-2.018168,0.740122,0.528813,-0.589001
MUN,0.188695,-0.758872,-0.933237,0.955057
BER,0.190794,1.978757,2.605967,0.683509


In [96]:
df

Unnamed: 0,W,X,Y,Z,States
A,2.70685,0.628133,0.907969,0.503826,IST
B,0.651118,-0.319318,-0.848077,0.605965,ANK
C,-2.018168,0.740122,0.528813,-0.589001,IZM
D,0.188695,-0.758872,-0.933237,0.955057,MUN
E,0.190794,1.978757,2.605967,0.683509,BER


# Multi-index and Index Hierarchy

In [97]:
import numpy as np
import pandas as pd

In [98]:
# Outer level (group label) and inner level (number within each group).
outside = ['G1'] * 3 + ['G2'] * 3
inside = [1, 2, 3] * 2
# Pair the two levels element-wise and build the MultiIndex from those tuples.
hier_index = pd.MultiIndex.from_tuples(list(zip(outside, inside)))

In [99]:
# To understand what is going on
list(zip(outside,inside))

[('G1', 1), ('G1', 2), ('G1', 3), ('G2', 1), ('G2', 2), ('G2', 3)]

In [100]:
# The following example shows what multi-level indexing is for:
# a 6x2 frame whose rows are addressed by the two-level hier_index.
df = pd.DataFrame(data=randn(6, 2), index=hier_index, columns=['A', 'B'])

In [101]:
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,0.302665,1.693723
G1,2,-1.706086,-1.159119
G1,3,-0.134841,0.390528
G2,1,0.166905,0.184502
G2,2,0.807706,0.07296
G2,3,0.638787,0.329646


In [102]:
# To call a data from a multi-layered DataFrame
df.loc['G1']

Unnamed: 0,A,B
1,0.302665,1.693723
2,-1.706086,-1.159119
3,-0.134841,0.390528


In [103]:
df.loc['G1'].loc[1]

A    0.302665
B    1.693723
Name: 1, dtype: float64

In [104]:
# Notice that the two index levels (the G1/G2 level and the 1/2/3 level) don't have names.
df.index.names

FrozenList([None, None])

In [105]:
# To add names
df.index.names = ['Groups', 'Num']

In [106]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,0.302665,1.693723
G1,2,-1.706086,-1.159119
G1,3,-0.134841,0.390528
G2,1,0.166905,0.184502
G2,2,0.807706,0.07296
G2,3,0.638787,0.329646


In [107]:
df.loc['G2'].loc[2]

A    0.807706
B    0.072960
Name: 2, dtype: float64

In [108]:
# Select the single value in one .loc call: the row is addressed by the
# (Groups, Num) tuple and the column by its label, instead of chaining
# three separate lookups.
df.loc[('G2', 2), 'B']

0.07295967531703869

## Cross-section

In [109]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,0.302665,1.693723
G1,2,-1.706086,-1.159119
G1,3,-0.134841,0.390528
G2,1,0.166905,0.184502
G2,2,0.807706,0.07296
G2,3,0.638787,0.329646


In [110]:
# When we want to grab everything under G1, we can
# easily do this by using the .loc indexer and
# specifying 'G1'.

# But imagine that we want the rows where the Num
# level has the value of 1. This is really tricky to
# do using .loc.

# Luckily, for multi-level indexing we have
# a method called xs() (cross-section).

# We can use this xs() method just as we used .loc to
# get the values under G1.

df.xs('G1')

Unnamed: 0_level_0,A,B
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.302665,1.693723
2,-1.706086,-1.159119
3,-0.134841,0.390528


In [111]:
# But where this xs() function becomes handy is the following use:

df.xs(1, level='Num')

Unnamed: 0_level_0,A,B
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,0.302665,1.693723
G2,0.166905,0.184502


# Missing Data in Pandas

In [112]:
import numpy as np
import pandas as pd

In [113]:
# Column name -> values; np.nan marks the missing entries.
d = {
    'A': [1, 2, np.nan],
    'B': [5, np.nan, np.nan],
    'C': [1, 2, 3],
}

In [114]:
df = pd.DataFrame(d)

In [115]:
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [117]:
# A lot of times, we will just want to drop the missing values
# from our DataFrame objects, especially if it's just a few values.
df.dropna()

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [118]:
# What happens above is that the method dropna() just drops every row
# that contains a NaN value.

In [119]:
# If we wanted to do the same operations on columns, we could do:
df.dropna(axis=1)

Unnamed: 0,C
0,1
1,2
2,3


In [120]:
# We can also specify a threshold
df.dropna(thresh=2)

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2


In [121]:
# Above, we specified the dropna() method so that it requires
# at least 2 non-NaN values in a row to be able to keep it.

# It just drops the rows that don't satisfy this condition.

In [122]:
import numpy as np

In [123]:
# NOTE(review): np.loadtxt parses every field as float by default, and the
# error recorded for this cell shows the file contains the text 'asd'.
# Presumably the file needs dtype=str (or numeric-only contents) — confirm intent.
features = np.loadtxt('asd.txt', delimiter=',')

ValueError: could not convert string to float: 'asd'