# Pandas
---
*  Built on top of numpy (so numpy must already be installed and working)
*  Provides fast data analysis, cleaning, and preparation
*  Install using __pip install pandas__
*  Alternatively, install using __conda install pandas__  (note you may need to run __conda uninstall pandas__ first)


# Series
---

In [1]:
import numpy as np

In [2]:
import pandas as pd

In [4]:
#  Sample inputs for the Series examples: a list of labels, a list of
#  values, the same values as a numpy array, and a label -> value dict
labels = ['a', 'b', 'c']
my_data = [10, 20, 30]
arr = np.array(my_data)
d = dict(a=10, b=20, c=30)

In [12]:
#  pass in a list to Series
pd.Series(data = my_data)

0    10
1    20
2    30
dtype: int64

In [15]:
#  Pass in a list and label the indices with the letter list
pd.Series(data = my_data,index=labels)

a    10
b    20
c    30
dtype: int64

In [16]:
#  Note: the "data=" and "index=" keywords are optional when the arguments are passed in that order
pd.Series(my_data,labels)

a    10
b    20
c    30
dtype: int64

In [18]:
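#  A numpy array works the same way as a list; the Series keeps the array's dtype
#  (int32 in the output below -- numpy's default integer size can differ by platform)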
pd.Series(arr)

0    10
1    20
2    30
dtype: int32

In [19]:
#  Pass in a dictionary as the data; its keys become the index labels
pd.Series(d)

a    10
b    20
c    30
dtype: int64

In [21]:
#  A Series can also hold string values (shown with dtype object)
pd.Series(labels)

0    a
1    b
2    c
dtype: object

# Using a Series Index
---

In [27]:
#  Series of integers whose index labels are country names
ser1 = pd.Series(
    data=[1, 2, 3, 4],
    index=['USA', 'Germany', 'USSR', 'Japan'],
)

In [23]:
ser1

USA        1
Germany    2
USSR       3
Japan      4
dtype: int64

In [24]:
#  Same call shape, pd.Series(data, index), with a different set of
#  country labels ('Italy' appears here, 'USSR' does not)
ser2 = pd.Series(
    data=[1, 2, 5, 4],
    index=['USA', 'Germany', 'Italy', 'Japan'],
)

In [25]:
ser2

USA        1
Germany    2
Italy      5
Japan      4
dtype: int64

In [28]:
#  Look up a value by its index label (here, a country name)
ser1['USA']

1

In [29]:
ser3 = pd.Series(labels)

In [30]:
ser3

0    a
1    b
2    c
dtype: object

In [31]:
#  Look up by index label; with no index given, the labels are the integers 0, 1, 2
ser3[0]

'a'

In [32]:
#  Series addition aligns on index labels; a label missing from either Series
#  gives NaN, which is why the result is float64
ser1 + ser2

Germany    4.0
Italy      NaN
Japan      8.0
USA        2.0
USSR       NaN
dtype: float64

# Dataframes
---

In [33]:
import numpy as np
import pandas as pd
from numpy.random import randn

In [34]:
#  Seed numpy's random number generator so the random values below are reproducible
np.random.seed(101)

In [35]:
#  pd.DataFrame(data, index, columns): a 5x4 table of random normal
#  values with row labels A-E and column labels W-Z
df = pd.DataFrame(
    data=randn(5, 4),
    index=['A', 'B', 'C', 'D', 'E'],
    columns=['W', 'X', 'Y', 'Z'],
)

In [36]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [37]:
#  Selecting a single column returns a Series
df['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [40]:
#  Confirm that the selected column is a Series
type(df['W'])

pandas.core.series.Series

In [41]:
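#  Attribute access also returns the W column, but it is not recommended:
#  it fails for column names that clash with DataFrame attributes/methods
#  or that are not valid Python identifiers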
df.W

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [45]:
#  Select multiple columns by passing in a list of column names; the result is a DataFrame
df[['W','Z']]

Unnamed: 0,W,Z
A,2.70685,0.503826
B,0.651118,0.605965
C,-2.018168,-0.589001
D,0.188695,0.955057
E,0.190794,0.683509


In [47]:
#  Create a new column as the sum of two existing columns
df['new'] = df['W'] + df['Y']

In [48]:
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [50]:
#  Remove a column: axis=1 refers to columns (axis=0 refers to rows).
#  drop returns a new DataFrame rather than modifying df
df.drop('new',axis=1)

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [51]:
#  df itself remains unchanged, because drop returned a new DataFrame
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [52]:
#  Remove the column from df itself by passing inplace=True
df.drop('new',axis=1,inplace=True)

In [54]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [57]:
# delete a row (not in place)
df.drop('E',axis=0)

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057


In [56]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [58]:
#  tuple (row,column) dimension
df.shape

(5, 4)

In [60]:
#  Select a row by its index label with .loc; the row is returned as a Series
df.loc['A']

W    2.706850
X    0.628133
Y    0.907969
Z    0.503826
Name: A, dtype: float64

In [61]:
#  Select a row by its integer position with .iloc (positions start at 0)
df.iloc[2]

W   -2.018168
X    0.740122
Y    0.528813
Z   -0.589001
Name: C, dtype: float64

In [62]:
df.loc['C']==df.iloc[2]

W    True
X    True
Y    True
Z    True
Name: C, dtype: bool

In [63]:
#  Select a single value with df.loc[row_label, column_label]
df.loc['B','Y']

-0.84807698340363147

In [67]:
#  subset the data frame by using row and column input lists 
#df.loc[row_list,column_list]
df.loc[['A','B'],['W','Y']]

Unnamed: 0,W,Y
A,2.70685,0.907969
B,0.651118,-0.848077
