In [None]:
# Preamble: lets the notebook run under both native Jupyter and Google Colab.
# It defines two path prefixes used by later cells:
#   notes_home -- prefix for the course notes folder (e.g. the assets/ data files)
#   user_home  -- prefix for the user's directory
try:
    # Only the import is guarded: its failure is what tells us we are not on Colab.
    from google.colab import drive
except ModuleNotFoundError:
    # Native Jupyter: both locations are assumed to be the notebook's own directory.
    notes_home = ""
    user_home = ""
else:
    # Colab: mount Google Drive and point at the shared course folder.
    # This runs outside the try so that a genuine failure during setup is
    # reported instead of being mistaken for "not running on Colab".
    import sys

    drive.mount('/content/drive')
    notes_home = "/content/drive/Shared drives/CSC310/ds/notes/"
    user_home = "/content/drive/My Drive/"

    sys.path.insert(1, notes_home)  # let the notebook access the notes folder

# Data Manipulation with Pandas

Pandas supports 1-D (Series) and 2-D (DataFrame) data structures; the 3-D Panel structure found in older versions was deprecated in pandas 0.20 and removed in 0.25.  Here we cover DataFrames because they most closely resemble the data tables that data scientists typically work with.

The advantage of Pandas is that it stores the data together with its *metadata*.

The most commonly used metadata in Pandas are the **column names** and the **index**.


In [None]:
import pandas
import numpy # for random number generation

In [None]:
# Load the mammals table from the notes folder (notes_home is set in the preamble cell).
df = pandas.read_csv(notes_home+"assets/mammals.csv")

In [None]:
df

Unnamed: 0,Legs,Wings,Fur,Feathers,Mammal
0,4,no,yes,no,True
1,2,yes,no,yes,False
2,4,no,no,no,False
3,4,yes,yes,no,True
4,3,no,no,no,False


# DataFrame Parts

A dataframe is composed of different parts that work together to give a coherent view of the data:

In [None]:
df.columns

Index(['Legs', 'Wings', 'Fur', 'Feathers', 'Mammal'], dtype='object')

In [None]:
df.index

RangeIndex(start=0, stop=5, step=1)

In [None]:
df.values

array([[4, 'no', 'yes', 'no', True],
       [2, 'yes', 'no', 'yes', False],
       [4, 'no', 'no', 'no', False],
       [4, 'yes', 'yes', 'no', True],
       [3, 'no', 'no', 'no', False]], dtype=object)

We can change the parts of the data.  For example, we can create a new index for our dataframe:

In [None]:
df.index = ['Dog', 'Duck', 'Frog', 'Bat', 'Bar Stool']

In [None]:
df

Unnamed: 0,Legs,Wings,Fur,Feathers,Mammal
Dog,4,no,yes,no,True
Duck,2,yes,no,yes,False
Frog,4,no,no,no,False
Bat,4,yes,yes,no,True
Bar Stool,3,no,no,no,False


# Indexing and Slicing

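For array-style indexing Pandas uses the **loc** and **iloc** indexers.  (Older versions also provided an **ix** indexer, but it was deprecated in pandas 0.20 and removed in pandas 1.0.)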

Using the **iloc** indexer, we can index the underlying array as if it is a simple array using row and column integer values (hence the i in iloc). The DataFrame index and column labels are maintained in the result:

In [None]:
df

Unnamed: 0,Legs,Wings,Fur,Feathers,Mammal
Dog,4,no,yes,no,True
Duck,2,yes,no,yes,False
Frog,4,no,no,no,False
Bat,4,yes,yes,no,True
Bar Stool,3,no,no,no,False


In [None]:
df.iloc[:2,1:4]

Unnamed: 0,Wings,Fur,Feathers
Dog,no,yes,no
Duck,yes,no,yes


Using the **loc** indexer we can index the underlying data in an array-like style but using the explicit index and column names:

In [None]:
df.loc[:'Duck','Wings':'Feathers']

Unnamed: 0,Wings,Fur,Feathers
Dog,no,yes,no
Duck,yes,no,yes


Notice that when slicing with explicit labels (e.g., `df.loc['Dog':'Duck']`), the final label is included in the slice, while when slicing with implicit integer positions (e.g., `df.iloc[0:2]`), the final position is excluded from the slice.

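The older **ix** indexer allowed mixing integer and explicit (label) indexing, but it was deprecated in pandas 0.20 and removed in pandas 1.0.  To mix the two styles today, convert one into the other, e.g. `df.loc[df.index[:2], 'Wings']` or `df.iloc[:2, df.columns.get_loc('Wings')]`.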

# Data Access Patterns

We can use relational and boolean expressions when selecting data from a dataframe.

To see how, we first need one more simple way of selecting dataframe columns:

In [None]:
df[['Wings', 'Mammal']] # using a list of column names to access columns

Unnamed: 0,Wings,Mammal
Dog,no,True
Duck,yes,False
Frog,no,False
Bat,yes,True
Bar Stool,no,False


Relational Operators:

In [None]:
df[df.Wings == 'yes'] # accessing rows for which the equality holds

Unnamed: 0,Legs,Wings,Fur,Feathers,Mammal
Duck,2,yes,no,yes,False
Bat,4,yes,yes,no,True


In [None]:
df[df.Wings == 'yes'].Mammal # accessing attribute values for rows for which the equality holds

Duck    False
Bat      True
Name: Mammal, dtype: bool

In [None]:
df[(df.Wings == 'yes') & (df.Fur == 'yes')] # boolean operations

Unnamed: 0,Legs,Wings,Fur,Feathers,Mammal
Bat,4,yes,yes,no,True


# Filtering

Boolean indexing using a Boolean column:

In [None]:
df.Mammal

Dog           True
Duck         False
Frog         False
Bat           True
Bar Stool    False
Name: Mammal, dtype: bool

In [None]:
df.Mammal.dtype

dtype('bool')

In [None]:
df[df.Mammal == True] # or we could have just said df[df.Mammal]

Unnamed: 0,Legs,Wings,Fur,Feathers,Mammal
Dog,4,no,yes,no,True
Bat,4,yes,yes,no,True


Filtering using **isin()**

In [None]:
df.index.isin(['Dog','Bat'])

array([ True, False, False,  True, False])

Any Boolean vector can be used as a filter.

In [None]:
v = df.index.isin(['Dog','Bat'])
df[v]

Unnamed: 0,Legs,Wings,Fur,Feathers,Mammal
Dog,4,no,yes,no,True
Bat,4,yes,yes,no,True


# Combining DataFrames

Using concat() along axis 1 (columns)
 

In [None]:
df1 = df.iloc[:,:2]
df1

Unnamed: 0,Legs,Wings
Dog,4,no
Duck,2,yes
Frog,4,no
Bat,4,yes
Bar Stool,3,no


In [None]:
df2 = df.iloc[:,2:]
df2

Unnamed: 0,Fur,Feathers,Mammal
Dog,yes,no,True
Duck,no,yes,False
Frog,no,no,False
Bat,yes,no,True
Bar Stool,no,no,False


In [None]:
pandas.concat([df1,df2],axis=1)

Unnamed: 0,Legs,Wings,Fur,Feathers,Mammal
Dog,4,no,yes,no,True
Duck,2,yes,no,yes,False
Frog,4,no,no,no,False
Bat,4,yes,yes,no,True
Bar Stool,3,no,no,no,False


**Note** The two dataframes have to agree on the index!

Creating a new index on df2:

In [None]:
# Give df2 a fresh 0..n-1 integer index, discarding the animal-name labels.
# The name is rebound instead of using inplace=True: df2 was sliced out of df
# with iloc, and rebinding avoids mutating that derived object in place.
df2 = df2.reset_index(drop=True)
df2

Unnamed: 0,Fur,Feathers,Mammal
0,yes,no,True
1,no,yes,False
2,no,no,False
3,yes,no,True
4,no,no,False


In [None]:
pandas.concat([df1,df2],axis=1)

Unnamed: 0,Legs,Wings,Fur,Feathers,Mammal
0,,,yes,no,True
1,,,no,yes,False
2,,,no,no,False
3,,,yes,no,True
4,,,no,no,False
Bar Stool,3.0,no,,,
Bat,4.0,yes,,,
Dog,4.0,no,,,
Duck,2.0,yes,,,
Frog,4.0,no,,,


Using concat() along axis 0 (rows)


In [None]:
pieces = [df.iloc[:2,:],df.iloc[2:,:]]
pieces

[      Legs Wings  Fur Feathers  Mammal
 Dog      4    no  yes       no    True
 Duck     2   yes   no      yes   False,
            Legs Wings  Fur Feathers  Mammal
 Frog          4    no   no       no   False
 Bat           4   yes  yes       no    True
 Bar Stool     3    no   no       no   False]

In [None]:
pandas.concat(pieces,axis=0)

Unnamed: 0,Legs,Wings,Fur,Feathers,Mammal
Dog,4,no,yes,no,True
Duck,2,yes,no,yes,False
Frog,4,no,no,no,False
Bat,4,yes,yes,no,True
Bar Stool,3,no,no,no,False


**Note** dataframes have to agree on column names!

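The **assign** method works in a similar spirit for adding new columns to a dataframe.  (The older **append** method, which added rows, was deprecated in pandas 1.4 and removed in pandas 2.0 -- use `pandas.concat` instead.)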

# Missing or Duplicated Data
* Pandas flags missing values with NaN (not a number).
* In most cases, any computations applied to a dataframe with NaNs will ignore the NaNs
* However, it is still a good idea to clean up the dataframe
* In general we have two options to deal with missing data:
 * Either drop the rows or columns that have NaNs
 * Or try to substitute a reasonable value for the NaN
 

Generate a random dataset (the NaNs are introduced in the next step)

In [None]:
# Build a 4x3 frame of standard-normal draws.  The index deliberately skips 'b'
# so that the later reindex can introduce a row of NaNs.
# A seeded generator makes these values (and everything derived from them)
# identical on every run.
# NOTE: the saved cell outputs were produced without a seed; re-run the
#       notebook to refresh them.
rng = numpy.random.default_rng(310)
df = pandas.DataFrame(rng.standard_normal((4, 3)), index=['a', 'c', 'd', 'e'],
                  columns=['one', 'two', 'three'])
df

Unnamed: 0,one,two,three
a,0.631913,0.679473,-0.50408
c,-0.558343,-0.705497,0.968311
d,0.695184,0.426475,-0.516462
e,0.934925,0.377162,0.60551


Generating NaNs

In [None]:
df2 = df.reindex(['a', 'b', 'c', 'd', 'e'])
df2

Unnamed: 0,one,two,three
a,0.631913,0.679473,-0.50408
b,,,
c,-0.558343,-0.705497,0.968311
d,0.695184,0.426475,-0.516462
e,0.934925,0.377162,0.60551


In [None]:
# find the places where the NaNs are
df2.isnull()

Unnamed: 0,one,two,three
a,False,False,False
b,True,True,True
c,False,False,False
d,False,False,False
e,False,False,False


In [None]:
# look at the values of the isnull dataframe
df2.isnull().values

array([[False, False, False],
       [ True,  True,  True],
       [False, False, False],
       [False, False, False],
       [False, False, False]])

In [None]:
# find out how many values are missing
# NOTE: sum treats 'True' as 1 and 'False' as 0 
df2.isnull().values.sum()

3

In [None]:
# drop rows that have NaNs
df2.dropna(how='any',axis=0)

Unnamed: 0,one,two,three
a,0.631913,0.679473,-0.50408
c,-0.558343,-0.705497,0.968311
d,0.695184,0.426475,-0.516462
e,0.934925,0.377162,0.60551


In [None]:
# dropping columns that have NaNs
# NOTE: this is NOT always a good idea -- empty dataframe!
df2.dropna(how='any',axis=1)

a
b
c
d
e


# Replacing Missing Data

We can also try to estimate the missing data - **impute** it.

We replace the missing values with the mean of the corresponding column.

In [None]:
df2

Unnamed: 0,one,two,three
a,0.631913,0.679473,-0.50408
b,,,
c,-0.558343,-0.705497,0.968311
d,0.695184,0.426475,-0.516462
e,0.934925,0.377162,0.60551


In [None]:
# compute the mean of each column
df2.mean()

one      0.425920
two      0.194404
three    0.138320
dtype: float64

In [None]:
# Impute: replace every NaN with the mean of its own column.
# df2.mean() skips NaNs and yields one value per column; fillna matches those
# values to the columns by label.  One DataFrame-level call replaces the
# per-column `df2[c].fillna(..., inplace=True)` loop, which operates on a
# temporary column object (chained assignment) and is therefore not guaranteed
# to update df2 itself.
df2 = df2.fillna(df2.mean())

df2

Unnamed: 0,one,two,three
a,0.631913,0.679473,-0.50408
b,0.42592,0.194404,0.13832
c,-0.558343,-0.705497,0.968311
d,0.695184,0.426475,-0.516462
e,0.934925,0.377162,0.60551


# Broadcasting

Binary arithmetic operators are applied element by element to dataframes assuming equal sized dataframes.

Broadcasting refers to the fact that Pandas (like NumPy, on which it is built) will reuse the elements of the smaller operand, or reuse a scalar, in order to complete the binary operation.


In [None]:
df = pandas.DataFrame([[1,2],[3,4]])
df

Unnamed: 0,0,1
0,1,2
1,3,4


In [None]:
# element by element operation
df + df

Unnamed: 0,0,1
0,2,4
1,6,8


In [None]:
# broadcasting the smaller vector
# NOTE: each element of the vector is applied to 
#       a column in the dataframe
df + [10, 20]

Unnamed: 0,0,1
0,11,22
1,13,24


In [None]:
# broadcasting a scalar
# NOTE: the scalar is applied to ALL elements
#       of the dataframe
df + 10

Unnamed: 0,0,1
0,11,12
1,13,14


In [None]:
# we can now say things like this
df + df == 2*df

Unnamed: 0,0,1
0,True,True
1,True,True


# Duplicate Data

Identify and remove duplicate rows in a DataFrame
 

In [None]:
# make a dataframe with duplicate rows 'b' and 'e'
df2.iloc[4,:] = df2.iloc[1,:]
df2

Unnamed: 0,one,two,three
a,0.631913,0.679473,-0.50408
b,0.42592,0.194404,0.13832
c,-0.558343,-0.705497,0.968311
d,0.695184,0.426475,-0.516462
e,0.42592,0.194404,0.13832


In [None]:
# check if there is duplication
df2.duplicated()

a    False
b    False
c    False
d    False
e     True
dtype: bool

In [None]:
# drop e!
df2.drop_duplicates()

Unnamed: 0,one,two,three
a,0.631913,0.679473,-0.50408
b,0.42592,0.194404,0.13832
c,-0.558343,-0.705497,0.968311
d,0.695184,0.426475,-0.516462


By default **duplicated()** and **drop_duplicates()** keep the first occurrence and identify all later recurring instances as duplicates.

# Reading

* 2.2 [The Basics of NumPy Arrays](https://jakevdp.github.io/PythonDataScienceHandbook/02.02-the-basics-of-numpy-arrays.html)
* 3.2 [Data Indexing and Selection](https://jakevdp.github.io/PythonDataScienceHandbook/03.02-data-indexing-and-selection.html)
* 3.3 [Operating on Data in Pandas](https://jakevdp.github.io/PythonDataScienceHandbook/03.03-operations-in-pandas.html)
* 3.4 [Handling Missing Data](https://jakevdp.github.io/PythonDataScienceHandbook/03.04-missing-values.html)
* 3.6 [Combining Datasets: Concat and Append](https://jakevdp.github.io/PythonDataScienceHandbook/03.06-concat-and-append.html)
* 3.7 [Combining Datasets: Merge and Join](https://jakevdp.github.io/PythonDataScienceHandbook/03.07-merge-and-join.html)
* 3.8 [Aggregation and Grouping](https://jakevdp.github.io/PythonDataScienceHandbook/03.08-aggregation-and-grouping.html)