# Day 3. Introduction to pandas

### Importing modules

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

In [2]:
# IPython Notebook option to show plots in the notebook (not in a separate window)
%matplotlib inline

### Series creation

A Series is a 1-dimensional data structure that holds values of a single data type. It can easily be created from a NumPy array.

In [3]:
# Seed NumPy's global generator right before drawing, so that re-running this cell
# (or "Restart Kernel -> Run All") always produces the same ten values.
np.random.seed(0)  # arbitrary fixed seed; any integer works
arr = np.random.random(10)  # 10 floats drawn uniformly from [0, 1)
series = pd.Series(arr)  # gets the default integer index 0..9
series

0    0.746088
1    0.751734
2    0.760356
3    0.441144
4    0.924170
5    0.378975
6    0.655647
7    0.636961
8    0.506015
9    0.081832
dtype: float64

### Series slicing and indexing

In [4]:
# Selecting a particular element: plain [] on this Series looks up the index LABEL 0
# (with the default integer index the label coincides with the position)
series[0]

0.746087720547896

In [5]:
# You can also use iloc (position-based) and loc (label-based) to access its elements
series.iloc[0]

0.746087720547896

In [6]:
# You can access multiple elements using slicing; with iloc the end position (4) is excluded
series.iloc[0:4]

0    0.746088
1    0.751734
2    0.760356
3    0.441144
dtype: float64

In [7]:
# We can look at the beginning of the Series; head() shows 5 elements by default
print(series.head())

0    0.746088
1    0.751734
2    0.760356
3    0.441144
4    0.924170
dtype: float64


In [8]:
# And at the end of it; tail() also defaults to 5 elements, here we ask for the last 3
print(series.tail(3))

7    0.636961
8    0.506015
9    0.081832
dtype: float64


### Basic mathematical operations on Series

You can easily perform operations such as addition, subtraction or multiplication on pandas Series.

In [9]:
# Two integer Series of different lengths (5 and 3 elements), both with the
# default integer index, used below to demonstrate arithmetic between Series.
ser_a = pd.Series([15, 20, 33, 17, 4])
ser_b = pd.Series([15, 5, 7])

NumPy is the underlying data structure of a pandas Series (and, in turn, of a DataFrame), actually:

In [10]:
# .values exposes the data held by the Series; its type is a NumPy array
print(type(series.values))

<class 'numpy.ndarray'>


In [11]:
# Arithmetic first aligns the two Series on their index labels and is then applied elementwise;
# labels present in only one Series (here 3 and 4, which ser_b lacks) produce NaN in the result
print(ser_a + ser_b)
print(ser_a - ser_b)
print(ser_a * ser_b)
print(ser_a / ser_b)

0    30.0
1    25.0
2    40.0
3     NaN
4     NaN
dtype: float64
0     0.0
1    15.0
2    26.0
3     NaN
4     NaN
dtype: float64
0    225.0
1    100.0
2    231.0
3      NaN
4      NaN
dtype: float64
0    1.000000
1    4.000000
2    4.714286
3         NaN
4         NaN
dtype: float64


### DataFrame creation

A basic data structure in pandas is a DataFrame. It's a 2-dimensional data structure with rows and columns. One can easily create a DataFrame from a NumPy array.

In [12]:
# Seed NumPy's global generator right before drawing, so that re-running this cell
# (or "Restart Kernel -> Run All") always produces the same table.
np.random.seed(1)  # arbitrary fixed seed; any integer works
arr = np.random.random([10, 5])  # 10 rows x 5 columns of floats drawn uniformly from [0, 1)
df = pd.DataFrame(arr)  # rows and columns both get default integer labels
df

Unnamed: 0,0,1,2,3,4
0,0.72187,0.111629,0.806451,0.715954,0.749427
1,0.698741,0.611024,0.106743,0.879385,0.813148
2,0.110785,0.808916,0.918424,0.857874,0.508647
3,0.95733,0.219612,0.526522,0.793173,0.349705
4,0.922507,0.1193,0.139745,0.53106,0.076025
5,0.881841,0.900386,0.55602,0.203753,0.723498
6,0.848798,0.945725,0.505634,0.011348,0.90119
7,0.667643,0.661719,0.318361,0.43638,0.901475
8,0.870707,0.964,0.704737,0.938505,0.308658
9,0.897896,0.317778,0.15175,0.081481,0.065211


We can give names to columns and rows; the names don't have to be unique.

In [13]:
# Label the columns col_0, col_1, ... - one label per existing column
# (df.shape[1] is the number of columns).
df.columns = list(map("col_{}".format, range(df.shape[1])))
df

Unnamed: 0,col_0,col_1,col_2,col_3,col_4
0,0.72187,0.111629,0.806451,0.715954,0.749427
1,0.698741,0.611024,0.106743,0.879385,0.813148
2,0.110785,0.808916,0.918424,0.857874,0.508647
3,0.95733,0.219612,0.526522,0.793173,0.349705
4,0.922507,0.1193,0.139745,0.53106,0.076025
5,0.881841,0.900386,0.55602,0.203753,0.723498
6,0.848798,0.945725,0.505634,0.011348,0.90119
7,0.667643,0.661719,0.318361,0.43638,0.901475
8,0.870707,0.964,0.704737,0.938505,0.308658
9,0.897896,0.317778,0.15175,0.081481,0.065211


In [14]:
# Label the rows row_0, row_1, ... - one label per existing row
# (df.shape[0] is the number of rows) - and give the index itself a name.
df.index = list(map("row_{}".format, range(df.shape[0])))
df.index.name = "row_number"
df

Unnamed: 0_level_0,col_0,col_1,col_2,col_3,col_4
row_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
row_0,0.72187,0.111629,0.806451,0.715954,0.749427
row_1,0.698741,0.611024,0.106743,0.879385,0.813148
row_2,0.110785,0.808916,0.918424,0.857874,0.508647
row_3,0.95733,0.219612,0.526522,0.793173,0.349705
row_4,0.922507,0.1193,0.139745,0.53106,0.076025
row_5,0.881841,0.900386,0.55602,0.203753,0.723498
row_6,0.848798,0.945725,0.505634,0.011348,0.90119
row_7,0.667643,0.661719,0.318361,0.43638,0.901475
row_8,0.870707,0.964,0.704737,0.938505,0.308658
row_9,0.897896,0.317778,0.15175,0.081481,0.065211


Now we can inspect some basic properties of the DataFrame

In [15]:
# (number of rows, number of columns)
print(df.shape)

(10, 5)


In [16]:
# Column labels
print(df.columns)

Index(['col_0', 'col_1', 'col_2', 'col_3', 'col_4'], dtype='object')


In [17]:
# Row labels (the index), together with the name given to the index
print(df.index)

Index(['row_0', 'row_1', 'row_2', 'row_3', 'row_4', 'row_5', 'row_6', 'row_7',
       'row_8', 'row_9'],
      dtype='object', name='row_number')


In [18]:
# Data type of each column
print(df.dtypes)

col_0    float64
col_1    float64
col_2    float64
col_3    float64
col_4    float64
dtype: object


### DataFrame slicing and indexing

In [19]:
# Selecting a particular column is very easy: index the DataFrame with the column label
df["col_0"]

row_number
row_0    0.721870
row_1    0.698741
row_2    0.110785
row_3    0.957330
row_4    0.922507
row_5    0.881841
row_6    0.848798
row_7    0.667643
row_8    0.870707
row_9    0.897896
Name: col_0, dtype: float64

In [20]:
# or we can do it this way (attribute access; it only works when the column label
# is a valid Python identifier that does not clash with a DataFrame attribute or method)
df.col_0

row_number
row_0    0.721870
row_1    0.698741
row_2    0.110785
row_3    0.957330
row_4    0.922507
row_5    0.881841
row_6    0.848798
row_7    0.667643
row_8    0.870707
row_9    0.897896
Name: col_0, dtype: float64

In [21]:
# A single DataFrame column is a Series
type(df["col_0"])

pandas.core.series.Series

In [22]:
# We can also select a subset of columns by passing a list of labels inside the brackets;
# the result is a DataFrame
df[["col_0", "col_1"]]

Unnamed: 0_level_0,col_0,col_1
row_number,Unnamed: 1_level_1,Unnamed: 2_level_1
row_0,0.72187,0.111629
row_1,0.698741,0.611024
row_2,0.110785,0.808916
row_3,0.95733,0.219612
row_4,0.922507,0.1193
row_5,0.881841,0.900386
row_6,0.848798,0.945725
row_7,0.667643,0.661719
row_8,0.870707,0.964
row_9,0.897896,0.317778


#### Location attribute

In [23]:
# To select a particular row we pass its index label to the .loc indexer
df.loc["row_0"]

col_0    0.721870
col_1    0.111629
col_2    0.806451
col_3    0.715954
col_4    0.749427
Name: row_0, dtype: float64

In [24]:
# We can of course select a subset of rows by passing a list of index labels
df.loc[["row_0", "row_1"]]

Unnamed: 0_level_0,col_0,col_1,col_2,col_3,col_4
row_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
row_0,0.72187,0.111629,0.806451,0.715954,0.749427
row_1,0.698741,0.611024,0.106743,0.879385,0.813148


In [25]:
# The above is a DataFrame, so we can access its columns
# (two chained lookups: rows first, then columns)
df.loc[["row_0", "row_1"]][["col_1", "col_2"]]

Unnamed: 0_level_0,col_1,col_2
row_number,Unnamed: 1_level_1,Unnamed: 2_level_1
row_0,0.111629,0.806451
row_1,0.611024,0.106743


In [26]:
# or we can access this sub-frame directly, with a single .loc call
# that takes the row labels first and the column labels second
df.loc[["row_0", "row_1"], ["col_1", "col_2"]]

Unnamed: 0_level_0,col_1,col_2
row_number,Unnamed: 1_level_1,Unnamed: 2_level_1
row_0,0.111629,0.806451
row_1,0.611024,0.106743


In [27]:
# A single selected row is a Series indexed by the column labels,
# so to access one of its items we specify the label of that item
print("Type of selected single row: {}".format(type(df.loc["row_0"])))
df.loc["row_0"]["col_3"]

Type of selected single row: <class 'pandas.core.series.Series'>


0.7159541656552846

In [28]:
# or do it directly, passing the row label and the column label to .loc at once
df.loc["row_0", "col_3"]

0.7159541656552846

Whereas `.loc` is used to access rows by their index labels, `.iloc` is used to access rows by their integer positions.

In [None]:
# The row at position 0, i.e. the first row
df.iloc[0]

In [None]:
# The row at position 0 is a Series, so one of its items can be picked by column label
df.iloc[0]["col_3"]

In [None]:
# The same value, addressed purely by position: row 0, column 3
df.iloc[0, 3]

Similarly to NumPy, you can get slices with a colon `:`

In [None]:
# Label-based slices: rows from the start up to 'row_3', columns from 'col_2' to 'col_4'
df.loc[:"row_3", "col_2":"col_4"]

But be careful! When selecting with .loc, __both__ endpoints of a slice are __included__, unlike when selecting with .iloc, where the end position is excluded.

In [None]:
# 'row_3' sits at position 3 (it is the fourth row): a .loc slice ending at "row_3" includes it,
# but .iloc[:3] stops before position 3, so it is left out
# similarly for 'col_4' (position 4), which the 2:4 column slice leaves out
df.iloc[:3, 2:4]

Now the tricky part:

In [None]:
# A single label inside plain [] selects the COLUMN with that label
df["col_3"]

In [None]:
# This of course doesn't work: a single key inside plain [] is looked up among the
# COLUMN labels, and there is no column labelled 3, so pandas raises KeyError.
# The error is caught and printed so that the notebook still survives "Run All".
try:
    df[3]
except KeyError as err:
    print("KeyError: {}".format(err))

In [None]:
# But this one works: a slice inside plain [] selects ROWS, here the first three
df[:3]

In [None]:
# You can achieve the same result, more explicitly, with iloc: first three rows, all columns
df.iloc[:3, :]

A faster way to get a scalar value:

In [None]:
# With row and column labels
df.at["row_3", "col_1"]

In [None]:
# Or with the row and column positions
df.iat[3, 1]

In [None]:
# but this doesn't work, because .at must select a single scalar and therefore
# needs BOTH a row label and a column label.
# The error is caught and printed so that the notebook still survives "Run All".
# NOTE(review): the exception class raised here appears to differ between pandas
# releases, hence the broad `except Exception` in this demonstration cell.
try:
    df.at["row_3"]
except Exception as err:
    print("{}: {}".format(type(err).__name__, err))

In [None]:
# nor this one. Again: .at can access only scalar values, so a slice of row labels is rejected.
# The error is caught and printed so that the notebook still survives "Run All".
# NOTE(review): the exception class raised here appears to differ between pandas
# releases, hence the broad `except Exception` in this demonstration cell.
try:
    df.at[:"row_3", "col_3"]
except Exception as err:
    print("{}: {}".format(type(err).__name__, err))

There are also some nice methods to sort data

In [None]:
# By default sort_index sorts the rows by their index labels (here in descending order)
df.sort_index(ascending=False)

In [None]:
# But with axis=1 it sorts the columns by their labels instead (again in descending order)
df.sort_index(axis=1, ascending=False)

In [None]:
# And sort_values sorts the rows by the values in a given column (ascending by default)
df.sort_values(by="col_2")