# Pandas

### Tabular dataset examples

* temperature, measured_at, location
    * row = observations, column = variable
    
* BRICKS countries
    * country, capital, area, population
    * Brazil, Russia, India, China, South Africa
    

### Datasets in Python
* 2D Numpy array?
    * One data type
* Pandas!
    * High level data manipulation tool
    * Developed by Wes McKinney
    * Built on Numpy
    * DataFrame

In [2]:
# DataFram from Dictionary, manually

dict = {
    "country":["Brazil", "Russia", "India", "China", "South Africa"],
    "capital":["Brasilia", "Moscow", "New Delhi", "Beijing", "Pretoria"],
    "area":[8.516, 17.10, 3.286, 9.597, 1.221], 
    "population": [200.4, 143.5, 1252, 1357, 52.98]}

import pandas as pd

brics = pd.DataFrame(dict)

brics

Unnamed: 0,country,capital,area,population
0,Brazil,Brasilia,8.516,200.4
1,Russia,Moscow,17.1,143.5
2,India,New Delhi,3.286,1252.0
3,China,Beijing,9.597,1357.0
4,South Africa,Pretoria,1.221,52.98


In [3]:
# set row label manually
brics.index = ["BR", "RU", "IN", "CH", "SA"]
brics

Unnamed: 0,country,capital,area,population
BR,Brazil,Brasilia,8.516,200.4
RU,Russia,Moscow,17.1,143.5
IN,India,New Delhi,3.286,1252.0
CH,China,Beijing,9.597,1357.0
SA,South Africa,Pretoria,1.221,52.98


In [13]:
!pwd
!ls -l | grep csv

/home/mlitsey/Documents/projects/Learning_AI/DC
-rw-rw-r--. 1 mlitsey mlitsey    197 Nov 26 13:26 brics.csv


In [11]:
# import from csv file
brics = pd.read_csv("/home/mlitsey/Documents/projects/Learning_AI/DC/brics.csv")
brics

Unnamed: 0.1,Unnamed: 0,country,capital,area,population
0,BR,Brazil,Brasilia,8.516,200.4
1,RU,Russia,Moscow,17.1,143.5
2,IN,India,New Delhi,3.286,1252.0
3,CH,China,Beijing,9.597,1357.0
4,SA,South Africa,Pretoria,1.221,52.98


In [12]:
# set the row labels
brics = pd.read_csv("/home/mlitsey/Documents/projects/Learning_AI/DC/brics.csv", index_col = 0)
brics

Unnamed: 0,country,capital,area,population
BR,Brazil,Brasilia,8.516,200.4
RU,Russia,Moscow,17.1,143.5
IN,India,New Delhi,3.286,1252.0
CH,China,Beijing,9.597,1357.0
SA,South Africa,Pretoria,1.221,52.98


In [14]:
# Exercise 1

# Pre-defined lists
names = ['United States', 'Australia', 'Japan', 'India', 'Russia', 'Morocco', 'Egypt']
dr =  [True, False, False, False, True, True, True]
cpc = [809, 731, 588, 18, 200, 70, 45]

# Import pandas as pd
import pandas as pd

# Create dictionary my_dict with three key:value pairs: my_dict
my_dict = {
    'country':names,
    'drives_right':dr,
    'cars_per_cap':cpc}

# Build a DataFrame cars from my_dict: cars
cars = pd.DataFrame(my_dict)

# Print cars
print(cars)

         country  drives_right  cars_per_cap
0  United States          True           809
1      Australia         False           731
2          Japan         False           588
3          India         False            18
4         Russia          True           200
5        Morocco          True            70
6          Egypt          True            45


In [16]:
# Exercise 2

import pandas as pd

# Build cars DataFrame
names = ['United States', 'Australia', 'Japan', 'India', 'Russia', 'Morocco', 'Egypt']
dr =  [True, False, False, False, True, True, True]
cpc = [809, 731, 588, 18, 200, 70, 45]
dict = { 'country':names, 'drives_right':dr, 'cars_per_cap':cpc }
cars = pd.DataFrame(dict)
#print(cars)

# Definition of row_labels
row_labels = ['US', 'AUS', 'JAP', 'IN', 'RU', 'MOR', 'EG']

# Specify row labels of cars
cars.index = row_labels

# Print cars again
print(cars)

           country  drives_right  cars_per_cap
US   United States          True           809
AUS      Australia         False           731
JAP          Japan         False           588
IN           India         False            18
RU          Russia          True           200
MOR        Morocco          True            70
EG           Egypt          True            45


In [18]:
# Exercise 3

# Import pandas as pd
import pandas as pd

# Import the cars.csv data: cars
cars = pd.read_csv("cars.csv")

# Print out cars
print(cars)

  Unnamed: 0        country  drives_right  cars_per_cap
0         US  United States          True           809
1        AUS      Australia         False           731
2        JAP          Japan         False           588
3         IN          India         False            18
4         RU         Russia          True           200
5        MOR        Morocco          True            70
6         EG          Egypt          True            45


In [20]:
# Exercise 4

# Import pandas as pd
import pandas as pd

# Fix import by including index_col
cars = pd.read_csv('cars.csv', index_col = 0)

# Print out cars
print(cars)

           country  drives_right  cars_per_cap
US   United States          True           809
AUS      Australia         False           731
JAP          Japan         False           588
IN           India         False            18
RU          Russia          True           200
MOR        Morocco          True            70
EG           Egypt          True            45


### Index and Select Data
* Square brackets
* Advanced methods
    * loc
    * iloc

In [21]:
# Column Access [ ] 

brics["country"]

BR          Brazil
RU          Russia
IN           India
CH           China
SA    South Africa
Name: country, dtype: object

In [22]:
type(brics["country"])
# series are 1D labelled array

pandas.core.series.Series

In [27]:
brics[["country"]]


Unnamed: 0,country
BR,Brazil
RU,Russia
IN,India
CH,China
SA,South Africa


In [28]:
type(brics[["country"]])

pandas.core.frame.DataFrame

In [29]:
brics[["country", "capital"]]

Unnamed: 0,country,capital
BR,Brazil,Brasilia
RU,Russia,Moscow
IN,India,New Delhi
CH,China,Beijing
SA,South Africa,Pretoria


In [30]:
# Row Access [ ] 

brics[1:4]

Unnamed: 0,country,capital,area,population
RU,Russia,Moscow,17.1,143.5
IN,India,New Delhi,3.286,1252.0
CH,China,Beijing,9.597,1357.0


### Discussion [ ]
* Sqare brackets: limited functionality
* Ideally
    * 2D Numpy arrays
    * my_array[ rows , columns ]
* Pandas
    * loc (label-based)
    * iloc (integer position-based)

In [31]:
# Row Access loc
brics.loc["RU"]
# row as pandas series

country       Russia
capital       Moscow
area            17.1
population     143.5
Name: RU, dtype: object

In [32]:
brics.loc[["RU"]]
# row as DataFrame

Unnamed: 0,country,capital,area,population
RU,Russia,Moscow,17.1,143.5


In [33]:
brics.loc[["RU", "IN", "CH"]]

Unnamed: 0,country,capital,area,population
RU,Russia,Moscow,17.1,143.5
IN,India,New Delhi,3.286,1252.0
CH,China,Beijing,9.597,1357.0


In [34]:
# Row & Column loc
brics.loc[["RU", "IN", "CH"], ["country", "capital"]]

Unnamed: 0,country,capital
RU,Russia,Moscow
IN,India,New Delhi
CH,China,Beijing


In [35]:
# all rows but selected columns

brics.loc[:, ["country", "capital"]]

Unnamed: 0,country,capital
BR,Brazil,Brasilia
RU,Russia,Moscow
IN,India,New Delhi
CH,China,Beijing
SA,South Africa,Pretoria


### Recap
* Square brackets
    * Column access: brics[["country", "capital"]]
    * Row access: only through slicing: brics[1:4]
* loc (label-based)
    * Row access: brics.loc[["RU", "IN", "CH"]]
    * Column access: brics.loc[:, ["country", "capital"]]
    * Row & Column access: brics.loc[["RU", "IN", "CH"], ["country", "capital"]]

In [36]:
# Row Access iloc
brics.loc[["RU"]]
brics.iloc[[1]]

Unnamed: 0,country,capital,area,population
RU,Russia,Moscow,17.1,143.5


In [37]:
brics.iloc[[1,2,3]]

Unnamed: 0,country,capital,area,population
RU,Russia,Moscow,17.1,143.5
IN,India,New Delhi,3.286,1252.0
CH,China,Beijing,9.597,1357.0


In [38]:
brics.iloc[[1,2,3], [0,1]]

Unnamed: 0,country,capital
RU,Russia,Moscow
IN,India,New Delhi
CH,China,Beijing


In [39]:
brics.iloc[:, [0,1]]

Unnamed: 0,country,capital
BR,Brazil,Brasilia
RU,Russia,Moscow
IN,India,New Delhi
CH,China,Beijing
SA,South Africa,Pretoria


In [47]:
# Exercise 5

# Import cars data
import pandas as pd
cars = pd.read_csv('cars.csv', index_col = 0)
#print(cars)
# Print out country column as Pandas Series
print(cars['country'])

# Print out country column as Pandas DataFrame
print(cars[["country"]])

# Print out DataFrame with country and drives_right columns
print(cars[["country", "drives_right"]])

US     United States
AUS        Australia
JAP            Japan
IN             India
RU            Russia
MOR          Morocco
EG             Egypt
Name: country, dtype: object
           country
US   United States
AUS      Australia
JAP          Japan
IN           India
RU          Russia
MOR        Morocco
EG           Egypt
           country  drives_right
US   United States          True
AUS      Australia         False
JAP          Japan         False
IN           India         False
RU          Russia          True
MOR        Morocco          True
EG           Egypt          True


In [51]:
# Exercise 6

# Import cars data
import pandas as pd
cars = pd.read_csv('cars.csv', index_col = 0)

# Print out first 3 observations
print(cars[:3])

# Print out fourth, fifth and sixth observation
print(cars[3:6])

           country  drives_right  cars_per_cap
US   United States          True           809
AUS      Australia         False           731
JAP          Japan         False           588
     country  drives_right  cars_per_cap
IN     India         False            18
RU    Russia          True           200
MOR  Morocco          True            70


In [54]:
# Exercise 7

# Import cars data
import pandas as pd
cars = pd.read_csv('cars.csv', index_col = 0)

# Print out observation for Japan
#print(cars.loc['JAP'])
print(cars.iloc[2])

# Print out observations for Australia and Egypt
print(cars.loc[['AUS', 'EG']])
#print(cars.iloc[[1,6]])

country         Japan
drives_right    False
cars_per_cap      588
Name: JAP, dtype: object
       country  drives_right  cars_per_cap
AUS  Australia         False           731
EG       Egypt          True            45


In [55]:
# Exercise 8

# Import cars data
import pandas as pd
cars = pd.read_csv('cars.csv', index_col = 0)

# Print out drives_right value of Morocco
print(cars.iloc[[5],[2]])

# Print sub-DataFrame
print(cars.iloc[[4,5],[1,2]])

     cars_per_cap
MOR            70
     drives_right  cars_per_cap
RU           True           200
MOR          True            70


In [56]:
# Exercise 9

# Import cars data
import pandas as pd
cars = pd.read_csv('cars.csv', index_col = 0)

# Print out drives_right column as Series
print(cars['drives_right'])

# Print out drives_right column as DataFrame
print(cars.iloc[:,[2]])

# Print out cars_per_cap and drives_right as DataFrame

print(cars.loc[:,["cars_per_cap","drives_right"]])

US      True
AUS    False
JAP    False
IN     False
RU      True
MOR     True
EG      True
Name: drives_right, dtype: bool
     cars_per_cap
US            809
AUS           731
JAP           588
IN             18
RU            200
MOR            70
EG             45
     cars_per_cap  drives_right
US            809          True
AUS           731         False
JAP           588         False
IN             18         False
RU            200          True
MOR            70          True
EG             45          True
