# Pandas

In [64]:
import pandas as pd
import numpy as np

### Build DataFrame from CSV file

In [65]:
# Load the cars table with default settings: the first line of the file supplies
# the column names and pandas generates a 0..n-1 integer row index.
cars = pd.read_csv("datasets/cars.csv")
cars

Unnamed: 0,country_id,cars_per_cap,country,drives_right
0,US,809,United States,True
1,AUS,731,Australia,False
2,JAP,588,Japan,False
3,IN,18,India,False
4,RU,200,Russia,True
5,MOR,70,Morocco,True
6,EG,45,Egypt,True


In [66]:
# index_col=0 makes the file's first column (the country codes) the row index
# instead of an ordinary data column.
brics = pd.read_csv("datasets/brics.csv", index_col=0)
brics

Unnamed: 0,country,population,area,capital
BR,Brazil,200,8515767,Brasilia
RU,Russia,144,17098242,Moscow
IN,India,1252,3287590,New Delhi
CH,China,1357,9596961,Beijing
SA,South Africa,55,1221037,Pretoria


In [67]:
# The SILSO file has no header row, so the column labels are supplied here.
column_names = ['year', 'month', 'day', 'dec_date', 'sunspots', 'definite']
date_parts = column_names[:3]

# Read the raw table; the string ' -1' in the sunspots column is treated as NaN.
silso_raw = pd.read_csv('datasets/silso.csv', header=None, names=column_names,
                        na_values={'sunspots': [' -1']})

# Assemble one datetime column from year/month/day and put it first.
# This replaces read_csv's nested-list form parse_dates=[[0, 1, 2]], which is
# deprecated since pandas 2.2 and no longer supported by pandas 3, while
# producing the same frame: year_month_day, dec_date, sunspots, definite.
silso = silso_raw.drop(date_parts, axis=1)
silso.insert(0, 'year_month_day', pd.to_datetime(silso_raw[date_parts]))
silso

Unnamed: 0,year_month_day,dec_date,sunspots,definite
0,1818-01-01,1818.004,,1
1,1818-01-02,1818.007,,1
2,1818-01-03,1818.01,,1
3,1818-01-04,1818.012,22.0,1
4,1818-01-05,1818.015,,1
5,1818-01-06,1818.018,,1
6,1818-01-07,1818.02,,1
7,1818-01-08,1818.023,46.0,1
8,1818-01-09,1818.026,59.0,1
9,1818-01-10,1818.029,63.0,1


In [68]:
# Parse the messy stock file: fields are separated by single spaces and anything
# after a '#' is ignored as a comment. Per the pandas docs, header row numbers
# are counted after commented/blank lines are skipped, so header=3 selects the
# fourth remaining line as the column names.
stock_data = pd.read_csv(
    'datasets/stock_data_messy.tsv',
    sep=' ',
    comment='#',
    header=3,
)
stock_data

Unnamed: 0,name,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
0,IBM,156.08,160.01,159.81,165.22,172.25,167.15,164.75,152.77,145.36,146.11,137.21,137.96
1,MSFT,45.51,43.08,42.13,43.47,47.53,45.96,45.61,45.51,43.56,48.7,53.88,55.4
2,GOOGLE,512.42,537.99,559.72,540.5,535.24,532.92,590.09,636.84,617.93,663.59,735.39,755.35
3,APPLE,110.64,125.43,125.97,127.29,128.76,127.81,125.34,113.39,112.8,113.36,118.16,111.73


### Create DataFrame from dictionary

In [69]:
# Column-oriented data: each key becomes a column, each list holds that column's values.
users_dict = {'weekday': ['Sun', 'Sun', 'Mon', 'Mon'],
              'city': ['Austin', 'Dallas', 'Austin', 'Dallas'],
              'visitors': [139, 237, 326, 456],
              'signups': [7, 12, 3, 5]}

# Pin the column order explicitly. Older pandas sorted dict keys alphabetically
# (which is what the saved output shows), whereas newer versions keep the
# dict's insertion order; passing `columns` gives the same layout on both.
users_df = pd.DataFrame(users_dict, columns=sorted(users_dict))
users_df

Unnamed: 0,city,signups,visitors,weekday
0,Austin,7,139,Sun
1,Dallas,12,237,Sun
2,Austin,3,326,Mon
3,Dallas,5,456,Mon


In [70]:
# Seven height measurements; the scalar 'M' under 'sex' is repeated for every
# row when the frame is built.
heights = [
    59.1, 61.3, 64.7, 60.5,
    64.4, 65.2, 61.8,
]
heights_sex_dict = dict(height=heights, sex='M')
heights_sex_df = pd.DataFrame(data=heights_sex_dict)
heights_sex_df

Unnamed: 0,height,sex
0,59.1,M
1,61.3,M
2,64.7,M
3,60.5,M
4,64.4,M
5,65.2,M
6,61.8,M


### Create DataFrame from list

In [71]:
# Each inner list is one row; with no labels given, rows and columns both get
# default integer labels starting at 0.
array_df = pd.DataFrame(
    data=[
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9],
    ]
)
array_df

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6
2,7,8,9


### Writing files

In [72]:
# Save the parsed stock table as CSV; index=False leaves the integer row index
# out of the file.
stock_data.to_csv('datasets/stock_data_clean.csv', index=False)

In [73]:
# Write the sunspot table to an Excel workbook (the row index is written too,
# since `index` is left at its default).
silso.to_excel('datasets/silso.xlsx')

In [74]:
# First rows of the frame; without an argument head() returns up to five rows,
# which is all of brics.
brics.head()

Unnamed: 0,country,population,area,capital
BR,Brazil,200,8515767,Brasilia
RU,Russia,144,17098242,Moscow
IN,India,1252,3287590,New Delhi
CH,China,1357,9596961,Beijing
SA,South Africa,55,1221037,Pretoria


In [75]:
# Only the first two rows.
brics.head(2)

Unnamed: 0,country,population,area,capital
BR,Brazil,200,8515767,Brasilia
RU,Russia,144,17098242,Moscow


In [76]:
# Last rows of the frame; the default is also five, so the whole frame is shown.
brics.tail()

Unnamed: 0,country,population,area,capital
BR,Brazil,200,8515767,Brasilia
RU,Russia,144,17098242,Moscow
IN,India,1252,3287590,New Delhi
CH,China,1357,9596961,Beijing
SA,South Africa,55,1221037,Pretoria


In [77]:
# Only the last two rows.
brics.tail(2)

Unnamed: 0,country,population,area,capital
CH,China,1357,9596961,Beijing
SA,South Africa,55,1221037,Pretoria


### DataFrame indexes and columns

In [78]:
# read_csv returned a DataFrame.
type(brics)

pandas.core.frame.DataFrame

In [79]:
# Column labels of the frame.
brics.columns

Index(['country', 'population', 'area', 'capital'], dtype='object')

In [80]:
# The column labels are stored in a pandas Index object, not a plain list.
type(brics.columns)

pandas.indexes.base.Index

In [81]:
# Row labels: the country codes that index_col=0 took from the CSV's first column.
brics.index

Index(['BR', 'RU', 'IN', 'CH', 'SA'], dtype='object')

In [82]:
# Row labels use the same Index type as the column labels.
type(brics.index)

pandas.indexes.base.Index

In [83]:
# Swap the default integer column labels for letters (modifies array_df in
# place), then print a structural summary of the frame.
array_df.columns = list("ABC")
array_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
A    3 non-null int64
B    3 non-null int64
C    3 non-null int64
dtypes: int64(3)
memory usage: 152.0 bytes


In [84]:
# Index the rows by date: the 'year_month_day' column is kept as a regular
# column (drop=False) and the index itself is labelled 'date'.
silso = silso.set_index('year_month_day', drop=False).rename_axis('date')
silso

Unnamed: 0_level_0,year_month_day,dec_date,sunspots,definite
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1818-01-01,1818-01-01,1818.004,,1
1818-01-02,1818-01-02,1818.007,,1
1818-01-03,1818-01-03,1818.01,,1
1818-01-04,1818-01-04,1818.012,22.0,1
1818-01-05,1818-01-05,1818.015,,1
1818-01-06,1818-01-06,1818.018,,1
1818-01-07,1818-01-07,1818.02,,1
1818-01-08,1818-01-08,1818.023,46.0,1
1818-01-09,1818-01-09,1818.026,59.0,1
1818-01-10,1818-01-10,1818.029,63.0,1


In [85]:
# (number of rows, number of columns)
brics.shape

(5, 4)

In [86]:
# Total number of cells: rows x columns (5 x 4 = 20).
brics.size

20

In [87]:
# Number of non-null values in each column.
brics.count()

country       5
population    5
area          5
capital       5
dtype: int64

In [88]:
# Printed summary: index range, per-column non-null counts and dtypes, memory usage.
brics.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, BR to SA
Data columns (total 4 columns):
country       5 non-null object
population    5 non-null int64
area          5 non-null int64
capital       5 non-null object
dtypes: int64(2), object(2)
memory usage: 200.0+ bytes


In [89]:
# Same summary for the sunspot data. 'sunspots' has fewer non-null entries than
# the other columns because the ' -1' markers were read as NaN.
silso.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 10 entries, 1818-01-01 to 1818-01-10
Data columns (total 4 columns):
year_month_day    10 non-null datetime64[ns]
dec_date          10 non-null float64
sunspots          4 non-null float64
definite          10 non-null int64
dtypes: datetime64[ns](1), float64(2), int64(1)
memory usage: 400.0 bytes


### DataFrame Series (cols & rows)

In [90]:
# Single-bracket selection with one column label returns that column as a
# Series that keeps the frame's row index.
country = brics['country']
country

BR          Brazil
RU          Russia
IN           India
CH           China
SA    South Africa
Name: country, dtype: object

In [91]:
# A single column is a Series, not a DataFrame.
type(country)

pandas.core.series.Series

In [109]:
# Attribute access is shorthand for brics['country']; it only works when the
# column name is a valid Python identifier that does not clash with an
# existing DataFrame attribute or method.
brics.country

BR          Brazil
RU          Russia
IN           India
CH           China
SA    South Africa
Name: country, dtype: object

In [110]:
# Dictionary-style lookup of the same column.
brics.get('country')

BR          Brazil
RU          Russia
IN           India
CH           China
SA    South Africa
Name: country, dtype: object

In [113]:
# Label-based row lookup: the 'BR' row comes back as a Series whose index is
# the frame's column names.
# NOTE(review): the saved output lists `on_earth`, a column that is only added
# in a later cell of this notebook (the execution counts are out of order).
# A fresh top-to-bottom run will show only the four CSV columns here.
brics.loc['BR']

country         Brazil
population         200
area           8515767
capital       Brasilia
on_earth          True
Name: BR, dtype: object

In [114]:
# A single row selected by label is also a Series.
type(brics.loc['BR'])

pandas.core.series.Series

In [92]:
# head() works on a Series as well: first two entries of the country column.
country.head(2)

BR    Brazil
RU    Russia
Name: country, dtype: object

In [93]:
# Last two entries of the country column.
country.tail(2)

CH           China
SA    South Africa
Name: country, dtype: object

In [94]:
# Build a Series from a plain list of the integers 1 through 4; pandas supplies
# a default integer index starting at 0.
list_series = pd.Series(data=list(range(1, 5)))
list_series

0    1
1    2
2    3
3    4
dtype: int64

### Broadcasting

In [95]:
# Assigning a scalar to a new column label broadcasts the value to every row.
# This adds the 'fees' column to users_df in place.
users_df['fees'] = 0
users_df

Unnamed: 0,city,signups,visitors,weekday,fees
0,Austin,7,139,Sun,0
1,Dallas,12,237,Sun,0
2,Austin,3,326,Mon,0
3,Dallas,5,456,Mon,0


In [96]:
# Here the new column is given as an explicit list with one value per row (the
# list length has to match the five rows of brics) rather than a single
# broadcast scalar. This adds 'on_earth' to brics in place, so later cells see
# five columns.
brics['on_earth'] = [True, True, True, True, True]
brics

Unnamed: 0,country,population,area,capital,on_earth
BR,Brazil,200,8515767,Brasilia,True
RU,Russia,144,17098242,Moscow,True
IN,India,1252,3287590,New Delhi,True
CH,China,1357,9596961,Beijing,True
SA,South Africa,55,1221037,Pretoria,True


### Slicing DataFrame

In [100]:
# Positional slicing: rows at positions 0 and 1, all columns.
brics.iloc[:2,:]

Unnamed: 0,country,population,area,capital,on_earth
BR,Brazil,200,8515767,Brasilia,True
RU,Russia,144,17098242,Moscow,True


In [101]:
# Negative start counts from the end: the last two rows, all columns.
brics.iloc[-2:,:]

Unnamed: 0,country,population,area,capital,on_earth
CH,China,1357,9596961,Beijing,True
SA,South Africa,55,1221037,Pretoria,True


In [102]:
# Rows at positions 1 and 2 (the end of the slice is exclusive), all columns.
brics.iloc[1:3,:]

Unnamed: 0,country,population,area,capital,on_earth
RU,Russia,144,17098242,Moscow,True
IN,India,1252,3287590,New Delhi,True


In [106]:
# All rows; columns from position 1 up to, but not including, the last one.
brics.iloc[:,1:-1]

Unnamed: 0,population,area,capital
BR,200,8515767,Brasilia
RU,144,17098242,Moscow
IN,1252,3287590,New Delhi
CH,1357,9596961,Beijing
SA,55,1221037,Pretoria


### Sub-DataFrame (cols & rows)

In [46]:
# Passing a list of labels (even a single one) returns a DataFrame rather than
# a Series.
brics.loc[['BR']]

Unnamed: 0,country,population,area,capital,on_earth
BR,Brazil,200,8515767,Brasilia,True


In [47]:
# Several row labels at once; rows come back in the order listed.
brics.loc[['BR', 'CH']]

Unnamed: 0,country,population,area,capital,on_earth
BR,Brazil,200,8515767,Brasilia,True
CH,China,1357,9596961,Beijing,True


In [61]:
# Double brackets select a list of columns, so the result is a one-column
# DataFrame instead of a Series.
brics[['country']]

Unnamed: 0,country
BR,Brazil
RU,Russia
IN,India
CH,China
SA,South Africa


In [48]:
# Narrow silso to its two measurement columns. The name is rebound, so the
# other columns are no longer reachable through `silso` afterwards.
silso = silso.loc[:, ['sunspots', 'definite']]
silso

Unnamed: 0_level_0,sunspots,definite
date,Unnamed: 1_level_1,Unnamed: 2_level_1
1818-01-01,,1
1818-01-02,,1
1818-01-03,,1
1818-01-04,22.0,1
1818-01-05,,1
1818-01-06,,1
1818-01-07,,1
1818-01-08,46.0,1
1818-01-09,59.0,1
1818-01-10,63.0,1


In [115]:
# Row labels and column labels together: a 2x2 sub-DataFrame.
brics.loc[['BR', 'CH'], ['population', 'area']]

Unnamed: 0,population,area
BR,200,8515767
CH,1357,9596961


### Element access

In [118]:
# Column first, then row label: yields the single stored value.
brics['capital'].loc['CH']

'Beijing'

In [119]:
# The element is a plain Python string, not a pandas object.
type(brics['capital'].loc['CH'])

str

In [116]:
# Row first, then column: a chained lookup that builds the intermediate row
# Series before picking the value out of it.
brics.loc['CH']['capital']

'Beijing'

In [121]:
# Row and column label in a single .loc call: same value, no intermediate Series.
brics.loc['CH', 'capital']

'Beijing'

### DataFrames and NumPy

In [49]:
# A NumPy ufunc applied to a one-column DataFrame works element-wise and
# returns a DataFrame with the same index and column label.
brics_area_log10 = np.log10(brics[['area']])
brics_area_log10

Unnamed: 0,area
BR,6.930224
RU,7.232951
IN,6.516878
CH,6.982134
SA,6.086729


In [50]:
# The result is still a DataFrame, not a bare NumPy array.
type(brics_area_log10)

pandas.core.frame.DataFrame

In [51]:
# .values exposes the row labels as a NumPy array.
brics.index.values

array(['BR', 'RU', 'IN', 'CH', 'SA'], dtype=object)

In [52]:
# Confirms the row labels come back as a numpy.ndarray.
type(brics.index.values)

numpy.ndarray

In [53]:
# Column labels as a NumPy array (now including the added 'on_earth').
brics.columns.values

array(['country', 'population', 'area', 'capital', 'on_earth'], dtype=object)

In [54]:
# Confirms the column labels come back as a numpy.ndarray.
type(brics.columns.values)

numpy.ndarray

In [55]:
# The whole table as a 2-D NumPy array; because the columns hold different
# types, the array's dtype is object.
brics.values

array([['Brazil', 200, 8515767, 'Brasilia', True],
       ['Russia', 144, 17098242, 'Moscow', True],
       ['India', 1252, 3287590, 'New Delhi', True],
       ['China', 1357, 9596961, 'Beijing', True],
       ['South Africa', 55, 1221037, 'Pretoria', True]], dtype=object)

In [56]:
# Confirms the table data comes back as a numpy.ndarray.
type(brics.values)

numpy.ndarray

In [57]:
# One column's data as a 1-D NumPy array (the row labels are dropped).
brics['country'].values

array(['Brazil', 'Russia', 'India', 'China', 'South Africa'], dtype=object)

In [58]:
# Confirms a column's data comes back as a numpy.ndarray.
type(brics['country'].values)

numpy.ndarray

In [59]:
# One row's data as a 1-D NumPy array; mixed value types give dtype object.
brics.loc['BR'].values

array(['Brazil', 200, 8515767, 'Brasilia', True], dtype=object)

In [60]:
# Confirms a row's data comes back as a numpy.ndarray.
type(brics.loc['BR'].values)

numpy.ndarray