In [26]:
import pandas as pd
import numpy as np

In [27]:
# Population of each G7 country, in millions of inhabitants
g7_pop = pd.Series([35.467, 63.951, 80.940, 60.665, 127.061, 64.511, 318.523])
# A Series is a one-dimensional sequence of values paired with an index (labels);
# when no index is given, pandas assigns the default integer positions 0..n-1.
# The values themselves are stored in a NumPy array (see g7_pop.values).

In [28]:
g7_pop

0     35.467
1     63.951
2     80.940
3     60.665
4    127.061
5     64.511
6    318.523
dtype: float64

Someone might not know we're representing population in millions
of inhabitants. Series can have a name, to better document the 
purpose of the Series.

In [29]:
g7_pop.name = 'G7 Population in millions'

In [30]:
g7_pop

0     35.467
1     63.951
2     80.940
3     60.665
4    127.061
5     64.511
6    318.523
Name: G7 Population in millions, dtype: float64

Series are pretty similar to numpy arrays:

In [31]:
g7_pop.values

array([ 35.467,  63.951,  80.94 ,  60.665, 127.061,  64.511, 318.523])

They are actually backed by NumPy arrays:

In [32]:
type(g7_pop.values)

numpy.ndarray

And they look like simple Python lists or NumPy arrays, but they're actually more similar to Python dicts.
A Series has an index that's similar to the automatic index assigned to Python lists:

In [33]:
g7_pop[0]

35.467

In [34]:
g7_pop[1]

63.951

In [35]:
g7_pop.index

RangeIndex(start=0, stop=7, step=1)

But, in contrast to lists, we can explicitly define the index:

In [36]:
# Label each value with its country name (one label per element, in data order).
g7_pop.index = [
    'Canada', 'France', 'Germany', 'Italy',
    'Japan', 'United Kingdom', 'United States',
]

In [37]:
g7_pop

Canada             35.467
France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: G7 Population in millions, dtype: float64

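A Series looks like a dict: it maps labels (the index) to values.
Unlike a dict, it is also an ordered sequence that supports positional access, slicing and vectorized operations. (Since Python 3.7 dicts do preserve insertion order, but they cannot be indexed by position or sliced.)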

# Indexing

Indexing works similarly to lists and dictionaries: you use the index (label) of the element you're looking for.


In [38]:
g7_pop['Canada']

35.467

In [39]:
g7_pop['Japan']

127.061

Numeric position can also be used, with the iloc attribute:

In [40]:
g7_pop.iloc[0]

35.467

In [41]:
# Last element by position. Use .iloc explicitly: with a string index, a plain
# g7_pop[-1] is a label lookup that only works through a deprecated positional
# fallback (FutureWarning in recent pandas, KeyError once it is removed).
g7_pop.iloc[-1]

318.523

In [42]:
g7_pop[['Italy', 'France']]

Italy     60.665
France    63.951
Name: G7 Population in millions, dtype: float64

In [43]:
# A list of positions selects exactly those elements.
# Positional slices follow normal Python rules: iloc[0:2] excludes position 2.
# Label slices are the exception: 'France':'Italy' includes 'Italy'.
g7_pop.iloc[[0,1]]

Canada    35.467
France    63.951
Name: G7 Population in millions, dtype: float64

# Conditional Selection (boolean arrays)

The same boolean array techniques we saw applied to numpy arrays can be used for Pandas Series:

In [44]:
g7_pop

Canada             35.467
France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: G7 Population in millions, dtype: float64

In [45]:
g7_pop > 70

Canada            False
France            False
Germany            True
Italy             False
Japan              True
United Kingdom    False
United States      True
Name: G7 Population in millions, dtype: bool

In [46]:
g7_pop[g7_pop > 70]

Germany           80.940
Japan            127.061
United States    318.523
Name: G7 Population in millions, dtype: float64

In [47]:
g7_pop.mean()

107.30257142857144

In [48]:
g7_pop[g7_pop > g7_pop.mean()]

Japan            127.061
United States    318.523
Name: G7 Population in millions, dtype: float64

In [49]:
g7_pop.std()

97.24996987121581

In [50]:
 ~ not
 | or
 & and

SyntaxError: invalid syntax (783337949.py, line 1)

In [None]:
# Values more than half a standard deviation away from the mean, on either side.
# (With `>` in both comparisons the second clause was redundant: anything above
# mean + std/2 is already above mean - std/2.)
# Each comparison needs its own parentheses because | binds tighter than < and >.
g7_pop[(g7_pop < g7_pop.mean() - g7_pop.std() / 2) | (g7_pop > g7_pop.mean() + g7_pop.std() / 2)]

# Operations and methods

Series also support vectorized operations and aggregation functions, just like NumPy arrays:

In [None]:
g7_pop * 1_000_000

In [None]:
g7_pop.mean()

In [None]:
np.log(g7_pop)

In [None]:
# Mean over a label-based slice; unlike positional slices, both endpoints
# ('France' and 'Italy') are included.
g7_pop['France':'Italy'].mean()

# Boolean Arrays

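Boolean arrays work the same way as in NumPy.

Note: the outputs in this section were produced after the cells in "Modifying Series" (further down) had been run, so every value below 70 already shows as 99.99. Restart the kernel and run all cells in order to see results for the original data.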

In [101]:
g7_pop

Canada             99.990
France             99.990
Germany            80.940
Italy              99.990
Japan             127.061
United Kingdom     99.990
United States     318.523
Name: G7 Population in millions, dtype: float64

In [102]:
g7_pop > 80

Canada            True
France            True
Germany           True
Italy             True
Japan             True
United Kingdom    True
United States     True
Name: G7 Population in millions, dtype: bool

In [103]:
g7_pop[g7_pop > 80]

Canada             99.990
France             99.990
Germany            80.940
Italy              99.990
Japan             127.061
United Kingdom     99.990
United States     318.523
Name: G7 Population in millions, dtype: float64

In [104]:
g7_pop[(g7_pop > 80) | (g7_pop < 40)]

Canada             99.990
France             99.990
Germany            80.940
Italy              99.990
Japan             127.061
United Kingdom     99.990
United States     318.523
Name: G7 Population in millions, dtype: float64

In [None]:
g7_pop[(g7_pop >80) & (g7_pop < 200)]

# Modifying Series

In [None]:
g7_pop['Canada'] = 40.5

In [None]:
g7_pop

In [None]:
g7_pop.iloc[-1] = 500

In [51]:
g7_pop[g7_pop < 70] = 99.99

In [52]:
g7_pop

Canada             99.990
France             99.990
Germany            80.940
Italy              99.990
Japan             127.061
United Kingdom     99.990
United States     318.523
Name: G7 Population in millions, dtype: float64

# Data Frames

Think of a DataFrame as multiple Series, one per column, that all share the same index.

In [53]:
# Build the G7 table from a dict that maps each column name to its list of
# values (one value per country, all lists in the same country order).
# Population uses the same figures as g7_pop (millions of inhabitants).
# `columns=` pins the column order explicitly.
df = pd.DataFrame(
    {
        'Population': [35.467, 63.951, 80.940, 60.665, 127.061, 64.511, 318.523],
        'GDP': [1785387, 2833687, 3874437, 2167744, 4602367, 2950039, 17348075],
        'Surface Area': [9984670, 640679, 357114, 301336, 377930, 242495, 9525067],
        'HDI': [0.913, 0.888, 0.916, 0.873, 0.891, 0.907, 0.915],
        'Continent': ['America', 'Europe', 'Europe', 'Europe', 'Asia', 'Europe', 'America'],
    },
    columns=['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'],
)

The `columns` argument is optional. We are using it here to make the column order explicit.

In [54]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
0,35.467,1785387,9984670,0.913,America
1,63.951,2833687,640679,0.888,Europe
2,80.94,3874437,357114,0.916,Europe
3,60.665,2167744,301336,0.873,Europe
4,127.061,4602367,377930,0.891,Asia
5,64.511,2950039,242495,0.907,Europe
6,318.523,17348075,9525067,0.915,America


The DataFrame above was given the default numeric index. As with Series, we can replace it with meaningful labels:

In [55]:
# Replace the default integer index with country names (one label per row, in row order).
df.index = [
    'Canada', 'France', 'Germany', 'Italy',
    'Japan', 'United Kingdom', 'United States',
]

In [56]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [57]:
df.columns

Index(['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'], dtype='object')

In [58]:
df.index

Index(['Canada', 'France', 'Germany', 'Italy', 'Japan', 'United Kingdom',
       'United States'],
      dtype='object')

In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7 entries, Canada to United States
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Population    7 non-null      float64
 1   GDP           7 non-null      int64  
 2   Surface Area  7 non-null      int64  
 3   HDI           7 non-null      float64
 4   Continent     7 non-null      object 
dtypes: float64(2), int64(2), object(1)
memory usage: 336.0+ bytes


In [60]:
df.size

35

In [61]:
df.shape


(7, 5)

In [62]:
df.describe() # gives us summary statistics 

Unnamed: 0,Population,GDP,Surface Area,HDI
count,7.0,7.0,7.0,7.0
mean,107.302571,5080248.0,3061327.0,0.900429
std,97.24997,5494020.0,4576187.0,0.016592
min,35.467,1785387.0,242495.0,0.873
25%,62.308,2500716.0,329225.0,0.8895
50%,64.511,2950039.0,377930.0,0.907
75%,104.0005,4238402.0,5082873.0,0.914
max,318.523,17348080.0,9984670.0,0.916


In [63]:
df.dtypes

Population      float64
GDP               int64
Surface Area      int64
HDI             float64
Continent        object
dtype: object

In [64]:
df.dtypes.value_counts()

float64    2
int64      2
object     1
dtype: int64

# Indexing, Selection and Slicing

Individual columns in the DataFrame can be selected with regular indexing. Each column is represented as a Series:

- `.loc` selects rows by index label, e.g. `df.loc['Italy']`

- `.iloc` selects rows by sequential/numerical position, e.g. `df.iloc[-1]`

- `df['column']` selects a column by name, e.g. `df['Population']`

In [65]:
df['Population']

Canada             35.467
France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: Population, dtype: float64

Note that the index of the returned Series is the same as the DataFrame's, and its name is the name of the column. If you are working in a notebook and want to see a more DataFrame-like format, you can use the `to_frame` method:

In [66]:
# Select the column as a Series, then convert it to a one-column DataFrame.
df['Population'].to_frame()

Multiple Columns can also be selected similarly to numpy and Series:

In [None]:
df[['Population', 'GDP']]

In this case, the result is another DataFrame. Slicing works differently: it acts at 'row level', which can be counterintuitive.

In [None]:
# Plain [] with a slice selects ROWS by position, not columns: here rows 1 and 2
# (position 0 is the first data row, and the stop position 3 is excluded).
# Prefer .iloc / .loc so the intent is explicit.
df[1:3]

In [92]:
df.loc['Italy']

Population              60.665
GDP                    2167744
Surface Area            301336
HDI                      0.873
Continent               Europe
Language               English
GDP Per Capita    35733.025633
Name: Italy, dtype: object

In [93]:
df.iloc[2]

Population               80.94
GDP                    3874437
Surface Area            357114
HDI                      0.916
Continent               Europe
Language               English
GDP Per Capita    47868.013343
Name: Germany, dtype: object

As a second argument you can pass column(s) you want:

In [94]:
df.loc['France': 'Italy', 'Population']

France     63.951
Germany    80.940
Italy      60.665
Name: Population, dtype: float64

In [95]:
df.loc['France': 'Italy', ['Population', 'GDP']]

Unnamed: 0,Population,GDP
France,63.951,2833687
Germany,80.94,3874437
Italy,60.665,2167744


In [96]:
df.iloc[1:3, 3]

France     0.888
Germany    0.916
Name: HDI, dtype: float64

In [97]:
df.iloc[[0, 1, -1]]

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language,GDP Per Capita
Canada,35.467,1785387,9984670,0.913,America,English,50339.385908
France,63.951,2833687,640679,0.888,Europe,English,44310.284437
United States,318.523,17348075,9525067,0.915,America,English,54464.12033


In [98]:
df.iloc[1:3, [3]]

Unnamed: 0,HDI
France,0.888
Germany,0.916


In [99]:
df.iloc[1:3, 1:3]

Unnamed: 0,GDP,Surface Area
France,2833687,640679
Germany,3874437,357114


In [100]:
df.iloc[1:3, [1, 3]]

Unnamed: 0,GDP,HDI
France,2833687,0.888
Germany,3874437,0.916


# Dropping Stuff

In [None]:
# .drop does not modify df: it returns a new DataFrame without the given rows/columns,
# and the underlying data is not changed

In [None]:
df.drop('Canada')


In [None]:
df.drop(columns=['Population', 'HDI'])

In [None]:
df

# Operations 

In [67]:
df[['Population', 'GDP']] / 100 # returns a new DataFrame; the underlying data in df is not changed

Unnamed: 0,Population,GDP
Canada,0.35467,17853.87
France,0.63951,28336.87
Germany,0.8094,38744.37
Italy,0.60665,21677.44
Japan,1.27061,46023.67
United Kingdom,0.64511,29500.39
United States,3.18523,173480.75


In [68]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


Operations with Series work at a column level, broadcasting down the rows (which can be counter intuitive)

In [69]:
crisis = pd.Series([-1_000_000, -0.3], index=['GDP', 'HDI'])
crisis

GDP   -1000000.0
HDI         -0.3
dtype: float64

In [70]:
df[['GDP', 'HDI']]

Unnamed: 0,GDP,HDI
Canada,1785387,0.913
France,2833687,0.888
Germany,3874437,0.916
Italy,2167744,0.873
Japan,4602367,0.891
United Kingdom,2950039,0.907
United States,17348075,0.915


# Modifying Data Frames 

In [71]:
# Languages for a subset of the countries. The keys become the Series index,
# i.e. the row labels used to align these values with the DataFrame's rows;
# the name documents what the values represent.
langs = pd.Series(
    {'France': 'French', 'Germany': 'German', 'Italy': 'Italian'},
    name='Language',
)

In [72]:
# Assigning to a new key adds a column (much like setting a dict key).
# Values are aligned on the index: rows with no matching label in langs get a missing value.
df['Language'] = langs

In [73]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
Canada,35.467,1785387,9984670,0.913,America,
France,63.951,2833687,640679,0.888,Europe,French
Germany,80.94,3874437,357114,0.916,Europe,German
Italy,60.665,2167744,301336,0.873,Europe,Italian
Japan,127.061,4602367,377930,0.891,Asia,
United Kingdom,64.511,2950039,242495,0.907,Europe,
United States,318.523,17348075,9525067,0.915,America,


Replacing values per column

In [74]:
df['Language'] = 'English'

In [75]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
Canada,35.467,1785387,9984670,0.913,America,English
France,63.951,2833687,640679,0.888,Europe,English
Germany,80.94,3874437,357114,0.916,Europe,English
Italy,60.665,2167744,301336,0.873,Europe,English
Japan,127.061,4602367,377930,0.891,Asia,English
United Kingdom,64.511,2950039,242495,0.907,Europe,English
United States,318.523,17348075,9525067,0.915,America,English


Renaming Columns

In [76]:
 df.rename(
   columns={
       'HDI': 'Human Development Index',
       'Annual Popcorn Consumption': 'APC'
   }, index={
       'United States': 'USA',
       'United Kingdom': 'UK',
       'Argentina': 'AR'
   })


Unnamed: 0,Population,GDP,Surface Area,Human Development Index,Continent,Language
Canada,35.467,1785387,9984670,0.913,America,English
France,63.951,2833687,640679,0.888,Europe,English
Germany,80.94,3874437,357114,0.916,Europe,English
Italy,60.665,2167744,301336,0.873,Europe,English
Japan,127.061,4602367,377930,0.891,Asia,English
UK,64.511,2950039,242495,0.907,Europe,English
USA,318.523,17348075,9525067,0.915,America,English


# Operations of Columns and Creating New Columns from other Columns 

In [77]:
df['GDP']/df['Population']

Canada            50339.385908
France            44310.284437
Germany           47868.013343
Italy             35733.025633
Japan             36221.712406
United Kingdom    45729.239975
United States     54464.120330
dtype: float64

Let's make a new column and do it with the above data 

In [78]:
df['GDP Per Capita'] = df['GDP'] / df['Population']

In [79]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language,GDP Per Capita
Canada,35.467,1785387,9984670,0.913,America,English,50339.385908
France,63.951,2833687,640679,0.888,Europe,English,44310.284437
Germany,80.94,3874437,357114,0.916,Europe,English,47868.013343
Italy,60.665,2167744,301336,0.873,Europe,English,35733.025633
Japan,127.061,4602367,377930,0.891,Asia,English,36221.712406
United Kingdom,64.511,2950039,242495,0.907,Europe,English,45729.239975
United States,318.523,17348075,9525067,0.915,America,English,54464.12033


Statistical info


In [80]:
df.head()

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language,GDP Per Capita
Canada,35.467,1785387,9984670,0.913,America,English,50339.385908
France,63.951,2833687,640679,0.888,Europe,English,44310.284437
Germany,80.94,3874437,357114,0.916,Europe,English,47868.013343
Italy,60.665,2167744,301336,0.873,Europe,English,35733.025633
Japan,127.061,4602367,377930,0.891,Asia,English,36221.712406


In [81]:
df.describe()

Unnamed: 0,Population,GDP,Surface Area,HDI,GDP Per Capita
count,7.0,7.0,7.0,7.0,7.0
mean,107.302571,5080248.0,3061327.0,0.900429,44952.254576
std,97.24997,5494020.0,4576187.0,0.016592,6954.983875
min,35.467,1785387.0,242495.0,0.873,35733.025633
25%,62.308,2500716.0,329225.0,0.8895,40265.998421
50%,64.511,2950039.0,377930.0,0.907,45729.239975
75%,104.0005,4238402.0,5082873.0,0.914,49103.699626
max,318.523,17348080.0,9984670.0,0.916,54464.12033


In [82]:
population = df['Population']

In [83]:
population.min(), population.max()

(35.467, 318.523)

In [84]:
population.sum()

751.118

In [85]:
population.sum()/len(population)

107.30257142857144

In [86]:
population.mean()

107.30257142857144

In [87]:
population.std()

97.24996987121581

In [88]:
population.median()

64.511

In [89]:
population.describe()

count      7.000000
mean     107.302571
std       97.249970
min       35.467000
25%       62.308000
50%       64.511000
75%      104.000500
max      318.523000
Name: Population, dtype: float64

In [90]:
population.quantile(.25)

62.308

In [91]:
population.quantile([.2, .4, .6, .8, 1])

0.2     61.3222
0.4     64.1750
0.6     74.3684
0.8    117.8368
1.0    318.5230
Name: Population, dtype: float64