In [2]:
import pandas as pd
import numpy as np

In [7]:
# in millions
g7_pop = pd.Series([35.467, 63.951, 80.940, 60.665, 127.061, 64.511, 318.523])
# Series is a way of indexing a list
# Pandas stores this in numpy array

In [8]:
g7_pop

0     35.467
1     63.951
2     80.940
3     60.665
4    127.061
5     64.511
6    318.523
dtype: float64

Someone might not know we're representing population in millions
of inhabitants. Series can have a name, to better document the 
purpose of the Series.

In [9]:
g7_pop.name = 'G7 Population in millions'

In [10]:
g7_pop

0     35.467
1     63.951
2     80.940
3     60.665
4    127.061
5     64.511
6    318.523
Name: G7 Population in millions, dtype: float64

Series are pretty similar to numpy arrays:

In [11]:
g7_pop.values

array([ 35.467,  63.951,  80.94 ,  60.665, 127.061,  64.511, 318.523])

They are back by numpy arrays

In [12]:
type(g7_pop.values)

numpy.ndarray

And they look like simple Python lists or Numpy Arrays. But the're actually more similar to Python dicts
A Series has an index, that's similar to the automatic index assigned to Python's Lists:

In [13]:
g7_pop[0]

35.467

In [14]:
g7_pop[1]

63.951

In [15]:
g7_pop.index

RangeIndex(start=0, stop=7, step=1)

But, in contrast to lists, we can explicitly define the index:

In [16]:
g7_pop.index = [
    'Canada',
    'France',
    'Germany',
    'Italy',
    'Japan',
    'United Kingdom',
    'United States',
]

In [17]:
g7_pop

Canada             35.467
France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: G7 Population in millions, dtype: float64

Looks like a dict but dict are not ordered
Series are ordered but look like dict

# Indexing

Indexing works similarly to lists and dictionaries, you use the index of the element you're looking for


In [18]:
g7_pop['Canada']

35.467

In [19]:
g7_pop['Japan']

127.061

Numeric position can also be used, with the iloc attribute:

In [20]:
g7_pop.iloc[0]

35.467

In [21]:
g7_pop[-1]

318.523

In [22]:
g7_pop[['Italy', 'France']]

Italy     60.665
France    63.951
Name: G7 Population in millions, dtype: float64

In [23]:
g7_pop.iloc[[0,1]] # slicing is not like python in that it includes [0:2] index 2

Canada    35.467
France    63.951
Name: G7 Population in millions, dtype: float64

# Conditional Selection (boolean arrays)

The same boolean array techniques we saw applied to numpy arrays can be used for Pandas Series:

In [None]:
g7_pop

In [26]:
g7_pop > 70

Canada            False
France            False
Germany            True
Italy             False
Japan              True
United Kingdom    False
United States      True
Name: G7 Population in millions, dtype: bool

In [27]:
g7_pop[g7_pop > 70]

Germany           80.940
Japan            127.061
United States    318.523
Name: G7 Population in millions, dtype: float64

In [28]:
g7_pop.mean()

107.30257142857144

In [29]:
g7_pop[g7_pop > g7_pop.mean()]

Japan            127.061
United States    318.523
Name: G7 Population in millions, dtype: float64

In [30]:
g7_pop.std()

97.24996987121581

In [None]:
 ~ not
 | or
 & and

In [24]:
g7_pop[(g7_pop > g7_pop.mean() - g7_pop.std() / 2) | (g7_pop > g7_pop.mean() + g7_pop.std() / 2)]

France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: G7 Population in millions, dtype: float64

# Operations and methods

Series also support vectorized operations and aggregation functions as Numpy:

In [25]:
g7_pop * 1_000_000

Canada             35467000.0
France             63951000.0
Germany            80940000.0
Italy              60665000.0
Japan             127061000.0
United Kingdom     64511000.0
United States     318523000.0
Name: G7 Population in millions, dtype: float64

In [None]:
g7_pop.mean()

In [None]:
np.log(g7_pop)

In [None]:
g7_pop['France',: 'Italy'].mean()

# Boolean Arrays

Work same as numpy

In [31]:
g7_pop

Canada             35.467
France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: G7 Population in millions, dtype: float64

In [32]:
g7_pop > 80

Canada            False
France            False
Germany            True
Italy             False
Japan              True
United Kingdom    False
United States      True
Name: G7 Population in millions, dtype: bool

In [33]:
g7_pop[g7_pop > 80]

Germany           80.940
Japan            127.061
United States    318.523
Name: G7 Population in millions, dtype: float64

In [34]:
g7_pop[(g7_pop > 80) | (g7_pop < 40)]

Canada            35.467
Germany           80.940
Japan            127.061
United States    318.523
Name: G7 Population in millions, dtype: float64

In [35]:
g7_pop[(g7_pop >80) & (g7_pop < 200)]

Germany     80.940
Japan      127.061
Name: G7 Population in millions, dtype: float64

# Modifying Series

In [36]:
g7_pop['Canada'] = 40.5

In [37]:
g7_pop

Canada             40.500
France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: G7 Population in millions, dtype: float64

In [38]:
g7_pop.iloc[-1] = 500

In [39]:
g7_pop[g7_pop < 70] = 99.99

In [40]:
g7_pop

Canada             99.990
France             99.990
Germany            80.940
Italy              99.990
Japan             127.061
United Kingdom     99.990
United States     500.000
Name: G7 Population in millions, dtype: float64

# Data Frames

Think of data frames as multiple series

In [43]:
df = pd.DataFrame({
    'Population': [35.467, 63.951, 80.940, 60.665, 127.061, 64.511, 318.523],
    'GDP': [
        1785387,
        2833687,
        3874437,
        2167744,
        4602367,
        2950039,
        17348075,
    ],
    'Surface Area': [
        9984670,
        640679,
        357114,
        301336,
        377930,
        242495,
        9525067,
    ],
    'HDI': [
        0.913,
        0.888,
        0.916,
        0.873,
        0.891,
        0.907,
        0.915,
    ],
    'Continent': [
        'America',
        'Europe',
        'Europe',
        'Europe',
        'Asia',
        'Europe',
        'America',
    ]
}, columns=['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'])

The columns attribute is optional. We are using it to keep the same order

In [44]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
0,35.467,1785387,9984670,0.913,America
1,63.951,2833687,640679,0.888,Europe
2,80.94,3874437,357114,0.916,Europe
3,60.665,2167744,301336,0.873,Europe
4,127.061,4602367,377930,0.891,Asia
5,64.511,2950039,242495,0.907,Europe
6,318.523,17348075,9525067,0.915,America


Above data frame was given standard numerical index 

In [46]:
df.index = [
    'Canada',
    'France',
    'Germany',
    'Italy',
    'Japan',
    'United Kingdom',
    'United States',
]

In [47]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [None]:
df.columns

In [None]:
df.index

In [None]:
df.info()

In [None]:
df.size

In [None]:
df.shape

In [None]:
df.describe() # gives us summary statistics 

In [None]:
df.dtypes

In [None]:
df.dtypes.value_counts()

# Indexing, Selection and Slicing

Individual columns in the DataFrame can be selected with regular indexing. Each column is represented as a Series:

.loc # allows you to select rows by index df.loc['whatever']

.iloc # allows you to select rows by sequential/numerical position df.iloc[-1]

select column by df['whatever']

In [49]:
df['Population']

Canada             35.467
France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: Population, dtype: float64

Note that the indes of the returned Series is the same as the DataFrame one. And its name is the name of the column. If you are working on a notebook and want to see a more DataFrame-like format you can use the to_frame method:

In [79]:
df.[['Population'].to_frame()]

SyntaxError: invalid syntax (1268493582.py, line 1)

Multiple Columns can also be selected similarly to numpy and Series:

In [51]:
df[['Population', 'GDP']]

Unnamed: 0,Population,GDP
Canada,35.467,1785387
France,63.951,2833687
Germany,80.94,3874437
Italy,60.665,2167744
Japan,127.061,4602367
United Kingdom,64.511,2950039
United States,318.523,17348075


In this case, the result is another DataFrame. Slicing works differelty, it acts at 'row level', and can be counter intuitive

In [61]:
df[1:3] # 0 is the column header, weird don't use regular slicing use loc or iloc

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe


In [62]:
df.loc['Italy']

Population       60.665
GDP             2167744
Surface Area     301336
HDI               0.873
Continent        Europe
Name: Italy, dtype: object

In [66]:
df.iloc[2]

Population        80.94
GDP             3874437
Surface Area     357114
HDI               0.916
Continent        Europe
Name: Germany, dtype: object

As a second argument you can pass column(s) you want:

In [70]:
df.loc['France': 'Italy', 'Population']

France     63.951
Germany    80.940
Italy      60.665
Name: Population, dtype: float64

In [67]:
df.loc['France': 'Italy', ['Population', 'GDP']]

Unnamed: 0,Population,GDP
France,63.951,2833687
Germany,80.94,3874437
Italy,60.665,2167744


In [71]:
df.iloc[1:3, 3]

France     0.888
Germany    0.916
Name: HDI, dtype: float64

In [78]:
df.iloc[[0, 1, -1]]

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
United States,318.523,17348075,9525067,0.915,America


In [73]:
df.iloc[1:3, [3]]

Unnamed: 0,HDI
France,0.888
Germany,0.916


In [74]:
df.iloc[1:3, 1:3]

Unnamed: 0,GDP,Surface Area
France,2833687,640679
Germany,3874437,357114


In [76]:
df.iloc[1:3, [1, 3]]

Unnamed: 0,GDP,HDI
France,2833687,0.888
Germany,3874437,0.916
