**This is a markdown cell**

![image](lewagon.png)

In [None]:
!ls

# Data Analytics Lecture

## Jupyter Notebooks 101

## Markdown
### Sub-subtitle

This is plain text
**bold**
*italic*
- a
- b
- c

<button> Test </button>

## Code

In [None]:
2 + 2

In [None]:
print('Hello World')

In [None]:
a = 123

## Numpy

In [None]:
import numpy as np # Canonical import

In [None]:
# A plain Python nested list: two inner lists of three integers each
l = [[1,2,3], [4,5,6]]
print(type(l))
l # list of lists

In [None]:
A = np.array(l) 
print(type(A))
A # ndarray

### Array Attributes

In [None]:
A.shape

In [None]:
A.dtype

In [None]:
A.ndim

In [None]:
A.size

### Data selection 😎

In [None]:
# Let's build a 2D array from a list of lists.
# Each value encodes its position as (row * 10 + column), which makes selections easy to read.
data_list = [
    [ 0,  1,  2,  3,  4],
    [10, 11, 12, 13, 14],
    [20, 21, 22, 23, 24],
    [30, 31, 32, 33, 34],
    [40, 41, 42, 43, 44],
]

data_np = np.array(data_list)
data_np

In [None]:
# Pure python
data_list[2][1:4]

In [None]:
# NumPy
data_np[2, 1:4] # data_np[row(s), column(s)]

In [None]:
# Pure python
# Skip the first row (index 0), then collect the value in column 4 of every remaining row
selection = []
for row in data_list[1:]:
    selection.append(row[4])
selection # We could also have used a list comprehension for fewer lines

In [None]:
# NumPy
data_np[1:, 4] # `1:` means from row 1 until the end; `4` picks column 4.

In [None]:
A = np.arange(0, 10)
A

In [None]:
A[1:7:2]

### Vectorized operations ⚡️

In [None]:
my_list = [
    [6, 5],
    [1, 3],
    [5, 6],
    [1, 4],
    [3, 7],
    [5, 8],
    [3, 5],
    [8, 4],
]

In [None]:
# Python way
sums = []
for row in my_list:
    sums.append(row[0] + row[1]) # standard integer "+" operator
sums

In [None]:
A = np.array(my_list)
my_sum = A[:,0] + A[:,1] # Vectorized "+" operator: adds the two columns element-wise
my_sum

In [None]:
np.sum(A, axis=0)  # eq. to A[0,:] + A[1,:] + A[2,:] + ...

In [None]:
np.sum(A, axis=1)  # eq. to A[:,0] + A[:,1] + A[:,2] + ...

### How much faster is NumPy? ⚡️

In [None]:
# 2D ndarray of shape (10000, 10000) with random floats in the half-open interval [0, 1). That's 100M numbers!
# NOTE: 100M float64 values take about 800 MB as an ndarray; the nested Python list built by
# `tolist()` needs considerably more memory on top of that.
# NOTE: `a` rebinds the name assigned earlier in the notebook (`a = 123`).
A = np.random.rand(10000, 10000)
a = A.tolist()

In [None]:
%%time
total = 0
for row in a:
    for n in row:
        total += n
round(total, 2)

In [None]:
%%time
round(np.sum(A), 2)

### Boolean Indexing  🔥

In [None]:
A = np.array(my_list)
A

In [None]:
A > 4

In [None]:
A[A > 4]

In [None]:
A[A > 4] = 100

In [None]:
A

## Pandas

In [None]:
import pandas as pd # canonical import

### Pandas Series

In [None]:
#s = pd.Series(data=[1,2,'three'], index=['id1', 'id2', 'id3']) # one way to create a Pandas Series
s = pd.Series({'id1': 1, 'id2': 2, 'id3': 'three'}) # another way to create the same series
s

### Pandas DataFrames

In [None]:
# Small 3x3 frame built row by row, with explicit row and column labels
df = pd.DataFrame(
    data=[
        [4, 7, 10],
        [5, 8, 11],
        [6, 9, 12],
    ],
    columns=['col_a', 'col_b', 'col_c'],
    index=['row_1', 'row_2', 'row_3'],
)
df

In [None]:
# Two Series carrying the same labels; collected in a dict keyed by the future column name
apples = pd.Series([1, 2, 3], index=['id1', 'id2', 'id3'])
oranges = pd.Series([4, 5, 6], index=['id1', 'id2', 'id3'])
d = dict(apples=apples, oranges=oranges)

In [None]:
apples

In [None]:
oranges

In [None]:
pd.DataFrame(d)

### Reading (loading) the Data

In [None]:
countries_df = pd.read_csv('countries.csv', decimal=',')

### Exploring the Data

In [None]:
countries_df

In [None]:
countries_df.shape

In [None]:
countries_df.dtypes

In [None]:
countries_df.info()

In [None]:
countries_df.describe()

In [None]:
countries_df.isnull().sum()

In [None]:
countries_df.head()

In [None]:
countries_df.tail(3)

In [None]:
countries_df['Country']

In [None]:
type(countries_df['Country'])

In [None]:
countries_df[['Country']]

In [None]:
type(countries_df[['Country']])

In [None]:
countries_df[['Country', 'Population']]

In [None]:
countries_df.loc[0:3, ["Country", "Region"]] # From row label 0 to 3 (included: `.loc` slices are label-based and inclusive)

### Boolean Indexing

🤔 What are the countries with more than one billion inhabitants?

In [None]:
# Row-by-row approach: iterrows() yields (index label, row as a Series) pairs
big_countries = []
for index, country in countries_df.iterrows():
    if country['Population'] >= 1_000_000_000:  # keep rows with at least one billion inhabitants
        big_countries.append(country)
# Rebuild a DataFrame from the collected row Series
pd.DataFrame(big_countries)

In [None]:
countries_df[(countries_df["Population"] >= 1_000_000_000) & (countries_df['Climate'] > 2)]

🤔 What are the countries of the American continent?

In [None]:
# Boolean mask: True where the region name contains the substring 'AMER'.
# na=False makes rows with a missing Region count as "no match"; without it the
# mask would hold NaN for those rows and could not be used for boolean indexing.
american = countries_df['Region'].str.contains('AMER', na=False)
countries_df[american]

🤔 What are the countries of Europe?

In [None]:
countries_df[countries_df["Region"].isin(["WESTERN EUROPE", "EASTERN EUROPE"])]

In [None]:
countries_df["Region"].unique()

In [None]:
countries_df["Region"] = countries_df["Region"].str.strip()

In [None]:
countries_df["Region"].unique()

In [None]:
countries_df[countries_df["Region"].isin(["WESTERN EUROPE", "EASTERN EUROPE"])]

**NOT** operator: `~`

In [None]:
countries_df[~countries_df["Region"].isin(["WESTERN EUROPE", "EASTERN EUROPE"])]

## Pandas - Re-indexing

In [None]:
# Use the (whitespace-stripped) country name as the row index.
# The guard makes the cell safe to re-run: once 'Country' is the index it is no
# longer a column, and a second run would otherwise raise a KeyError.
if countries_df.index.name != 'Country':
    # `.str.strip()` leaves missing values as NaN, whereas `map(str.strip)` raises on them
    countries_df['Country'] = countries_df['Country'].str.strip()
    countries_df = countries_df.set_index('Country')

In [None]:
countries_df.head()

In [None]:
countries_df.loc['France':'Germany', ['Region', 'Population']]

### Sorting

In [None]:
countries_df.sort_index(ascending=False)

In [None]:
countries_df.sort_values(by='Population', ascending=False)

In [None]:
countries_df.sort_values(by='GDP ($ per capita)', na_position='first')

### Pandas - Grouping

🤔 Which region of the world is the most populated?

In [None]:
regions = countries_df.groupby('Region')
regions

In [None]:
regions.sum()

In [None]:
regions[['Population', 'Area (sq. mi.)']].sum().sort_values('Population', ascending=False)

### Plotting some graphs

In [None]:
gdp = 'GDP ($ per capita)'

# Keep only the GDP column, rank the richest countries first, then take the first ten rows
top_ten_countries_df = (
    countries_df[[gdp]]
    .sort_values(by=gdp, ascending=False)
    .head(10)
)

top_ten_countries_df

In [None]:
# Give the chart a title and a y-axis label so it can be read on its own.
# `DataFrame.plot` returns the matplotlib Axes; the trailing ';' keeps its repr out of the output.
ax = top_ten_countries_df.plot(kind='bar', title='Top 10 countries by GDP per capita')
ax.set_ylabel('GDP ($ per capita)');