# Intro to Pandas and Dask
### Analyse your data easily with Pandas and scale up with Dask

# Pandas basic data structure

Let's dive straight into an example and then explain the structure.

In [1]:
import pandas as pd
import numpy as np

Let's generate some random data

In [2]:
# Seed the generator so that "Restart Kernel -> Run All" always produces the
# same 20 x 5 array of uniform samples from [0, 1).
SEED = 42
rng = np.random.default_rng(SEED)
data = rng.random((20, 5))
data

array([[0.97129131, 0.50307393, 0.14119818, 0.43026332, 0.70976477],
       [0.61622741, 0.98884973, 0.0078826 , 0.03636659, 0.83497933],
       [0.78729217, 0.14313074, 0.00256455, 0.1124318 , 0.35421452],
       [0.1408594 , 0.04421791, 0.54770027, 0.96451779, 0.27854834],
       [0.26154387, 0.64851441, 0.52417532, 0.27780796, 0.807929  ],
       [0.35373407, 0.03695687, 0.71331913, 0.1654903 , 0.04560794],
       [0.20571546, 0.03608101, 0.10754179, 0.84111057, 0.12042065],
       [0.68978398, 0.82991513, 0.73052784, 0.77391767, 0.75951341],
       [0.11026125, 0.03448242, 0.78455499, 0.62875074, 0.94672988],
       [0.82881913, 0.47781747, 0.01869456, 0.19234599, 0.28990345],
       [0.78293897, 0.36731846, 0.48848091, 0.01316397, 0.66080286],
       [0.52102303, 0.82248811, 0.52659882, 0.40377698, 0.08217342],
       [0.7668296 , 0.738436  , 0.31974461, 0.38901713, 0.12767229],
       [0.62822742, 0.55707487, 0.20662181, 0.71313354, 0.21562726],
       [0.14308055, 0.64998611, 0.

In [3]:
# Wrap the NumPy array in a DataFrame; since no labels are given, rows and
# columns get default integer labels (0, 1, 2, ...).
df = pd.DataFrame(data)
# A bare expression on the last line of a cell is displayed as the cell output.
df

Unnamed: 0,0,1,2,3,4
0,0.971291,0.503074,0.141198,0.430263,0.709765
1,0.616227,0.98885,0.007883,0.036367,0.834979
2,0.787292,0.143131,0.002565,0.112432,0.354215
3,0.140859,0.044218,0.5477,0.964518,0.278548
4,0.261544,0.648514,0.524175,0.277808,0.807929
5,0.353734,0.036957,0.713319,0.16549,0.045608
6,0.205715,0.036081,0.107542,0.841111,0.120421
7,0.689784,0.829915,0.730528,0.773918,0.759513
8,0.110261,0.034482,0.784555,0.628751,0.94673
9,0.828819,0.477817,0.018695,0.192346,0.289903


In [4]:
# Preview only the first rows (five by default) instead of the whole frame.
df.head()

Unnamed: 0,0,1,2,3,4
0,0.971291,0.503074,0.141198,0.430263,0.709765
1,0.616227,0.98885,0.007883,0.036367,0.834979
2,0.787292,0.143131,0.002565,0.112432,0.354215
3,0.140859,0.044218,0.5477,0.964518,0.278548
4,0.261544,0.648514,0.524175,0.277808,0.807929


In [5]:
# Show the last two rows of the frame.
df.tail(2)

Unnamed: 0,0,1,2,3,4
18,0.396582,0.09874,0.145766,0.338681,0.479616
19,0.400121,0.196631,0.203518,0.52788,0.437194


In [6]:
# Swap the default integer column labels for readable names:
# position 0 becomes 'col0', position 1 becomes 'col1', and so on.
df.columns = [f'col{position}' for position, _ in enumerate(df.columns)]
df.head()

Unnamed: 0,col0,col1,col2,col3,col4
0,0.971291,0.503074,0.141198,0.430263,0.709765
1,0.616227,0.98885,0.007883,0.036367,0.834979
2,0.787292,0.143131,0.002565,0.112432,0.354215
3,0.140859,0.044218,0.5477,0.964518,0.278548
4,0.261544,0.648514,0.524175,0.277808,0.807929


In [7]:
# Give the row index a name; it is shown in the frame's header from now on.
df.index.name = 'my-idx'

In [8]:
# Preview the frame again: the index name now appears above the row labels.
df.head()

Unnamed: 0_level_0,col0,col1,col2,col3,col4
my-idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.971291,0.503074,0.141198,0.430263,0.709765
1,0.616227,0.98885,0.007883,0.036367,0.834979
2,0.787292,0.143131,0.002565,0.112432,0.354215
3,0.140859,0.044218,0.5477,0.964518,0.278548
4,0.261544,0.648514,0.524175,0.277808,0.807929


In [9]:
# The row labels: an integer range that also carries the index name.
df.index

RangeIndex(start=0, stop=20, step=1, name='my-idx')

In [10]:
# The column labels are stored in an Index object as well.
df.columns

Index(['col0', 'col1', 'col2', 'col3', 'col4'], dtype='object')

In [11]:
# Selecting a single column by its label gives a one-dimensional result
# that keeps the row index and is named after the column.
df['col0']

my-idx
0     0.971291
1     0.616227
2     0.787292
3     0.140859
4     0.261544
5     0.353734
6     0.205715
7     0.689784
8     0.110261
9     0.828819
10    0.782939
11    0.521023
12    0.766830
13    0.628227
14    0.143081
15    0.405051
16    0.895820
17    0.131981
18    0.396582
19    0.400121
Name: col0, dtype: float64

In [12]:
# Passing a list of labels selects several columns at once;
# head() keeps the preview to the first rows.
df[['col0','col3']].head()

Unnamed: 0_level_0,col0,col3
my-idx,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.971291,0.430263
1,0.616227,0.036367
2,0.787292,0.112432
3,0.140859,0.964518
4,0.261544,0.277808


## Agenda
* The basics of Pandas
* Example: reading and analysing a CSV file
* Example: Time series

## Objectives
* 

## Links
### Create slides using Jupyter Notebook
https://medium.com/learning-machine-learning/present-your-data-science-projects-with-jupyter-slides-75f20735eb0f