# Intro to Pandas and Dask
### Analyse your data easily with Pandas and scale up with Dask

# Pandas basic data structure

Let's dive straight into an example and then explain the structure.

In [1]:
import pandas as pd
import numpy as np

Let's generate some random data

In [2]:
# Seed the global NumPy RNG so that "Restart Kernel -> Run All" reproduces
# exactly the same "random" values every time the notebook is executed.
SEED = 42
np.random.seed(SEED)

# 20 rows x 5 columns of uniform samples drawn from [0, 1).
data = np.random.rand(20, 5)
data

array([[0.51893997, 0.83840345, 0.6387146 , 0.3196143 , 0.22067744],
       [0.86434814, 0.74960173, 0.19139234, 0.43727312, 0.17821283],
       [0.85035615, 0.5292031 , 0.27883663, 0.6166337 , 0.68994792],
       [0.02825955, 0.36251286, 0.55611649, 0.56723501, 0.84015299],
       [0.98417172, 0.00958562, 0.85524687, 0.49128281, 0.09314218],
       [0.75503803, 0.50369838, 0.60984042, 0.91313163, 0.55979005],
       [0.75892541, 0.02724103, 0.96624756, 0.66717661, 0.9154268 ],
       [0.80198966, 0.19032395, 0.91632487, 0.65457708, 0.02756976],
       [0.86904   , 0.4670904 , 0.55004923, 0.00267931, 0.67212465],
       [0.37384259, 0.34927427, 0.08375822, 0.21203265, 0.78538367],
       [0.71592455, 0.39250945, 0.70813266, 0.79687451, 0.83931636],
       [0.73574726, 0.77937646, 0.89147135, 0.57502738, 0.56365776],
       [0.27996936, 0.07784522, 0.64587384, 0.01235639, 0.97282895],
       [0.62349376, 0.88955844, 0.31033275, 0.66394356, 0.78831121],
       [0.23624751, 0.24314482, 0.

In [3]:
# Wrap the NumPy array in a DataFrame; with no labels supplied, both the rows
# and the columns receive default integer labels.
df = pd.DataFrame(data=data)
df

Unnamed: 0,0,1,2,3,4
0,0.51894,0.838403,0.638715,0.319614,0.220677
1,0.864348,0.749602,0.191392,0.437273,0.178213
2,0.850356,0.529203,0.278837,0.616634,0.689948
3,0.02826,0.362513,0.556116,0.567235,0.840153
4,0.984172,0.009586,0.855247,0.491283,0.093142
5,0.755038,0.503698,0.60984,0.913132,0.55979
6,0.758925,0.027241,0.966248,0.667177,0.915427
7,0.80199,0.190324,0.916325,0.654577,0.02757
8,0.86904,0.46709,0.550049,0.002679,0.672125
9,0.373843,0.349274,0.083758,0.212033,0.785384


In [4]:
# Peek at the top of the frame; n=5 is what head() uses when called bare.
df.head(n=5)

Unnamed: 0,0,1,2,3,4
0,0.51894,0.838403,0.638715,0.319614,0.220677
1,0.864348,0.749602,0.191392,0.437273,0.178213
2,0.850356,0.529203,0.278837,0.616634,0.689948
3,0.02826,0.362513,0.556116,0.567235,0.840153
4,0.984172,0.009586,0.855247,0.491283,0.093142


In [5]:
# Peek at the bottom of the frame: only the last two rows.
df.tail(n=2)

Unnamed: 0,0,1,2,3,4
18,0.335183,0.988476,0.69258,0.06356,0.082149
19,0.735847,0.88217,0.553574,0.313173,0.06588


In [6]:
# Swap the default integer column labels for readable names: col0, col1, ...
# df.shape[1] is the number of columns in the frame.
df.columns = ['col{}'.format(position) for position in range(df.shape[1])]
df.head(n=5)

Unnamed: 0,col0,col1,col2,col3,col4
0,0.51894,0.838403,0.638715,0.319614,0.220677
1,0.864348,0.749602,0.191392,0.437273,0.178213
2,0.850356,0.529203,0.278837,0.616634,0.689948
3,0.02826,0.362513,0.556116,0.567235,0.840153
4,0.984172,0.009586,0.855247,0.491283,0.093142


In [7]:
# Give the index itself a name; it is displayed as a header above the row labels.
df.index.name = 'my-idx'

In [8]:
# Display the first rows again to see how the named index is rendered.
df.head(n=5)

Unnamed: 0_level_0,col0,col1,col2,col3,col4
my-idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.51894,0.838403,0.638715,0.319614,0.220677
1,0.864348,0.749602,0.191392,0.437273,0.178213
2,0.850356,0.529203,0.278837,0.616634,0.689948
3,0.02826,0.362513,0.556116,0.567235,0.840153
4,0.984172,0.009586,0.855247,0.491283,0.093142


In [9]:
# The row labels live in df.index (here a RangeIndex carrying the name 'my-idx').
df.index

RangeIndex(start=0, stop=20, step=1, name='my-idx')

In [10]:
# The column labels are stored in an Index object as well.
df.columns

Index(['col0', 'col1', 'col2', 'col3', 'col4'], dtype='object')

In [11]:
# Selecting a single column with [] returns a Series that keeps the row index.
df['col0']

my-idx
0     0.518940
1     0.864348
2     0.850356
3     0.028260
4     0.984172
5     0.755038
6     0.758925
7     0.801990
8     0.869040
9     0.373843
10    0.715925
11    0.735747
12    0.279969
13    0.623494
14    0.236248
15    0.699686
16    0.361879
17    0.441248
18    0.335183
19    0.735847
Name: col0, dtype: float64

In [12]:
# Passing a list of column names to [] selects several columns at once and
# returns a DataFrame; show only its first five rows.
df[['col0', 'col3']].head(n=5)

Unnamed: 0_level_0,col0,col3
my-idx,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.51894,0.319614
1,0.864348,0.437273
2,0.850356,0.616634
3,0.02826,0.567235
4,0.984172,0.491283


## Agenda
* The basics of Pandas
* Example: reading and analysing a CSV file
* Example: Timeseries

## Objectives
* 

## Links
### Create slides using Jupyter Notebook
https://medium.com/learning-machine-learning/present-your-data-science-projects-with-jupyter-slides-75f20735eb0f