# Intro to Dask Dataframes

In [1]:
# A Dask DataFrame is a large parallel DataFrame composed of many smaller Pandas DataFrames, split along the index

import pandas as pd
import numpy as np
import datetime
import dask.dataframe as dd

In [2]:
df = pd.DataFrame(np.random.randint(0, 100, size=(100, 4)), columns=list('ABCD'),
                  index=[datetime.datetime.today() - datetime.timedelta(days=x) for x in range(100)])

print(df.shape, "shape of df")

(100, 4) shape of df


In [3]:
df.head()

Unnamed: 0,A,B,C,D
2019-11-01 19:13:32.595931,93,45,8,13
2019-10-31 19:13:32.595942,69,67,45,83
2019-10-30 19:13:32.595944,28,42,29,16
2019-10-29 19:13:32.595945,60,44,34,88
2019-10-28 19:13:32.595946,81,83,99,51


In [4]:
ddf = dd.from_pandas(df, npartitions=5)
print(ddf.shape, "shape of dask dataframe")

(Delayed('int-ee90a118-620c-4ae1-82dc-228ab5f6c7de'), 4) shape of dask dataframe


In [5]:
ddf.to_parquet("example")  # we can see that the output is 5 parts of a parquet file
temp_ddf = dd.read_parquet("example")

In [6]:
temp_ddf

Unnamed: 0_level_0,A,B,C,D
npartitions=5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-07-25 19:13:32.596044,int64,int64,int64,int64
2019-08-14 19:13:32.596024,...,...,...,...
...,...,...,...,...
2019-10-13 19:13:32.595962,...,...,...,...
2019-11-01 19:13:32.595931,...,...,...,...


In [7]:
ddf_1 = dd.read_parquet("example/part.4.parquet")

In [8]:
df_1 = ddf_1.compute()  # calling compute returns a pandas dataframe
type(df_1)

pandas.core.frame.DataFrame

In [9]:
df_1.shape

(20, 4)

### Reading in Dataframes from S3

In [10]:
# you can also read in data directly from s3
cols = ['Year', 'Month', 'DayOfWeek', 'Distance',
        'DepDelay', 'CRSDepTime', 'UniqueCarrier', 'Origin', 'Dest']

# Create the dataframe
df = dd.read_csv('s3://dask-data/airline-data/2000.csv', usecols=cols,
                  storage_options={'anon': True})

In [11]:
df

Unnamed: 0_level_0,Year,Month,DayOfWeek,CRSDepTime,UniqueCarrier,DepDelay,Origin,Dest,Distance
npartitions=9,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
,int64,int64,int64,int64,object,float64,object,object,int64
,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...


### Map partitions

In [12]:
df = pd.DataFrame({'x': [1, 2, 3, 4, 5], 'y': [1., 2., 3., 4., 5.]})
ddf = dd.from_pandas(df, npartitions=2)
def myadd(df, a, b=1):
    return df.x + df.y + a + b
res = ddf.map_partitions(myadd, 1, b=2)
res.dtype

dtype('float64')

In [13]:
res.compute()

0     5.0
1     7.0
2     9.0
3    11.0
4    13.0
dtype: float64

In [14]:
ddf

Unnamed: 0_level_0,x,y
npartitions=2,Unnamed: 1_level_1,Unnamed: 2_level_1
0,int64,float64
3,...,...
4,...,...
