In [66]:
import numpy as np
import pandas as pd
from glob import glob
import os

# Splitting Your Rows for Storage and Bringing Them Back Together For Analysis Using Pandas


## Splitting Data for Saving into Many Small Files

| Function | Description | Example |
| :-- | :-- | :-- |
| `np.array_split(df, n)` | Split data into N equalish-size sub-datasets | `np.array_split(df, 5)` |
| `pd.DataFrame.groupby(col)` | Split data into sub-datasets, each with the same value in col | `df.groupby('treatment')` |
| `for value in values:` | Begin a For-Loop block that repeats a task for each element in a collection | `for num in [10, 5, 20]: print(num)` |
| `os.makedirs(path, exist_ok=True)` | Make a new directory on the filesystem | `os.makedirs('data/raw', exist_ok=True)` |
| `enumerate(data)` | Return an iterator with the index and value of each element in a collection | `list(enumerate('abc'))  # [(0,'a'), (1,'b'), (2,'c')]` |

**Exercises**

## Concatenating Multiple Same-Structured Files into One DataFrame for Analysis

| Function | Description | Example |
| :-- | :-- | :-- |
| `glob.glob(pattern)` | Get a list of all files that match a given pattern | `glob('data/**/*.csv/')` |
| `pd.read_csv(fname)` | Return a DataFrmae from a CSV file, given the filename | `pd.read_csv('data/raw/countries_2001.csv')` |
| `pd.concat(dfs)` | Append together same-columned DataFrames along the rows | `pd.concat([df1, df2], ignore_index=True)` |
| `dask.dataframe.read_csv(glob_pattern).compute()` | Read all files that match a pattern | `dd.read_csv('data/*.csv').compute()'` |

## Concatenating DataFrames

In [22]:
url = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/exercise.csv'
df = pd.read_csv(url, index_col=0)
df.head(5)

Unnamed: 0,id,diet,pulse,time,kind
0,1,low fat,85,1 min,rest
1,1,low fat,85,15 min,rest
2,1,low fat,88,30 min,rest
3,2,low fat,90,1 min,rest
4,2,low fat,92,15 min,rest


In [70]:
!rm -Rf exercise_split
os.makedirs('exercise_split')
for idx, dd in enumerate(np.array_split(df, 5)):
    dd.to_csv('exercise_split/{}.csv'.format(idx))
    

In [65]:
!ls exercise_split/

0.csv  1.csv  2.csv  3.csv  4.csv


In [75]:
pd.concat((pd.read_csv(fname, index_col=0) for fname in glob('exercise_split/*.csv')))

Unnamed: 0,id,diet,pulse,time,kind
36,13,low fat,90,1 min,walking
37,13,low fat,92,15 min,walking
38,13,low fat,93,30 min,walking
39,14,low fat,95,1 min,walking
40,14,low fat,96,15 min,walking
...,...,...,...,...,...
13,5,low fat,92,15 min,rest
14,5,low fat,91,30 min,rest
15,6,no fat,83,1 min,rest
16,6,no fat,83,15 min,rest


In [77]:
%pip install dask[dataframe]

Collecting dask[dataframe]
  Downloading dask-2022.11.1-py3-none-any.whl (1.1 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Collecting partd>=0.3.10
  Downloading partd-1.3.0-py3-none-any.whl (18 kB)
Collecting toolz>=0.8.2
  Downloading toolz-0.12.0-py3-none-any.whl (55 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.8/55.8 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting click>=7.0
  Using cached click-8.1.3-py3-none-any.whl (96 kB)
Collecting cloudpickle>=1.1.1
  Downloading cloudpickle-2.2.0-py3-none-any.whl (25 kB)
Collecting fsspec>=0.6.0
  Downloading fsspec-2022.11.0-py3-none-any.whl (139 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.5/139.5 kB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
Collecting locket
  Downloading locket-1.0.0-py2.py3-none-any.whl (4.4 kB)
Insta

In [79]:
import dask.dataframe as dd

In [88]:
dd.read_csv('exercise_split/*.csv').set_index('Unnamed: 0').compute()

Unnamed: 0_level_0,id,diet,pulse,time,kind
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,low fat,85,1 min,rest
1,1,low fat,85,15 min,rest
2,1,low fat,88,30 min,rest
3,2,low fat,90,1 min,rest
4,2,low fat,92,15 min,rest
...,...,...,...,...,...
85,29,no fat,135,15 min,running
86,29,no fat,130,30 min,running
87,30,no fat,99,1 min,running
88,30,no fat,111,15 min,running


In [31]:
df.groupby('kind', as_index=True).pulse.mean()

kind
rest        90.833333
running    113.066667
walking     95.200000
Name: pulse, dtype: float64

In [29]:
df.groupby('kind', as_index=False).pulse.mean()

Unnamed: 0,kind,pulse
0,rest,90.833333
1,running,113.066667
2,walking,95.2


In [61]:
for group, data in df.sample(frac=1).groupby(['kind', 'time'], sort=False):
    kind, time = group

In [54]:
df.groupby('kind')

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df.groupby('kind').apply(lambda s: s)


Unnamed: 0,id,diet,pulse,time,kind
0,1,low fat,85,1 min,rest
1,1,low fat,85,15 min,rest
2,1,low fat,88,30 min,rest
3,2,low fat,90,1 min,rest
4,2,low fat,92,15 min,rest
...,...,...,...,...,...
85,29,no fat,135,15 min,running
86,29,no fat,130,30 min,running
87,30,no fat,99,1 min,running
88,30,no fat,111,15 min,running
