# Demo for `mastersign.datascience.files`

In [1]:
import os
import numpy as np
import pandas as pd

Import the `mastersign.datascience.files` package.

In [2]:
from mastersign.datascience import files as io

Create demo data.

In [3]:
# Number of rows in the demo frame; every column below is generated with this length.
DEMO_ROWS = 1000

# Dedicated, seeded generator so that "Restart Kernel -> Run All" rebuilds the same data.
demo_rng = np.random.RandomState(42)

data = pd.DataFrame(
    {
        # Integer arange scaled by the 0.1 step: guarantees exactly DEMO_ROWS samples
        # (0.0, 0.1, 0.2, ...), which a fractional-step arange does not promise.
        "t": np.arange(DEMO_ROWS) * 0.1,
        "group": demo_rng.choice(['A', 'B', 'C'], DEMO_ROWS, p=[0.5, 0.2, 0.3]),
        # The scaled normal samples are negative or >= 256 most of the time. Casting such
        # floats directly to uint8 is undefined / platform dependent (and warns on recent
        # NumPy), so truncate to int64 first and wrap into 0..255 explicitly.
        "value_a": (demo_rng.normal(size=DEMO_ROWS) * 256).astype('int64') % 256,
        "value_b": demo_rng.gamma(2, size=DEMO_ROWS),
        "value_c": demo_rng.laplace(size=DEMO_ROWS),
    })
# Use non-default dtypes on purpose so the Parquet round trip below has something to preserve.
data = data.astype({'t': 'float32', 'group': 'category', 'value_a': 'uint8'})

In [4]:
# Summarize the frame: per-column non-null counts and dtypes, plus memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
t          1000 non-null float32
group      1000 non-null category
value_a    1000 non-null uint8
value_b    1000 non-null float64
value_c    1000 non-null float64
dtypes: category(1), float32(1), float64(2), uint8(1)
memory usage: 21.7 KB


In [5]:
# Preview the first rows of the demo data.
data.head()

Unnamed: 0,t,group,value_a,value_b,value_c
0,0.0,C,202,1.209308,2.226934
1,0.1,C,166,1.749777,-1.946816
2,0.2,B,140,6.080738,-0.686487
3,0.3,A,225,4.098558,-0.531737
4,0.4,A,7,0.799151,0.183037


## `write_parquet()`

Write a DataFrame into a compressed Parquet file.

In [6]:
# Persist the demo frame to 'files-demo-1.parq'; compress=True requests a compressed file.
io.write_parquet(data, 'files-demo-1.parq', compress=True)

In [7]:
# Report the on-disk size of the Parquet file in KB (bytes / 1024).
parquet_size_kb = os.path.getsize('files-demo-1.parq') / 1024.0
print(f"File size: {parquet_size_kb:.1f} KB")

File size: 20.5 KB


## `read_parquet()`

Read the data from a Parquet file.

In [8]:
# Load the complete file back into a DataFrame (all columns, default index).
df = io.read_parquet('files-demo-1.parq')

In [9]:
# The dtypes (float32, category, uint8) should match those of the frame written above.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
t          1000 non-null float32
group      1000 non-null category
value_a    1000 non-null uint8
value_b    1000 non-null float64
value_c    1000 non-null float64
dtypes: category(1), float32(1), float64(2), uint8(1)
memory usage: 21.6 KB


In [10]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,t,group,value_a,value_b,value_c
0,0.0,C,202,1.209308,2.226934
1,0.1,C,166,1.749777,-1.946816
2,0.2,B,140,6.080738,-0.686487
3,0.3,A,225,4.098558,-0.531737
4,0.4,A,7,0.799151,0.183037


Read only a subset of columns and specify an index column.

In [11]:
# Restrict the read to the listed columns and use 't' as the index instead of a data column.
selected_columns = ['t', 'group', 'value_a', 'value_c']
df = io.read_parquet('files-demo-1.parq', columns=selected_columns, index='t')

In [12]:
# 't' now appears as the index rather than as one of the data columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 1000 entries, 0.0 to 99.9000015258789
Data columns (total 3 columns):
group      1000 non-null category
value_a    1000 non-null uint8
value_c    1000 non-null float64
dtypes: category(1), float64(1), uint8(1)
memory usage: 17.6 KB


In [13]:
# Preview the first rows; the 't' values label the rows.
df.head()

Unnamed: 0_level_0,group,value_a,value_c
t,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,C,202,2.226934
0.1,C,166,-1.946816
0.2,B,140,-0.686487
0.3,A,225,-0.531737
0.4,A,7,0.183037
