In [None]:
#| default_exp data

In [None]:
#| hide
from nbdev.showdoc import *
from fastcore.test import *

In [None]:
#| export
from fastcore.utils import *
import pandas as pd
from pandas import DataFrame
import numpy as np
import pyreadstat
from pathlib import Path
from typing import List, Tuple

Define functions for reading an SPSS file, converting its metadata into a DataFrame, and saving the data and metadata to parquet files.

In [None]:
#| export
def reformat_metadata(meta: pyreadstat.metadata_container
                      ) -> DataFrame:
    "Collect selected pyreadstat metadata mappings into one DataFrame with a row per metadata field."
    # 'align' and 'role' are left out because pyreadstat does not provide them.
    # Each keyword below becomes a row label in the result; the mapped value is
    # the mapping read from the corresponding attribute of the metadata container.
    fields = dict(
        Label=meta.column_names_to_labels,
        Values=meta.variable_value_labels,
        Type=meta.original_variable_types,
        Width=meta.variable_display_width,
        Measure=meta.variable_measure,
    )
    # Building the frame puts the field names on the columns; transposing moves
    # them onto the index, leaving one column per key found in the mappings
    # (presumably the variable names -- confirm against pyreadstat's docs).
    return DataFrame(data=fields).transpose()

In [None]:
#| hide
file = "../data/G227_Q.sav"
# Only the metadata half of pyreadstat's (data, metadata) result is needed for this check.
_, raw_meta = pyreadstat.read_sav(file)
meta = reformat_metadata(raw_meta)

# The result should be a DataFrame whose rows are the five metadata fields, in this order.
test_eq(type(meta), DataFrame)
test_eq(list(meta.index), ['Label', 'Values', 'Type', 'Width', 'Measure'])

In [None]:
#| export
def read_sav(file: str, # Path to SPSS file
            ) -> Tuple[DataFrame, DataFrame]: # Output df and meta as dataframes
    "Read an SPSS file with `pyreadstat.read_sav()` and return its data plus the metadata reformatted as a DataFrame."
    data, raw_meta = pyreadstat.read_sav(file)
    # Swap pyreadstat's metadata object for its DataFrame form before handing it back.
    return data, reformat_metadata(raw_meta)

In [None]:
#| hide
# Exercise the wrapper on the sample file path defined in an earlier cell.
df, meta = read_sav(file)

In [None]:
#| export
def sav_to_parquet(df: DataFrame, # Data to save
                   meta: DataFrame, # Metadata to save (cast to strings before writing)
                   filename: str, # Basename for saving files (ie. for G208_Q.sav, filename="G208_Q")
                   dir: str # Directory to save output
                   ) -> None:
    "Write `df` and `meta` to `<filename>_df.parquet` and `<filename>_meta.parquet` inside `dir`."
    # Cast every metadata cell to a string first so the frame saves cleanly as parquet.
    meta_as_text = meta.astype(str)
    out_dir = Path(dir)
    df.to_parquet(out_dir / f"{filename}_df.parquet")
    meta_as_text.to_parquet(out_dir / f"{filename}_meta.parquet")

In [None]:
#| hide
# Load the sample file, then write the parquet copies that the round-trip
# checks in the following cells read back. Without this call nothing in the
# notebook produces those files, so a fresh Restart & Run All would depend on
# files left over from an earlier session.
df, meta = read_sav(file)
sav_to_parquet(df, meta, "G227_Q", "../data")

Verify that there is no loss or corruption of data in the conversion process.

In [None]:
# Read the parquet copy of the data back in and check it equals the frame loaded from the .sav file.
df_pq = pd.read_parquet("../data/G227_Q_df.parquet")
test_eq(df, df_pq)

In [None]:
# `sav_to_parquet` casts the metadata to strings before writing, so apply the same cast to the
# in-memory metadata before comparing it with what is read back.
meta_pq = pd.read_parquet("../data/G227_Q_meta.parquet")
test_eq(meta.astype(str), meta_pq)

In [None]:
#| export
class Dataset:
    "Container that keeps a data table together with the metadata table describing it."
    def __init__(self,
                 df: DataFrame, # the actual raw data
                 meta: DataFrame): # the metadata, including variable labels, value labels, and types for each variable
        # Both frames are stored as given; no copy or validation is performed.
        self.df = df
        self.meta = meta