# pistachio


In [16]:
# convert arff to parquet

from scipy.io import arff 
import pandas as pd
import os

# Source ARFF file; passed to arff_to_parquet as the input to convert.
arff_filepath = './data/Pistachio_Dataset/Pistachio_16_Features_Dataset/Pistachio_16_Features_Dataset.arff'
# Parquet file the conversion writes; also the input of the train/test split.
parquet_path = './data/pistachio_16.snappy.pqt'

def arff_to_parquet(input_arff: str, output_parquet: str):
    """Load an ARFF file and store its records as a parquet file.

    The ARFF metadata is printed to stdout before the records are written.

    Args:
        input_arff: path of the ARFF file to read.
        output_parquet: path the parquet file is written to.

    Raises:
        ValueError: if ``input_arff`` does not exist.
    """
    source_found = os.path.exists(input_arff)
    if not source_found:
        raise ValueError(f"input file '{input_arff}' does not exist")

    # loadarff returns the records together with the attribute metadata
    records, attributes = arff.loadarff(input_arff)
    print("arff metadata")
    print(attributes)

    pd.DataFrame(records).to_parquet(output_parquet)
##################

# Convert only when the parquet file is not already on disk, so re-running
# this cell does not repeat the conversion.
if not os.path.exists(parquet_path):
    print("converting arff to parquet")
    arff_to_parquet(arff_filepath, parquet_path)



In [14]:
# Dataset: Pistachio_16_Features_Dataset
# 	AREA's type is numeric
# 	PERIMETER's type is numeric
# 	MAJOR_AXIS's type is numeric
# 	MINOR_AXIS's type is numeric
# 	ECCENTRICITY's type is numeric
# 	EQDIASQ's type is numeric
# 	SOLIDITY's type is numeric
# 	CONVEX_AREA's type is numeric
# 	EXTENT's type is numeric
# 	ASPECT_RATIO's type is numeric
# 	ROUNDNESS's type is numeric
# 	COMPACTNESS's type is numeric
# 	SHAPEFACTOR_1's type is numeric
# 	SHAPEFACTOR_2's type is numeric
# 	SHAPEFACTOR_3's type is numeric
# 	SHAPEFACTOR_4's type is numeric
# 	Class's type is nominal, range is ('Kirmizi_Pistachio', 'Siit_Pistachio')

## Load Data
Load the data from the parquet file and split it into stratified train and test sets.

In [None]:
from typing import List
import numpy as np
from sklearn.model_selection import train_test_split

def train_test_split(
        input_parquet: str,
        train_filename: str,
        test_filename: str,
        label_column: str,
        test_fraction: float=0.2,
        seed: int=42):
    """Split a parquet dataset into stratified train and test parquet files.

    Args:
        input_parquet: path of the parquet file to read.
        train_filename: path the training rows are written to.
        test_filename: path the test rows are written to.
        label_column: name of the column holding the class label; the split
            is stratified on this column.
        test_fraction: fraction of the rows assigned to the test set.
        seed: random state passed to the splitter so the split is repeatable.

    Raises:
        KeyError: if ``label_column`` is not a column of the input data.
    """
    # This function shares its name with scikit-learn's splitter, so that one
    # is imported under an alias here; calling ``train_test_split`` inside
    # this body would recurse into this function instead.
    from sklearn.model_selection import train_test_split as sk_train_test_split

    in_df = pd.read_parquet(input_parquet)
    if label_column not in in_df.columns:
        raise KeyError(
            f"label column '{label_column}' not found; "
            f"available columns: {list(in_df.columns)}")
    y = in_df.pop(label_column)
    # scikit-learn returns the splits grouped per input array:
    # (x_train, x_test, y_train, y_test)
    x_train, x_test, y_train, y_test = sk_train_test_split(
        in_df,
        y,
        random_state=seed,
        stratify=y,
        test_size=test_fraction)
    # reattach labels; assign builds new frames instead of modifying the
    # split results in place, and aligns the labels on the row index
    train_df = x_train.assign(**{label_column: y_train})
    test_df = x_test.assign(**{label_column: y_test})
    # write data
    train_df.to_parquet(train_filename)
    test_df.to_parquet(test_filename)
##############################

# Output files for the two parts of the split.
train_path = './data/pistachio_train.pqt'
test_path = './data/pistachio_test.pqt'
split_seed = 41
# Column names are case-sensitive; the ARFF metadata lists the label
# attribute as "Class".
label_column = 'Class'
test_fraction = 0.2

train_test_split(
    parquet_path,
    train_path,
    test_path,
    label_column=label_column,
    test_fraction=test_fraction,
    seed=split_seed)
