# Objective

```
DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})
```

# Import dependencies

In [8]:
from datasets import load_dataset, DatasetDict, Dataset

import glob
import os
import pandas as pd
import numpy as np

# Load data

In [2]:
data_dir = "data/recipes-20240113T000021Z-001/recipes/"

for fname in glob.glob(os.path.join(data_dir, "*.csv")):
    print(fname)

data/recipes-20240113T000021Z-001/recipes/recipe-2023-07-01.csv
data/recipes-20240113T000021Z-001/recipes/recipe-2023-09-23.csv
data/recipes-20240113T000021Z-001/recipes/recipe-2023-12-03.csv
data/recipes-20240113T000021Z-001/recipes/recipe-2023-08-26.csv
data/recipes-20240113T000021Z-001/recipes/recipe-2023-02-12.csv
data/recipes-20240113T000021Z-001/recipes/recipe-2023-06-03.csv
data/recipes-20240113T000021Z-001/recipes/recipe-2023-11-06.csv
data/recipes-20240113T000021Z-001/recipes/recipe-2023-09-25.csv
data/recipes-20240113T000021Z-001/recipes/recipe-2024-01-07.csv
data/recipes-20240113T000021Z-001/recipes/2023-10-09.csv
data/recipes-20240113T000021Z-001/recipes/recipe-2023-11-10.csv
data/recipes-20240113T000021Z-001/recipes/recipe-2023-09-18.csv
data/recipes-20240113T000021Z-001/recipes/recipe-2023-07-31.csv
data/recipes-20240113T000021Z-001/recipes/recipe-2023-07-11.csv
data/recipes-20240113T000021Z-001/recipes/recipe-2023-04-07.csv
data/recipes-20240113T000021Z-001/recipes/recip

In [3]:
sample_data = "data/recipes-20240113T000021Z-001/recipes/recipe-2024-01-07.csv"
df = pd.read_csv(sample_data)

In [4]:
df

Unnamed: 0,Day,Recipe,Unnamed: 3
0,mon,pasta all'amatriciana,
1,tue,fish and rice,
2,wed,asparagus pasta,
3,thu,rice and beans,
4,,,
5,Ingredient,Quantity,
6,pancetta,8.00 oz,
7,spaghetti,2.00 lb,
8,sauce,30.00 oz,
9,tuna_fresh,12.00 oz,


# Note 

The recipe table should really look like this: Day,Recipe, Ingredient, Quantity

In [57]:
data_dir = "data/recipes-20240113T000021Z-001/recipes/"

input_text = []
output_labels = []
for fname in glob.glob(os.path.join(data_dir, "*.csv")):
    with open(fname) as fp:
        data = fp.read()

        if "tue" in data or "wed" in data or "thu" in data:
            label = 0
        else:
            label = 1
        
        input_text.append(data)
        output_labels.append(label)

input_text = np.array(input_text)
output_labels = np.array(output_labels)

In [58]:
input_text

array(['Day,Recipe, \nmon,pepper pasta\ntue,panini paffuti\nsat,quesadilla fiesta\nsun,mozzarella pasta\n,\nIngredient,Quantity\npenne,1.00 lb\nsauce,30.00 oz\nprosciutto,8.00 oz\nspinach,8.00 oz\nbeef_sirloin,12.00 oz\ncheese,12.00 oz\npinto_bean,12.00 oz\nsoured_cream,12.00 oz\nspaghetti,1.00 lb\nolive,8.00 oz\nmozzarella_cheese,12.00 oz\nbasil,4.00 oz\npepper,2 items\nonion,2 items\nbun,4 items\ntomato,10 items\nprovolone,4 items\ntortilla_flour,8 items\nsalsa,1 items\navocado,3 items\nseasoning,1 items\n',
       'Day,Recipe, \nmon,tuna pasta \ntue,pollo e patate\nsat,bagels\nsun,mushroom pasta\n,\nIngredient,Quantity\ntuna_tinned,10.00 oz\nbasil,3.00 oz\nlinguine,1.00 lb\nolive,12.00 oz\nchicken,1.00 lb\nbreadcrumbs,1.00 lb\nmushroom,8.00 oz\nfettuccine,1.00 lb\nsauce,30.00 oz\ntomato,5 items\npotato,5 items\noil,1 items\negg,4 items\nbagel,4 items\ncream_cheese,1 items\nsmoked_salmon,1 items\nham,1 items\ncheddar_cheese,4 items\nonion,1 items\n',
       'Day,Recipe, \nmon,pasta c

In [59]:
neg_count, pos_count = np.bincount(output_labels)

In [60]:
neg_count / (neg_count+pos_count)

0.6071428571428571

In [61]:
assert len(output_labels) == len(input_text)

In [62]:
output_labels

array([0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1])

In [63]:
np.random.seed(0)
idx = np.arange(len(output_labels))
np.random.shuffle(idx)

output_labels = output_labels[idx]
input_text = input_text[idx]

In [64]:
output_labels

array([1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1])

In [65]:
num_training = len(output_labels) // 2

dataset = DatasetDict({
    "train": Dataset.from_dict({
            "label":output_labels[0:num_training],
            "text":input_text[0:num_training]}
        ),
    "validation": Dataset.from_dict({
            "label":output_labels[num_training:],
            "text":input_text[num_training:]}
        )
})

In [66]:
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 28
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 28
    })
})

In [67]:
dataset.save_to_disk("recipe-classification-dataset")

Saving the dataset (0/1 shards):   0%|          | 0/28 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/28 [00:00<?, ? examples/s]