# Notebook 1: Manual Dataset Versioning Demo

Demonstrates manual tracking of dataset versions: each version's metadata (row count, columns, creation time, source files) is recorded in its own YAML file.


In [1]:

import pandas as pd
import yaml
import os
from datetime import datetime

## 1. Load the dataset

In [2]:

# Read the base CSV into `df`; the versioning cells below take their row count
# and column names from this frame.
main_csv_path = '../data/main.csv'
df = pd.read_csv(main_csv_path)
print("Loaded dataset shape:", df.shape)


Loaded dataset shape: (800, 7)



## 2. Manual versioning: define metadata

In [3]:
# Describe the first dataset version. The row count and column names are read
# from the loaded frame, and the timestamp is the local time at which this
# cell runs.
dataset_version = 'v1'
version_metadata = dict(
    version=dataset_version,
    description='Initial dataset with 800 reviews',
    num_rows=len(df),
    columns=[*df.columns],
    created_at=datetime.now().isoformat(),
    source=['main.csv'],
)

## 3. Save metadata to YAML file

In [4]:
# Directory that holds one YAML metadata file per dataset version.
# (Written with a single separator; the earlier '../data//dataset_versions'
# pointed at the same directory but leaked the doubled slash into every
# printed path.)
metadata_dir = '../data/dataset_versions'
os.makedirs(metadata_dir, exist_ok=True)

# The file is named after the version tag, e.g. v1.yaml.
yaml_file_path = os.path.join(metadata_dir, f'{dataset_version}.yaml')

# Explicit encoding so the file does not depend on the platform default.
with open(yaml_file_path, 'w', encoding='utf-8') as f:
    yaml.dump(version_metadata, f)

print(f'Metadata for dataset {dataset_version} saved to {yaml_file_path}')

Metadata for dataset v1 saved to ../data//dataset_versions\v1.yaml


## 4. Update the dataset (new version)

In [6]:

# Build v2 from the source files named in its metadata rather than from the
# in-memory `df`. Reassigning `df = concat([df, ...])` would append
# update1.csv again every time this cell is re-run, so `num_rows` would stop
# matching the declared sources.
main_df = pd.read_csv('../data/main.csv')
update1_df = pd.read_csv('../data/update1.csv')

df = pd.concat([main_df, update1_df], ignore_index=True)

dataset_version = 'v2'
version_metadata = {
    'version': dataset_version,
    'description': 'Added update 1 dataset to the previous version',
    'num_rows': df.shape[0],
    'columns': list(df.columns),
    'created_at': datetime.now().isoformat(),
    'source': ['main.csv', 'update1.csv']
}
yaml_file_path = os.path.join(metadata_dir, f'{dataset_version}.yaml')
# Explicit encoding so the file does not depend on the platform default.
with open(yaml_file_path, 'w', encoding='utf-8') as f:
    yaml.dump(version_metadata, f)

print(f'Metadata for dataset {dataset_version} saved to {yaml_file_path}')

# =====================
# 5. Listing all versions
# =====================
def _version_sort_key(file_name):
    """Order 'v<N>.yaml' names by N; any other name sorts after them, alphabetically."""
    stem = os.path.splitext(file_name)[0]
    number = stem[1:]
    if stem.startswith('v') and number.isdecimal():
        return (0, int(number), file_name)
    return (1, 0, file_name)

# Only .yaml files are version records; skip anything else in the directory.
yaml_files = [name for name in os.listdir(metadata_dir) if name.endswith('.yaml')]
print("All dataset versions:")
# Numeric ordering keeps v2 ahead of v10, which a plain string sort would not.
for file in sorted(yaml_files, key=_version_sort_key):
    print(file)


Metadata for dataset v2 saved to ../data//dataset_versions\v2.yaml
All dataset versions:
v1.yaml
v2.yaml
