In [1]:
!pip install kaggle



Before running the next cell, make sure your Kaggle API token file `kaggle.json` is in your `~/.kaggle` directory; the Kaggle client needs it to authenticate.

In [None]:
import os
import kaggle
from datasets import load_dataset
import zipfile

def create_directory(lab_name):
    """Make sure the directory ``lab_name`` exists.

    Missing parent directories are created as well, and calling this on a
    directory that is already present is a no-op.

    Args:
        lab_name: Path of the directory to create.
    """
    os.makedirs(name=lab_name, exist_ok=True)

def is_file_downloaded(file_path):
    """Report whether something already exists at ``file_path``.

    Args:
        file_path: Path of the expected download.

    Returns:
        True when the path exists on disk, False otherwise.
    """
    already_present = os.path.exists(file_path)
    return already_present

def extract_zip(file_path, extract_to):
    """Unpack every member of a zip archive into a target directory.

    Args:
        file_path: Path of the zip archive to read.
        extract_to: Directory that receives the extracted members.
    """
    with zipfile.ZipFile(file_path, mode='r') as archive:
        archive.extractall(path=extract_to)

def download_kaggle_competition(competition_name, lab_name):
    """Download a Kaggle competition archive into ``lab_name`` and extract it.

    The archive is expected at ``<lab_name>/<competition_name>.zip``. If a
    valid zip is already there the download is skipped; extraction runs on
    every call either way.

    Args:
        competition_name: Kaggle competition slug, e.g. ``'titanic'``.
        lab_name: Directory that receives the archive and its contents.

    Raises:
        FileNotFoundError: If the expected archive is missing after the
            download call returns.
    """
    create_directory(lab_name)
    file_path = os.path.join(lab_name, f"{competition_name}.zip")

    # An interrupted download can leave a truncated file behind. It exists on
    # disk but is not a readable zip, so the existence check alone would skip
    # the download and extraction would then fail on every run.
    if os.path.isfile(file_path) and not zipfile.is_zipfile(file_path):
        print(f"Removing incomplete archive {file_path}")
        os.remove(file_path)

    if not is_file_downloaded(file_path):
        kaggle.api.competition_download_files(competition_name, path=lab_name)
        # Only report success once the archive is really where we expect it.
        if not os.path.isfile(file_path):
            raise FileNotFoundError(
                f"Expected archive {file_path} was not created by the "
                f"download of {competition_name}"
            )
        print(f"Downloaded {competition_name} to {lab_name}")
    else:
        print(f"{competition_name} already downloaded.")

    extract_zip(file_path, lab_name)

def download_kaggle_dataset(dataset_name, lab_name):
    """Download a Kaggle dataset archive into ``lab_name`` and extract it.

    The archive is expected at ``<lab_name>/<slug>.zip`` where ``slug`` is the
    part of ``dataset_name`` after the last ``/``. If a valid zip is already
    there the download is skipped; extraction runs on every call either way.

    Args:
        dataset_name: Kaggle dataset identifier in ``owner/slug`` form.
        lab_name: Directory that receives the archive and its contents.

    Raises:
        FileNotFoundError: If the expected archive is missing after the
            download call returns.
    """
    create_directory(lab_name)
    dataset_slug = dataset_name.split('/')[-1]
    file_path = os.path.join(lab_name, f"{dataset_slug}.zip")

    # An interrupted download can leave a truncated file behind. It exists on
    # disk but is not a readable zip, so the existence check alone would skip
    # the download and extraction would then fail on every run.
    if os.path.isfile(file_path) and not zipfile.is_zipfile(file_path):
        print(f"Removing incomplete archive {file_path}")
        os.remove(file_path)

    if not is_file_downloaded(file_path):
        kaggle.api.dataset_download_files(dataset_name, path=lab_name)
        # Only report success once the archive is really where we expect it.
        if not os.path.isfile(file_path):
            raise FileNotFoundError(
                f"Expected archive {file_path} was not created by the "
                f"download of {dataset_name}"
            )
        print(f"Downloaded {dataset_name} to {lab_name}")
    else:
        print(f"{dataset_name} already downloaded.")

    extract_zip(file_path, lab_name)

def download_huggingface_dataset(dataset_name, config, lab_name):
    """Fetch a Hugging Face dataset and store it under ``lab_name``.

    The dataset is saved to ``<lab_name>/<short name>-dataset``, where the
    short name is the part of ``dataset_name`` after the last ``/``. Nothing
    is downloaded when that directory already exists.

    Args:
        dataset_name: Dataset identifier passed to ``load_dataset``.
        config: Configuration name passed to ``load_dataset``.
        lab_name: Directory that receives the saved dataset.
    """
    create_directory(lab_name)
    short_name = dataset_name.split('/')[-1]
    target_dir = os.path.join(lab_name, f"{short_name}-dataset")

    # Guard clause: an existing target directory counts as already fetched.
    if os.path.exists(target_dir):
        print(f"{dataset_name} already downloaded.")
        return

    loaded = load_dataset(dataset_name, config)
    loaded.save_to_disk(target_dir)
    print(f"Downloaded {dataset_name} to {target_dir}")

# Kaggle competitions as (competition slug, lab directory) pairs
for competition_slug, competition_lab in (
    ('home-data-for-ml-course', 'Lab06a'),
    ('titanic', 'Lab06b'),
    ('digit-recognizer', 'Lab08'),
):
    download_kaggle_competition(competition_slug, competition_lab)

# Kaggle datasets as (owner/slug, lab directory) pairs
for dataset_id, dataset_lab in (
    ('lakshmi25npathi/imdb-dataset-of-50k-movie-reviews', 'Lab09a'),
    ('shoumikgoswami/annotated-gmb-corpus', 'Lab09b'),
    ('tuannguyenvananh/iwslt15-englishvietnamese', 'Lab10a'),
    ('gowrishankarp/newspaper-text-summarization-cnn-dailymail', 'Lab10b'),
):
    download_kaggle_dataset(dataset_id, dataset_lab)

# Hugging Face dataset (identifier, configuration, lab directory)
download_huggingface_dataset(
    "Salesforce/wikitext",
    "wikitext-103-raw-v1",
    "Lab11",
)

print("All datasets have been downloaded and extracted to their respective directories.")


Downloaded home-data-for-ml-course to Lab06a
Downloaded titanic to Lab06b
Downloaded digit-recognizer to Lab08
Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
Downloaded lakshmi25npathi/imdb-dataset-of-50k-movie-reviews to Lab09a
Dataset URL: https://www.kaggle.com/datasets/shoumikgoswami/annotated-gmb-corpus
Downloaded shoumikgoswami/annotated-gmb-corpus to Lab09b
Dataset URL: https://www.kaggle.com/datasets/tuannguyenvananh/iwslt15-englishvietnamese
Downloaded tuannguyenvananh/iwslt15-englishvietnamese to Lab10a
Dataset URL: https://www.kaggle.com/datasets/gowrishankarp/newspaper-text-summarization-cnn-dailymail
Downloaded gowrishankarp/newspaper-text-summarization-cnn-dailymail to Lab10b


Saving the dataset (1/1 shards): 100%|██████████| 4358/4358 [00:00<00:00, 395875.88 examples/s]
Saving the dataset (2/2 shards): 100%|██████████| 1801350/1801350 [00:05<00:00, 327821.51 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 3760/3760 [00:00<00:00, 268508.58 examples/s]

Downloaded Salesforce/wikitext to Lab11\wikitext-dataset
All datasets have been downloaded and extracted to their respective directories.





In [5]:
def print_directory_structure(start_path, indent="", exclude=()):
    """Recursively print the directory tree rooted at ``start_path``.

    Entries are listed in sorted order, directories marked with a folder icon
    and everything else with a file icon. Each nesting level adds two spaces
    of indentation.

    Args:
        start_path: Directory whose contents are listed.
        indent: Prefix printed before every entry at this level.
        exclude: Entry names to skip at every level (for example
            ``(".git",)``). Nothing is skipped by default.
    """
    for item in sorted(os.listdir(start_path)):
        if item in exclude:
            continue
        item_path = os.path.join(start_path, item)
        if os.path.isdir(item_path):
            print(f"{indent}📁 {item}")
            # List directory symlinks but do not descend into them: a link
            # pointing back up the tree would otherwise recurse until it errors.
            if not os.path.islink(item_path):
                print_directory_structure(item_path, indent + "  ", exclude)
        else:
            print(f"{indent}📄 {item}")

# Walk the tree rooted at the notebook's current working directory
current_directory = os.getcwd()
tree_header = f"Directory structure of: {current_directory}\n"
print(tree_header)
print_directory_structure(current_directory)

Directory structure of: e:\2_LEARNING_BKU\2_File_2\K22_HK242\CO3085_NLP\NLP_Data

📁 .git
  📄 HEAD
  📄 config
  📄 description
  📁 fsmonitor--daemon
    📁 cookies
  📁 hooks
    📄 applypatch-msg.sample
    📄 commit-msg.sample
    📄 fsmonitor-watchman.sample
    📄 post-update.sample
    📄 pre-applypatch.sample
    📄 pre-commit.sample
    📄 pre-merge-commit.sample
    📄 pre-push.sample
    📄 pre-rebase.sample
    📄 pre-receive.sample
    📄 prepare-commit-msg.sample
    📄 push-to-checkout.sample
    📄 sendemail-validate.sample
    📄 update.sample
  📁 info
    📄 exclude
  📁 objects
    📁 info
    📁 pack
  📁 refs
    📁 heads
    📁 tags
📁 Lab06a
  📄 data_description.txt
  📄 home-data-for-ml-course.zip
  📄 sample_submission.csv
  📄 sample_submission.csv.gz
  📄 test.csv
  📄 test.csv.gz
  📄 train.csv
  📄 train.csv.gz
📁 Lab06b
  📄 gender_submission.csv
  📄 test.csv
  📄 titanic.zip
  📄 train.csv
📁 Lab08
  📄 digit-recognizer.zip
  📄 sample_submission.csv
  📄 test.csv
  📄 train.csv
📁 Lab09a
  📄 IMDB D

In [6]:
def get_directory_size(start_path='.'):
    """Sum the sizes of all regular files under ``start_path``.

    Symbolic links are skipped: a dangling link would make
    ``os.path.getsize`` raise, and a link to a file would count the target's
    bytes a second time.

    Args:
        start_path: Root directory of the walk. Defaults to the current
            directory.

    Returns:
        Total size in bytes of the files found.
    """
    total_size = 0
    for dirpath, _dirnames, filenames in os.walk(start_path):
        for filename in filenames:
            fp = os.path.join(dirpath, filename)
            if os.path.islink(fp):
                continue
            total_size += os.path.getsize(fp)
    return total_size

# Measure everything under the notebook's current working directory
current_directory = os.getcwd()
directory_size = get_directory_size(current_directory)

# Bytes -> MB, with 1 MB taken as 1024 * 1024 bytes
bytes_per_mb = 1024 * 1024
size_in_mb = directory_size / bytes_per_mb
print(f"Total memory storage of '{current_directory}': {size_in_mb:.2f} MB")


Total memory storage of 'e:\2_LEARNING_BKU\2_File_2\K22_HK242\CO3085_NLP\NLP_Data': 2605.12 MB
