In [128]:
from collections import Counter
import os

import datasets
import pandas as pd
from datasets import load_dataset, load_from_disk, Dataset, DatasetDict
from nltk import sent_tokenize

'Sugar prices extended losses on the local market on Wednesday, dealers said. Consumer demand and enquiries from retailers remained low while fresh offerings by stockists dragged prices downward. "Arrivals remained at a low ebb as most delivery tenders out of the free sale quota for the current month are already exhausted and in turn sugar mill delivery also remained unquoted today," one broker said. In the ready delivery sugar S-30 fell by 5/15 rupees to 1,225/1,260 rupees per quintal while M-30 fell by 10 rupees to 1,265/1,350 rupees per quintal. The free sale quota for the current month is 650,000 tonnes while a higher figure, 700,000 tonnes, has been fixed for February. "Higher quota for next month also subdued market sentiment as the end of the month is nearby," the dealer said. -- Bombay Commodities +91-22-265 9000'

In [131]:
languages = ["de", "en", "es", "fr", "it", "ja", "ru", "zh"]
sections = ("dev", "test", "train.10000")
min_sentences = 5
data_dir = "/work/ogalolu/datasets/"
dataset_type = "mldoc"

In [132]:
# Creating dictionary of datasets after preprocessing

def dict_creator(language: str, sections: tuple) -> dict:
    dataset_dict = {}
    for section in sections:
        df = pd.read_csv(f"/work/ogalolu/data/mldoc/{language}/{language}.{section}", sep="\t", header =None)
        df = df.rename(columns={1: "text", 0: "labels"})
        df["labels"] = df["labels"].map({"CCAT": 0, "ECAT": 1, "GCAT": 2, "MCAT": 3})
        dataset = Dataset.from_pandas(df)
        dataset_dict[section] = dataset
        
    return dataset_dict

In [133]:
# Creating datasets.DatasetDict 

def dataset_creator(language: str, dataset_dict: dict) -> datasets.DatasetDict:
    dataset = DatasetDict({
    "train": dataset_dict["train.10000"],
    "validation": dataset_dict["dev"],
    "test": dataset_dict["test"],
    })
    return dataset
        

In [134]:
# Filtering out the shorter sentences

def filter_short(example, min_sentences: int) -> bool:
    if len(sent_tokenize(example["text"])) <= min_sentences:
            return False
    return True
    

In [135]:
# Checking the distrubiton after filtering the datasets

def dist_checker(dataset_dict: datasets.DatasetDict ):
    for section in ("train", "validation", "test"):
        ds = dataset_dict[section]
        ls = ds["labels"]
        cnt = Counter(ls)
        print(cnt)      

In [136]:
for language in languages:
    dataset_dict = dict_creator(language, sections)
    dataset = dataset_creator(language, dataset_dict)
    print(dataset)
    dataset.save_to_disk(os.path.join(data_dir, dataset_type, language))
    filtered_dataset = dataset.filter(filter_short, fn_kwargs={"min_sentences": min_sentences})
    print(filtered_dataset)
    dist_checker(filtered_dataset)
    filtered_dataset.save_to_disk(os.path.join(data_dir, dataset_type, "filtered",language))

DatasetDict({
    train: Dataset({
        features: ['labels', 'text'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['labels', 'text'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['labels', 'text'],
        num_rows: 4000
    })
})


  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'text'],
        num_rows: 8851
    })
    validation: Dataset({
        features: ['labels', 'text'],
        num_rows: 875
    })
    test: Dataset({
        features: ['labels', 'text'],
        num_rows: 3530
    })
})
Counter({2: 2317, 0: 2263, 3: 2227, 1: 2044})
Counter({2: 254, 3: 214, 1: 207, 0: 200})
Counter({2: 969, 0: 876, 3: 861, 1: 824})
DatasetDict({
    train: Dataset({
        features: ['labels', 'text'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['labels', 'text'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['labels', 'text'],
        num_rows: 4000
    })
})


  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'text'],
        num_rows: 7304
    })
    validation: Dataset({
        features: ['labels', 'text'],
        num_rows: 703
    })
    test: Dataset({
        features: ['labels', 'text'],
        num_rows: 2920
    })
})
Counter({2: 2172, 3: 1854, 1: 1654, 0: 1624})
Counter({2: 219, 3: 198, 0: 146, 1: 140})
Counter({2: 887, 3: 746, 1: 672, 0: 615})
DatasetDict({
    train: Dataset({
        features: ['labels', 'text'],
        num_rows: 9458
    })
    validation: Dataset({
        features: ['labels', 'text'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['labels', 'text'],
        num_rows: 4000
    })
})


  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'text'],
        num_rows: 3438
    })
    validation: Dataset({
        features: ['labels', 'text'],
        num_rows: 486
    })
    test: Dataset({
        features: ['labels', 'text'],
        num_rows: 1962
    })
})
Counter({3: 2733, 0: 355, 2: 205, 1: 145})
Counter({0: 178, 2: 154, 3: 79, 1: 75})
Counter({0: 693, 2: 559, 3: 423, 1: 287})
DatasetDict({
    train: Dataset({
        features: ['labels', 'text'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['labels', 'text'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['labels', 'text'],
        num_rows: 4000
    })
})


  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'text'],
        num_rows: 7490
    })
    validation: Dataset({
        features: ['labels', 'text'],
        num_rows: 745
    })
    test: Dataset({
        features: ['labels', 'text'],
        num_rows: 2985
    })
})
Counter({2: 2093, 1: 1865, 3: 1851, 0: 1681})
Counter({3: 194, 2: 192, 1: 185, 0: 174})
Counter({2: 807, 3: 760, 0: 716, 1: 702})
DatasetDict({
    train: Dataset({
        features: ['labels', 'text'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['labels', 'text'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['labels', 'text'],
        num_rows: 4000
    })
})


  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'text'],
        num_rows: 4550
    })
    validation: Dataset({
        features: ['labels', 'text'],
        num_rows: 438
    })
    test: Dataset({
        features: ['labels', 'text'],
        num_rows: 1822
    })
})
Counter({0: 1649, 3: 1512, 1: 1094, 2: 295})
Counter({3: 121, 0: 114, 1: 108, 2: 95})
Counter({1: 498, 0: 491, 3: 435, 2: 398})
DatasetDict({
    train: Dataset({
        features: ['labels', 'text'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['labels', 'text'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['labels', 'text'],
        num_rows: 4000
    })
})


  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'text'],
        num_rows: 8
    })
    validation: Dataset({
        features: ['labels', 'text'],
        num_rows: 0
    })
    test: Dataset({
        features: ['labels', 'text'],
        num_rows: 4
    })
})
Counter({0: 8})
Counter()
Counter({0: 4})
DatasetDict({
    train: Dataset({
        features: ['labels', 'text'],
        num_rows: 5216
    })
    validation: Dataset({
        features: ['labels', 'text'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['labels', 'text'],
        num_rows: 4000
    })
})


  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'text'],
        num_rows: 3590
    })
    validation: Dataset({
        features: ['labels', 'text'],
        num_rows: 739
    })
    test: Dataset({
        features: ['labels', 'text'],
        num_rows: 2932
    })
})
Counter({0: 1256, 1: 1102, 3: 853, 2: 379})
Counter({0: 228, 1: 216, 2: 167, 3: 128})
Counter({1: 922, 0: 920, 2: 590, 3: 500})
DatasetDict({
    train: Dataset({
        features: ['labels', 'text'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['labels', 'text'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['labels', 'text'],
        num_rows: 4000
    })
})


  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'text'],
        num_rows: 3131
    })
    validation: Dataset({
        features: ['labels', 'text'],
        num_rows: 283
    })
    test: Dataset({
        features: ['labels', 'text'],
        num_rows: 1195
    })
})
Counter({3: 1620, 0: 776, 1: 715, 2: 20})
Counter({0: 111, 3: 104, 1: 61, 2: 7})
Counter({3: 521, 0: 358, 1: 289, 2: 27})
