In [28]:
import numpy as np
import pandas as pd
import kagglehub
import torch
import nltk
from utils.text_datasets import get_basic_tweet_sentiment_dataset, get_poem_sentiment_dataset

# Utils

In [32]:
def get_text_statistics(dataset, remove_stopwords=False, top_n_words=10):
    num_samples = len(dataset)
    sample_lengths = [len(sample) for sample, _ in dataset]
    avg_length = np.mean(sample_lengths)

    words_in_set = [word for sample, _ in dataset for word in sample.split()]

    if remove_stopwords:
        stopwords = nltk.corpus.stopwords.words('english')
        words_in_set = [word for word in words_in_set if word not in stopwords]

    word_counts = pd.Series(words_in_set).value_counts()

    return {
        'num_samples': num_samples,
        'avg_length': avg_length,
        'num_words': len(words_in_set),
        'avg_words': len(words_in_set) / num_samples,
        'num_unique_words': len(word_counts),
        'unique_words': word_counts[:top_n_words],
    }

# Poem Sentiment dataset

In [25]:
whole_poem_dataset = torch.utils.data.ConcatDataset(get_poem_sentiment_dataset(None))

Using the latest cached version of the dataset since google-research-datasets/poem_sentiment couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/mikic202/.cache/huggingface/datasets/google-research-datasets___poem_sentiment/default/0.0.0/685b95a2787a869b7bae6c4480810f57fe23b48e (last modified on Tue Apr 22 21:39:59 2025).


In [33]:
get_text_statistics(whole_poem_dataset, remove_stopwords=True)

{'num_samples': 1101,
 'avg_length': 38.328792007266124,
 'num_words': 4570,
 'avg_words': 4.150772025431426,
 'num_unique_words': 3190,
 'unique_words': thy       30
 like      23
 shall     19
 would     18
 thou      17
 yet       16
 see       16
 upon      13
 though    13
 three     12
 Name: count, dtype: int64}

# Basic Twitter Dataset

In [29]:
whole_basic_tweet_dataset = torch.utils.data.ConcatDataset(get_basic_tweet_sentiment_dataset(None))

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [34]:
get_text_statistics(whole_basic_tweet_dataset, remove_stopwords=True)

{'num_samples': 1600498,
 'avg_length': 74.09207821565538,
 'num_words': 14047412,
 'avg_words': 8.776900689660343,
 'num_unique_words': 1350958,
 'unique_words': I        496739
 I'm       99579
 get       76748
 like      73315
 -         67121
 go        62987
 good      59797
 day       55756
 got       53890
 going     53248
 Name: count, dtype: int64}