In [2]:
import json
import time
from typing import Dict

from datasets import load_dataset # type: ignore
from datasets.formatting.formatting import LazyBatch # type: ignore
from huggingface_hub import list_datasets # type: ignore
import matplotlib.pyplot as plt
from minio import Minio
from minio.datatypes import Object
from minio.helpers import ObjectWriteResult
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import torch
from transformers import AutoModel, AutoTokenizer
from transformers.tokenization_utils_base import BatchEncoding

In [37]:
from datasets import load_from_disk
NYdata_SP = load_from_disk('NYdata_SP_datset')
NYdata_SP

DatasetDict({
    train: Dataset({
        features: ['abstract', 'web_url', 'snippet', 'lead_paragraph', 'print_section', 'print_page', 'source', 'multimedia', 'headline', 'keywords', 'pub_date', 'document_type', 'news_desk', 'section_name', 'byline', 'type_of_material', '_id', 'word_count', 'uri', 'subsection_name', 'Open', 'High', 'Low', 'Close', 'Date'],
        num_rows: 392
    })
    validation: Dataset({
        features: ['abstract', 'web_url', 'snippet', 'lead_paragraph', 'print_section', 'print_page', 'source', 'multimedia', 'headline', 'keywords', 'pub_date', 'document_type', 'news_desk', 'section_name', 'byline', 'type_of_material', '_id', 'word_count', 'uri', 'subsection_name', 'Open', 'High', 'Low', 'Close', 'Date'],
        num_rows: 25
    })
    test: Dataset({
        features: ['abstract', 'web_url', 'snippet', 'lead_paragraph', 'print_section', 'print_page', 'source', 'multimedia', 'headline', 'keywords', 'pub_date', 'document_type', 'news_desk', 'section_name', 'b

In [3]:
# emotions = load_dataset('emotion')
# type(emotions)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


datasets.dataset_dict.DatasetDict

In [23]:
def get_object(bucket_name: str, object_name: str, file_path: str):
    '''
    This function will download an object from MinIO to the specified file_path
    and return the object_info.
    '''

    # Load the credentials and connection information.
    with open('credentials.json') as f:
        credentials = json.load(f)

    # Create client with access and secret key
    client = Minio(credentials['url'],  # host.docker.internal
                credentials['accessKey'],  
                credentials['secretKey'], 
                secure=False)
    
    # Get data of an object.
    object_info = client.fget_object(bucket_name, object_name, file_path)

    return object_info


def put_file(bucket_name: str, object_name: str, file_path: str):
    '''
    This function will upload a file to MinIO and return the object_info.
    '''

    # Load the credentials and connection information.
    with open('credentials.json') as f:
        credentials = json.load(f)
    print(credentials['url'])
    # Create client with access and secret key
    client = Minio(credentials['url'],  # host.docker.internal
                credentials['accessKey'],  
                credentials['secretKey'], 
                secure=False)
    
    # Make sure bucket exists.
    found = client.bucket_exists(bucket_name)
    if not found:
        client.make_bucket(bucket_name)

    # Upload the file.
    object_write_result = client.fput_object(bucket_name, object_name, file_path)

    return object_write_result

In [30]:
NYdata_SP

DatasetDict({
    train: Dataset({
        features: ['_data_files', '_fingerprint', '_format_columns', '_format_kwargs', '_format_type', '_output_all_columns', '_split'],
        num_rows: 1
    })
    validation: Dataset({
        features: ['_data_files', '_fingerprint', '_format_columns', '_format_kwargs', '_format_type', '_output_all_columns', '_split'],
        num_rows: 1
    })
    test: Dataset({
        features: ['_data_files', '_fingerprint', '_format_columns', '_format_kwargs', '_format_type', '_output_all_columns', '_split'],
        num_rows: 1
    })
})

In [38]:
bucket_name = 'nspp-data'
for split, dataset in NYdata_SP.items():
    dataset.to_json(f'nspp-{split}.jsonl')
    object_write_results = put_file(bucket_name, f'nspp-{split}.jsonl', f'nspp-{split}.jsonl')
    print(object_write_results.object_name, object_write_results.location)

# bucket_name = 'emotions-data'
# for split, dataset in emotions.items():
#     dataset.to_json(f'emotions-{split}.jsonl')
#     object_write_results = put_file(bucket_name, f'emotions-{split}.jsonl', f'emotions-{split}.jsonl')
#     print(object_write_results.object_name, object_write_results.location)

Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  2.44ba/s]


127.0.0.1:9090
nspp-train.jsonl http://127.0.0.1:9090/nspp-data/nspp-train.jsonl


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 50.01ba/s]


127.0.0.1:9090
nspp-validation.jsonl None


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 33.36ba/s]

127.0.0.1:9090
nspp-test.jsonl None





In [39]:
data_files = {}
for split in ['train', 'validation', 'test']:
    data_files[split] = f'nspp-{split}.jsonl'
    object_info = get_object(bucket_name, f'nspp-{split}.jsonl', f'nspp-{split}.jsonl')
    print(object_info.object_name, object_info.last_modified)

nspp = load_dataset('json', data_files=data_files)

nspp-train.jsonl 2024-05-24 03:58:46+00:00
nspp-validation.jsonl 2024-05-24 03:58:46+00:00
nspp-test.jsonl 2024-05-24 03:58:46+00:00


Generating train split: 392 examples [00:00, 3835.77 examples/s]
Generating validation split: 25 examples [00:00, 1330.29 examples/s]
Generating test split: 33 examples [00:00, 1963.93 examples/s]


In [40]:
nspp

DatasetDict({
    train: Dataset({
        features: ['abstract', 'web_url', 'snippet', 'lead_paragraph', 'print_section', 'print_page', 'source', 'multimedia', 'headline', 'keywords', 'pub_date', 'document_type', 'news_desk', 'section_name', 'byline', 'type_of_material', '_id', 'word_count', 'uri', 'subsection_name', 'Open', 'High', 'Low', 'Close', 'Date'],
        num_rows: 392
    })
    validation: Dataset({
        features: ['abstract', 'web_url', 'snippet', 'lead_paragraph', 'print_section', 'print_page', 'source', 'multimedia', 'headline', 'keywords', 'pub_date', 'document_type', 'news_desk', 'section_name', 'byline', 'type_of_material', '_id', 'word_count', 'uri', 'subsection_name', 'Open', 'High', 'Low', 'Close', 'Date'],
        num_rows: 25
    })
    test: Dataset({
        features: ['abstract', 'web_url', 'snippet', 'lead_paragraph', 'print_section', 'print_page', 'source', 'multimedia', 'headline', 'keywords', 'pub_date', 'document_type', 'news_desk', 'section_name', 'b

In [41]:
nspp.set_format(type='pandas')
df_nspp = nspp['validation'][:]
df_nspp.head()


Unnamed: 0,abstract,web_url,snippet,lead_paragraph,print_section,print_page,source,multimedia,headline,keywords,...,type_of_material,_id,word_count,uri,subsection_name,Open,High,Low,Close,Date
0,Wondering what to get for all the beloved but ...,https://www.nytimes.com/interactive/2023/us/20...,Wondering what to get for all the beloved but ...,Wondering what to get for all the beloved but ...,,,The New York Times,"[{'caption': None, 'credit': None, 'crop_name'...","{'content_kicker': None, 'kicker': None, 'main...","[{'major': 'N', 'name': 'subject', 'rank': 1, ...",...,Interactive Feature,nyt://interactive/33bb42c4-ad82-5177-8813-6f38...,0.0,nyt://interactive/33bb42c4-ad82-5177-8813-6f38...,,4201.27,4245.64,4197.74,4237.86,1698796800000
1,The deal would generate more than $3 billion i...,https://www.nytimes.com/2023/11/02/business/si...,The deal would generate more than $3 billion i...,"Six Flags, an amusement park corporation, is m...",,,The New York Times,"[{'caption': None, 'credit': None, 'crop_name'...","{'content_kicker': None, 'kicker': None, 'main...","[{'major': 'N', 'name': 'subject', 'rank': 1, ...",...,News,nyt://article/aa755668-fcf8-5486-9463-29f93e4b...,194.0,nyt://article/aa755668-fcf8-5486-9463-29f93e4b...,,4268.26,4319.72,4268.26,4317.78,1698883200000
2,The market is focused on making money now and ...,https://www.nytimes.com/2023/11/03/business/cl...,The market is focused on making money now and ...,"Heat, drought, flood and famine. Evidence of c...",BU,4.0,The New York Times,"[{'caption': None, 'credit': None, 'crop_name'...","{'content_kicker': None, 'kicker': 'Strategies...","[{'major': 'N', 'name': 'subject', 'rank': 1, ...",...,News,nyt://article/31afaa72-fa27-5cdc-bc00-be8556e8...,1505.0,nyt://article/31afaa72-fa27-5cdc-bc00-be8556e8...,,4334.23,4373.62,4334.23,4358.34,1698969600000
3,,,,,,,,,,,...,,,,,,4364.27,4372.21,4347.53,4365.98,1699228800000
4,,https://cooking.nytimes.com/recipes/1024807-ve...,,Vegetable stock doesn’t need the whole vegetab...,,,,"[{'caption': None, 'credit': None, 'crop_name'...","{'content_kicker': None, 'kicker': None, 'main...",[],...,Recipe,nyt://recipe/dbd499ef-98f2-55d7-8f40-47a05e9e7a5c,0.0,nyt://recipe/dbd499ef-98f2-55d7-8f40-47a05e9e7a5c,,4366.21,4386.26,4355.41,4378.38,1699315200000


'i didnt feel humiliated'

In [42]:
import torch # type: ignore
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [10]:
model_check_point = 'distilbert-base-uncased'
model = AutoModel.from_pretrained(model_check_point).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_check_point)
print(type(model))
print(type(tokenizer))

<class 'transformers.models.distilbert.modeling_distilbert.DistilBertModel'>
<class 'transformers.models.distilbert.tokenization_distilbert_fast.DistilBertTokenizerFast'>


In [44]:
nspp.reset_format()
def tokenize(batch) -> BatchEncoding:
    out = tokenizer(batch['abstract'], padding=True, truncation=True, return_tensors='pt')
    return out



In [55]:
type(nspp['train']['abstract'][1])

NoneType

In [45]:
nspp_encoded = nspp.map(tokenize, batched=True, batch_size=None)

Map:   0%|          | 0/392 [00:00<?, ? examples/s]


TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]

In [13]:
print(type(emotions_encoded))
emotions_encoded['train'][:2].items()

<class 'datasets.dataset_dict.DatasetDict'>


dict_items([('text', ['i didnt feel humiliated', 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake']), ('label', [0, 0]), ('input_ids', [[101, 1045, 2134, 2102, 2514, 26608, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1045, 2064, 2175, 2013, 3110, 2061, 20625, 2000, 2061, 9636, 17772, 2074, 2013, 2108, 2105, 2619, 2040, 14977, 1998, 2003, 8300, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), ('attention_mask', [[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [14]:
emotions_encoded['train'][0]

{'text': 'i didnt feel humiliated',
 'label': 0,
 'input_ids': [101,
  1045,
  2134,
  2102,
  2514,
  26608,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0]}

In [15]:
def extract_hidden_states(batch) -> Dict:
    # Move the model inputs to the appropriate device.
    #print(batch.items())
    #bra
    inputs_dict = {k:v.to(device) for k,v in batch.items() if k in tokenizer.model_input_names}
    # Call the model and extract the hidden states
    with torch.no_grad():
        last_hidden_state = model(**inputs_dict).last_hidden_state
    # Return the vector for the [CLS] token.
    return {'hidden_state': last_hidden_state[:,0].cpu().numpy()}

In [16]:
start = time.perf_counter()
emotions_encoded.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
emotions_hidden = emotions_encoded.map(extract_hidden_states, batched=True)
end = time.perf_counter()
print(f'Inference took {end - start:0.4f} seconds.')

Map: 100%|██████████| 16000/16000 [14:09<00:00, 18.83 examples/s]
Map: 100%|██████████| 2000/2000 [01:02<00:00, 31.96 examples/s]
Map: 100%|██████████| 2000/2000 [01:00<00:00, 33.15 examples/s]

Inference took 975.2539 seconds.





In [17]:
print(type(emotions_hidden))
emotions_hidden['train'].column_names

<class 'datasets.dataset_dict.DatasetDict'>


['text', 'label', 'input_ids', 'attention_mask', 'hidden_state']

In [18]:
print(type(emotions_encoded))
print(type(emotions_encoded['train']))
print(type(emotions_encoded['train'][:]))
for k,v in emotions_encoded.items():
    print(k, v)

<class 'datasets.dataset_dict.DatasetDict'>
<class 'datasets.arrow_dataset.Dataset'>
<class 'dict'>
train Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 16000
})
validation Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 2000
})
test Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 2000
})


In [19]:
emotions_hidden['train']['hidden_state']

tensor([[-0.1168,  0.0986, -0.1296,  ...,  0.0587,  0.3543,  0.4042],
        [-0.0324, -0.0323, -0.1957,  ..., -0.1747,  0.3546,  0.3028],
        [ 0.0397,  0.2022,  0.1423,  ..., -0.1141,  0.3394,  0.3958],
        ...,
        [-0.0034, -0.0959,  0.0584,  ..., -0.0427,  0.2496,  0.3076],
        [ 0.0666,  0.1733,  0.1290,  ...,  0.0612,  0.2904,  0.4684],
        [ 0.0167,  0.1013, -0.0073,  ..., -0.0649,  0.3454,  0.2199]])

In [20]:
X_train = np.array(emotions_hidden['train']['hidden_state'])
X_valid = np.array(emotions_hidden['validation']['hidden_state'])
y_train = np.array(emotions_hidden['train']['label'])
y_valid = np.array(emotions_hidden['validation']['label'])
print(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape)

(16000, 768) (2000, 768) (16000,) (2000,)


In [21]:
logistic_regression = LogisticRegression(max_iter=3000)
logistic_regression.fit(X_train, y_train)
logistic_regression.score(X_valid, y_valid)

0.6335