In [1]:
import json
import time
from typing import Dict

from datasets import load_dataset # type: ignore
from datasets import Dataset, DatasetDict, load_from_disk # type: ignore
from datasets.formatting.formatting import LazyBatch # type: ignore
from huggingface_hub import list_datasets # type: ignore
import matplotlib.pyplot as plt
from minio import Minio
from minio.datatypes import Object
from minio.helpers import ObjectWriteResult
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import torch
from transformers import AutoModel, AutoTokenizer
from transformers.tokenization_utils_base import BatchEncoding

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



`NYdata_SP` was downloaded from The New York Times using its API.

In [2]:
# Load the previously saved DatasetDict from local disk.
# `load_from_disk` is imported with the other dependencies in the first cell.
NYdata_SP = load_from_disk('NYdatasp_SP_datset')
NYdata_SP

DatasetDict({
    train: Dataset({
        features: ['abstract', 'web_url', 'snippet', 'lead_paragraph', 'print_section', 'print_page', 'source', 'multimedia', 'headline', 'keywords', 'pub_date', 'document_type', 'news_desk', 'section_name', 'byline', 'type_of_material', '_id', 'word_count', 'uri', 'subsection_name', 'Open', 'High', 'Low', 'Close', 'Date'],
        num_rows: 222
    })
    validation: Dataset({
        features: ['abstract', 'web_url', 'snippet', 'lead_paragraph', 'print_section', 'print_page', 'source', 'multimedia', 'headline', 'keywords', 'pub_date', 'document_type', 'news_desk', 'section_name', 'byline', 'type_of_material', '_id', 'word_count', 'uri', 'subsection_name', 'Open', 'High', 'Low', 'Close', 'Date'],
        num_rows: 22
    })
    test: Dataset({
        features: ['abstract', 'web_url', 'snippet', 'lead_paragraph', 'print_section', 'print_page', 'source', 'multimedia', 'headline', 'keywords', 'pub_date', 'document_type', 'news_desk', 'section_name', 'b

Upload the data to a local MinIO server and download it again, so that it is in a format suitable for feature extraction with Hugging Face.

In [5]:
def _load_credentials(credentials_path: str) -> dict:
    '''
    Read the MinIO connection settings from the JSON file at credentials_path.
    The file must provide the keys 'url', 'accessKey' and 'secretKey'.
    '''
    with open(credentials_path) as f:
        return json.load(f)


def _make_client(credentials: dict):
    '''
    Build a MinIO client from an already loaded credentials mapping.
    '''
    return Minio(credentials['url'],  # endpoint, e.g. 127.0.0.1:9090 or host.docker.internal
                 access_key=credentials['accessKey'],
                 secret_key=credentials['secretKey'],
                 secure=False)  # connect without TLS


def get_object(bucket_name: str, object_name: str, file_path: str,
               credentials_path: str = 'credentials_nasimeh.json'):
    '''
    This function will download an object from MinIO to the specified file_path
    and return the object_info.

    bucket_name: bucket that holds the object.
    object_name: name of the object inside the bucket.
    file_path: local path the object is written to.
    credentials_path: JSON file with the connection settings; defaults to the
        file this notebook has always used.

    Returns whatever Minio.fget_object returns (the object's metadata).
    '''
    client = _make_client(_load_credentials(credentials_path))

    # Download the object's data straight into file_path.
    object_info = client.fget_object(bucket_name, object_name, file_path)

    return object_info


def put_file(bucket_name: str, object_name: str, file_path: str,
             credentials_path: str = 'credentials_nasimeh.json'):
    '''
    This function will upload a file to MinIO and return the object_info.

    bucket_name: destination bucket; it is created when it does not exist yet.
    object_name: name the uploaded object gets inside the bucket.
    file_path: local file to upload.
    credentials_path: JSON file with the connection settings; defaults to the
        file this notebook has always used.

    Returns whatever Minio.fput_object returns (the write result).
    '''
    credentials = _load_credentials(credentials_path)
    # Echo the endpoint so the cell output shows which server received the upload.
    print(credentials['url'])
    client = _make_client(credentials)

    # Make sure bucket exists.
    if not client.bucket_exists(bucket_name):
        client.make_bucket(bucket_name)

    # Upload the file.
    object_write_result = client.fput_object(bucket_name, object_name, file_path)

    return object_write_result

Putting data on MinIO.

In [6]:
# Bucket shared by the upload cell and the download cell.
bucket_name = 'nspp-data'

# Serialise every split to a JSON-lines file and push that file to MinIO
# under the same name.
for split, dataset in NYdata_SP.items():
    file_name = f'nspp-{split}.jsonl'
    dataset.to_json(file_name)
    object_write_results = put_file(bucket_name, file_name, file_name)
    print(object_write_results.object_name, object_write_results.location)


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

127.0.0.1:9090
nspp-train.jsonl None


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

127.0.0.1:9090
nspp-validation.jsonl None


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

127.0.0.1:9090
nspp-test.jsonl None


Downloading data from MinIO

In [7]:
# Map each split to the JSON-lines file it is stored in (same name locally and on MinIO).
data_files = {split: f'nspp-{split}.jsonl' for split in ['train', 'validation', 'test']}

# Fetch every file from the bucket before handing the paths to `load_dataset`.
for split, file_name in data_files.items():
    object_info = get_object(bucket_name, file_name, file_name)
    print(object_info.object_name, object_info.last_modified)

nspp = load_dataset('json', data_files=data_files)

nspp-train.jsonl 2024-05-25 04:51:12+00:00
nspp-validation.jsonl 2024-05-25 04:51:12+00:00
nspp-test.jsonl 2024-05-25 04:51:12+00:00


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [8]:
# Show the DatasetDict rebuilt from the JSON-lines files downloaded from MinIO.
nspp

DatasetDict({
    train: Dataset({
        features: ['abstract', 'web_url', 'snippet', 'lead_paragraph', 'print_section', 'print_page', 'source', 'multimedia', 'headline', 'keywords', 'pub_date', 'document_type', 'news_desk', 'section_name', 'byline', 'type_of_material', '_id', 'word_count', 'uri', 'subsection_name', 'Open', 'High', 'Low', 'Close', 'Date'],
        num_rows: 222
    })
    validation: Dataset({
        features: ['abstract', 'web_url', 'snippet', 'lead_paragraph', 'print_section', 'print_page', 'source', 'multimedia', 'headline', 'keywords', 'pub_date', 'document_type', 'news_desk', 'section_name', 'byline', 'type_of_material', '_id', 'word_count', 'uri', 'subsection_name', 'Open', 'High', 'Low', 'Close', 'Date'],
        num_rows: 22
    })
    test: Dataset({
        features: ['abstract', 'web_url', 'snippet', 'lead_paragraph', 'print_section', 'print_page', 'source', 'multimedia', 'headline', 'keywords', 'pub_date', 'document_type', 'news_desk', 'section_name', 'b

Convert the validation split to a pandas DataFrame for inspection.

In [9]:
# Convert only the validation split to a DataFrame for a quick look.
# `to_pandas()` leaves the DatasetDict's own output format untouched, so later
# cells do not depend on a `reset_format()` call having been run first.
df_nspp = nspp['validation'].to_pandas()
df_nspp.head()


Unnamed: 0,abstract,web_url,snippet,lead_paragraph,print_section,print_page,source,multimedia,headline,keywords,...,type_of_material,_id,word_count,uri,subsection_name,Open,High,Low,Close,Date
0,A routine announcement by the Treasury Departm...,https://www.nytimes.com/2023/11/01/business/tr...,A routine announcement by the Treasury Departm...,Investors have fixated this week on a routine ...,B,6.0,The New York Times,"[{'caption': None, 'credit': None, 'crop_name'...","{'content_kicker': None, 'kicker': None, 'main...","[{'major': 'N', 'name': 'subject', 'rank': 1, ...",...,News,nyt://article/d44f479e-8bd8-507c-9d9d-63126cad...,839.0,nyt://article/d44f479e-8bd8-507c-9d9d-63126cad...,,4201.27,4245.64,4197.74,4237.86,1698796800000
1,Two dozen major Wall Street firms sent a lette...,https://www.nytimes.com/2023/11/02/business/de...,Two dozen major Wall Street firms sent a lette...,With universities across the United States gra...,,,The New York Times,"[{'caption': None, 'credit': None, 'crop_name'...","{'content_kicker': None, 'kicker': 'DealBook N...","[{'major': 'N', 'name': 'subject', 'rank': 1, ...",...,News,nyt://article/2e347d58-785f-5186-ba8b-e72793b0...,1862.0,nyt://article/2e347d58-785f-5186-ba8b-e72793b0...,DealBook,4268.26,4319.72,4268.26,4317.78,1698883200000
2,The market is focused on making money now and ...,https://www.nytimes.com/2023/11/03/business/cl...,The market is focused on making money now and ...,"Heat, drought, flood and famine. Evidence of c...",BU,4.0,The New York Times,"[{'caption': None, 'credit': None, 'crop_name'...","{'content_kicker': None, 'kicker': 'Strategies...","[{'major': 'N', 'name': 'subject', 'rank': 1, ...",...,News,nyt://article/31afaa72-fa27-5cdc-bc00-be8556e8...,1505.0,nyt://article/31afaa72-fa27-5cdc-bc00-be8556e8...,,4334.23,4373.62,4334.23,4358.34,1698969600000
3,,,,,,,,,,,...,,,,,,4364.27,4372.21,4347.53,4365.98,1699228800000
4,Voters’ negative perceptions about the economy...,https://www.nytimes.com/2023/11/07/business/de...,Voters’ negative perceptions about the economy...,"With a year to go before Election Day, polls i...",,,The New York Times,"[{'caption': None, 'credit': None, 'crop_name'...","{'content_kicker': None, 'kicker': 'DealBook N...","[{'major': 'N', 'name': 'subject', 'rank': 1, ...",...,News,nyt://article/a3edcf91-4193-5e07-a702-df7e4b79...,1838.0,nyt://article/a3edcf91-4193-5e07-a702-df7e4b79...,DealBook,4366.21,4386.26,4355.41,4378.38,1699315200000


In [10]:
# `torch` is already imported in the first cell.
# Use a CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

Load the distilled version of the BERT language model (DistilBERT).

In [11]:
# Checkpoint name shared by the encoder and its tokenizer.
model_check_point = 'distilbert-base-uncased'

# Load the encoder, then place it on the selected device.
model = AutoModel.from_pretrained(model_check_point)
model = model.to(device)

tokenizer = AutoTokenizer.from_pretrained(model_check_point)

# Report the concrete classes the Auto* factories resolved to, one per line.
print(type(model), type(tokenizer), sep='\n')

<class 'transformers.models.distilbert.modeling_distilbert.DistilBertModel'>
<class 'transformers.models.distilbert.tokenization_distilbert_fast.DistilBertTokenizerFast'>


Tokenize the features (the article abstracts).

In [12]:
# Return `nspp` to its default output format so that `map` hands plain Python
# lists to `tokenize`.
nspp.reset_format()


def tokenize(batch) -> BatchEncoding:
    '''
    Tokenize the 'abstract' column of a batch with the module-level tokenizer.

    batch: mapping with an 'abstract' entry holding one string or a list of
        strings. Entries that are None are tokenized as the empty string,
        because the tokenizer only accepts text; the dataset contains rows
        with prices but no article text.

    Returns the tokenizer output, padded to the longest sequence in the batch
    and truncated to the model's maximum length.
    '''
    texts = batch['abstract']
    if not isinstance(texts, str):
        # A missing abstract would make the tokenizer raise, so replace it with ''.
        texts = ['' if text is None else text for text in texts]
    out = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    return out



In [14]:
# batch_size=None passes each split to `tokenize` as one single batch, so
# padding=True pads every example of a split to the same length.
nspp_encoded = nspp.map(
    tokenize,
    batched=True,
    batch_size=None,
)

Map:   0%|          | 0/222 [00:00<?, ? examples/s]

Map:   0%|          | 0/22 [00:00<?, ? examples/s]

Map:   0%|          | 0/24 [00:00<?, ? examples/s]

In [15]:
# Check which container type `map` returned for the encoded data.
print(type(nspp_encoded))

<class 'datasets.dataset_dict.DatasetDict'>


dict_items([('abstract', ['The cost cutting continues at Twitter. The S&P 500 had a bad 2022, and it’s not clear if 2023 will be better. The Labor Department will report jobs numbers for December.', 'Analysts are more cautious about 2023 after their overly optimistic forecasts for last year.']), ('web_url', ['https://www.nytimes.com/2023/01/01/business/the-week-in-business-southwest-canceled-flights.html', 'https://www.nytimes.com/2023/01/03/business/stocks-markets-first-day-2023.html']), ('snippet', ['The cost cutting continues at Twitter. The S&P 500 had a bad 2022, and it’s not clear if 2023 will be better. The Labor Department will report jobs numbers for December.', 'Analysts are more cautious about 2023 after their overly optimistic forecasts for last year.']), ('lead_paragraph', ['Thousands of travelers were stranded in airports across the country over the holidays as Southwest Airlines canceled more than 2,900 flights on Monday and roughly 5,000 over Tuesday and Wednesday, more

In [16]:
# Inspect the first training example after tokenization.
nspp_encoded['train'][0]

{'abstract': 'The cost cutting continues at Twitter. The S&P 500 had a bad 2022, and it’s not clear if 2023 will be better. The Labor Department will report jobs numbers for December.',
 'web_url': 'https://www.nytimes.com/2023/01/01/business/the-week-in-business-southwest-canceled-flights.html',
 'snippet': 'The cost cutting continues at Twitter. The S&P 500 had a bad 2022, and it’s not clear if 2023 will be better. The Labor Department will report jobs numbers for December.',
 'lead_paragraph': 'Thousands of travelers were stranded in airports across the country over the holidays as Southwest Airlines canceled more than 2,900 flights on Monday and roughly 5,000 over Tuesday and Wednesday, more than 60 percent of its schedule. The disruptions were the result of staffing shortages and longstanding technological problems compounded by a fierce winter storm. Many customers said Southwest had done little or nothing to get them to their destinations. The airline’s chief executive of 10 mon

In [18]:
def extract_hidden_states(batch) -> Dict:
    '''
    Run the encoder on one batch and return the embedding at position 0
    (the [CLS] token) of every sequence.

    batch: mapping of column name to tensor; only the entries whose name is
        listed in tokenizer.model_input_names are passed to the model.

    Returns {'hidden_state': <NumPy array on the CPU>}.
    '''
    wanted = tokenizer.model_input_names

    # Move just the model inputs to the device the model lives on.
    inputs_dict = {}
    for name, tensor in batch.items():
        if name in wanted:
            inputs_dict[name] = tensor.to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs_dict)

    # Keep the first token's vector of each sequence.
    cls_vectors = outputs.last_hidden_state[:, 0]
    return {'hidden_state': cls_vectors.cpu().numpy()}

Select only the columns needed for inference and store them in the `nspp_en_tiny` dataset. This is necessary because torch does not work with datetime values.

In [19]:
# Columns the model consumes; every other column is dropped.
columns_to_keep = ['input_ids', 'attention_mask']

# Build the reduced DatasetDict split by split. `remove_columns` drops the
# unwanted columns directly, so no pass over the rows with `map` is needed.
# `DatasetDict` is imported with the other dependencies in the first cell.
nspp_en_tiny = DatasetDict({
    split: split_dataset.remove_columns(
        [col for col in split_dataset.column_names if col not in columns_to_keep]
    )
    for split, split_dataset in nspp_encoded.items()
})


Map:   0%|          | 0/222 [00:00<?, ? examples/s]

Map:   0%|          | 0/22 [00:00<?, ? examples/s]

Map:   0%|          | 0/24 [00:00<?, ? examples/s]

In [20]:
# Confirm the reduced dataset is still a DatasetDict.
type(nspp_en_tiny)

datasets.dataset_dict.DatasetDict

In [21]:
# Verify that only the model input columns remain in the training split.
nspp_en_tiny['train'].column_names


['input_ids', 'attention_mask']

In [22]:
# Time the whole feature-extraction pass, including the format switch.
start = time.perf_counter()

# Have the dataset return torch tensors for the two model input columns.
nspp_en_tiny.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask'],
)

# Add a 'hidden_state' column to every split.
nspp_tiny_hidden = nspp_en_tiny.map(
    extract_hidden_states,
    batched=True,
)

end = time.perf_counter()
print(f'Inference took {end - start:0.4f} seconds.')

Map:   0%|          | 0/222 [00:00<?, ? examples/s]

Map:   0%|          | 0/22 [00:00<?, ? examples/s]

Map:   0%|          | 0/24 [00:00<?, ? examples/s]

Inference took 12.0586 seconds.


In [23]:
# Check the container type, then list the training columns to confirm that
# 'hidden_state' was added next to the model inputs.
print(type(nspp_tiny_hidden))                   
nspp_tiny_hidden['train'].column_names

<class 'datasets.dataset_dict.DatasetDict'>


['input_ids', 'attention_mask', 'hidden_state']

In [29]:
# Compare the first 'Date' value in the encoded data with the one in the original data.
# NOTE(review): a cell only displays its last expression, so the first lookup
# below is evaluated but never shown; the output is the NYdata_SP value.
nspp_encoded['train']['Date'][0]
NYdata_SP['train']['Date'][0]

Timestamp('2023-01-01 00:00:00')

In [34]:
# Replace the 'Date' column of every encoded split with the one from the
# original dataset (the reloaded copy shows 'Date' as integers, e.g.
# 1698796800000, while NYdata_SP holds timestamps).
# This relies on both datasets having their rows in the same order;
# `add_column` itself only requires the lengths to match.
for split in ('train', 'validation', 'test'):
    nspp_encoded[split] = (
        nspp_encoded[split]
        .remove_columns('Date')
        .add_column('Date', NYdata_SP[split]['Date'])
    )

In [37]:
# Combine 'Date' and 'Close' from the encoded data with the 'hidden_state'
# features, split by split. `Dataset` and `DatasetDict` are imported with the
# other dependencies in the first cell.
merged_dict = {}

for key in nspp_encoded.keys():
    # Take only the columns that end up in the merged dataset.
    meta_df = nspp_encoded[key].to_pandas()[['Date', 'Close']]
    hidden_series = nspp_tiny_hidden[key].to_pandas()['hidden_state']

    # The horizontal concat below pairs rows by position. With different
    # lengths it would silently pad with NaN, so fail loudly instead.
    if len(meta_df) != len(hidden_series):
        raise ValueError(
            f'Row count mismatch in split {key!r}: '
            f'{len(meta_df)} rows vs {len(hidden_series)} hidden states'
        )

    # Concatenate horizontally (axis=1).
    merged_df = pd.concat([meta_df, hidden_series], axis=1)

    # Convert back to Dataset
    merged_dict[key] = Dataset.from_pandas(merged_df)

# Create a new DatasetDict with the merged datasets
merged_dataset_dict = DatasetDict(merged_dict)

# Print the merged DatasetDict to see the result
print(merged_dataset_dict)


DatasetDict({
    train: Dataset({
        features: ['Date', 'Close', 'hidden_state'],
        num_rows: 222
    })
    validation: Dataset({
        features: ['Date', 'Close', 'hidden_state'],
        num_rows: 22
    })
    test: Dataset({
        features: ['Date', 'Close', 'hidden_state'],
        num_rows: 24
    })
})


In [38]:
# List the columns of every split in the merged dataset.
(merged_dataset_dict.column_names)

{'train': ['Date', 'Close', 'hidden_state'],
 'validation': ['Date', 'Close', 'hidden_state'],
 'test': ['Date', 'Close', 'hidden_state']}

In [39]:
# Check how the first 'Date' value of the merged training split is represented.
merged_dataset_dict['train']['Date'][0]

datetime.datetime(2023, 1, 1, 0, 0)

In [40]:

# Print each split's summary followed by the split's name on the next line.
for split, data in merged_dataset_dict.items():
    print(data, split, sep='\n')
    

Dataset({
    features: ['Date', 'Close', 'hidden_state'],
    num_rows: 222
})
train
Dataset({
    features: ['Date', 'Close', 'hidden_state'],
    num_rows: 22
})
validation
Dataset({
    features: ['Date', 'Close', 'hidden_state'],
    num_rows: 24
})
test


In [41]:
# Persist the merged dataset (Date, Close, hidden_state) to a local directory.
merged_dataset_dict.save_to_disk('Merged_Close_feature_sp_date')


Saving the dataset (0/1 shards):   0%|          | 0/222 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/22 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/24 [00:00<?, ? examples/s]

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
END OF PROCESSING DATA. NEXT WE NEED TO FIT THE MODEL.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

In [19]:
# NOTE(review): `emotions_hidden` is not defined in any cell visible in this
# notebook, and this cell's execution count is out of sequence. It looks like a
# leftover from another notebook and would fail on a fresh kernel; confirm.
emotions_hidden['train']['hidden_state']

tensor([[-0.1168,  0.0986, -0.1296,  ...,  0.0587,  0.3543,  0.4042],
        [-0.0324, -0.0323, -0.1957,  ..., -0.1747,  0.3546,  0.3028],
        [ 0.0397,  0.2022,  0.1423,  ..., -0.1141,  0.3394,  0.3958],
        ...,
        [-0.0034, -0.0959,  0.0584,  ..., -0.0427,  0.2496,  0.3076],
        [ 0.0666,  0.1733,  0.1290,  ...,  0.0612,  0.2904,  0.4684],
        [ 0.0167,  0.1013, -0.0073,  ..., -0.0649,  0.3454,  0.2199]])

In [20]:
# Build NumPy feature matrices (hidden states) and target vectors (labels)
# for the train and validation splits.
# NOTE(review): `emotions_hidden` is not defined in any cell visible in this
# notebook, and the merged dataset built above has no 'label' column. This
# cell appears to be a leftover from another notebook; confirm.
X_train = np.array(emotions_hidden['train']['hidden_state'])
X_valid = np.array(emotions_hidden['validation']['hidden_state'])
y_train = np.array(emotions_hidden['train']['label'])
y_valid = np.array(emotions_hidden['validation']['label'])
print(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape)

(16000, 768) (2000, 768) (16000,) (2000,)


In [21]:
# Fit a logistic-regression classifier on the hidden states and report its
# score on the validation split.
# max_iter=3000 raises the solver's iteration limit.
# NOTE(review): X_train / y_train come from `emotions_hidden`, which is not
# defined in any cell visible in this notebook; confirm.
logistic_regression = LogisticRegression(max_iter=3000)
logistic_regression.fit(X_train, y_train)
logistic_regression.score(X_valid, y_valid)

0.6335