In [3]:
import requests
import pandas as pd
from datetime import datetime
from datasets import Dataset
from transformers import pipeline, TrainingArguments, Trainer, AutoModelForSequenceClassification, AutoTokenizer
import torch

In [4]:



# Alpha Vantage API
ALPHA_VANTAGE_API_KEY = 'EX1OUD6X48E8ZGBL'
BASE_URL = 'https://www.alphavantage.co/query'

def fetch_stock_data(symbol, outputsize='compact'):
    params = {
        'function': 'TIME_SERIES_DAILY',
        'symbol': symbol,
        'apikey': ALPHA_VANTAGE_API_KEY,
        'outputsize': outputsize
    }
    response = requests.get(BASE_URL, params=params)
    data = response.json()
    if "Error Message" in data:
        raise ValueError(f"Error fetching data for {symbol}: {data['Error Message']}")
    return data

# Example usage
stock_data = fetch_stock_data('AAPL')
print(f"stock_data: {stock_data}")

# News API
NEWS_API_KEY = '36612e148e63493d81cde21f9ef75f66'
NEWS_URL = 'https://newsapi.org/v2/everything'

def fetch_news(query, from_date, to_date, language='en'):
    params = {
        'q': query,
        'from': from_date,
        'to': to_date,
        'language': language,
        'apiKey': NEWS_API_KEY
    }
    response = requests.get(NEWS_URL, params=params)
    data = response.json()
    if response.status_code != 200:
        raise ValueError(f"Error fetching news: {data.get('message', 'Unknown error')}")
    articles = data.get('articles', [])
    return articles

# Example usage
news_data = fetch_news('stock market', '2024-04-26', '2024-05-25')
print(f"news_data: {news_data}")


stock_data: {'Meta Data': {'1. Information': 'Daily Prices (open, high, low, close) and Volumes', '2. Symbol': 'AAPL', '3. Last Refreshed': '2024-05-31', '4. Output Size': 'Compact', '5. Time Zone': 'US/Eastern'}, 'Time Series (Daily)': {'2024-05-31': {'1. open': '191.4400', '2. high': '192.5700', '3. low': '189.9100', '4. close': '192.2500', '5. volume': '75158277'}, '2024-05-30': {'1. open': '190.7600', '2. high': '192.1800', '3. low': '190.6300', '4. close': '191.2900', '5. volume': '49947941'}, '2024-05-29': {'1. open': '189.6100', '2. high': '192.2470', '3. low': '189.5100', '4. close': '190.2900', '5. volume': '53068016'}, '2024-05-28': {'1. open': '191.5100', '2. high': '193.0000', '3. low': '189.1000', '4. close': '189.9900', '5. volume': '52280051'}, '2024-05-24': {'1. open': '188.8200', '2. high': '190.5800', '3. low': '188.0404', '4. close': '189.9800', '5. volume': '36326975'}, '2024-05-23': {'1. open': '190.9800', '2. high': '191.0000', '3. low': '186.6250', '4. close': '1

ValueError: Error fetching news: You are trying to request results too far in the past. Your plan permits you to request articles as far back as 2024-04-30, but you have requested 2024-04-26. You may need to upgrade to a paid plan.

In [None]:
# Define a function to prepare the dataset for sentiment analysis
def prepare_dataset(articles):
    texts = [article['title'] + " " + article['content'] for article in articles]
    labels = [1 if 'positive' in text else 0 for text in texts]  # Simplified labeling
    return Dataset.from_dict({'text': texts, 'label': labels})


In [None]:
 #Example of preparing the dataset
dataset = prepare_dataset(news_data)
print(f"dataset: {dataset}")


dataset: Dataset({
    features: ['text', 'label'],
    num_rows: 100
})


In [None]:
dataset['text'][0]

"Stock market today: Indexes pop after Nvidia's blowout earnings report US stocks moved higher on Thursday, with investors cheering another blowout earnings report from chip giant Nvidia.\xa0\r\nThe company, whose chips are at the heart of the artificial intelligence boom, be… [+2044 chars]"

In [None]:

# Load a pre-trained tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased')


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Check the tokenized datasets
print(f"tokenized_datasets: {tokenized_datasets}")

# Split the dataset into training and evaluation sets
train_test_split = tokenized_datasets.train_test_split(test_size=0.2)
train_data = train_test_split['train']
eval_data = train_test_split['test']

# Check the train and eval datasets
print(f"train_data: {train_data}")
print(f"eval_data: {eval_data}")

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# Save the model
model.save_pretrained('./model')
tokenizer.save_pretrained('./model')



Map: 100%|██████████| 100/100 [00:00<00:00, 746.02 examples/s]


tokenized_datasets: Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 100
})
train_data: Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 80
})
eval_data: Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 20
})


                                               
 33%|███▎      | 10/30 [03:21<05:24, 16.20s/it]

{'eval_loss': 0.24218781292438507, 'eval_runtime': 14.8736, 'eval_samples_per_second': 1.345, 'eval_steps_per_second': 0.202, 'epoch': 1.0}


                                               
 67%|██████▋   | 20/30 [06:01<02:24, 14.45s/it]

{'eval_loss': 0.07607702165842056, 'eval_runtime': 14.4616, 'eval_samples_per_second': 1.383, 'eval_steps_per_second': 0.207, 'epoch': 2.0}


100%|██████████| 30/30 [08:32<00:00, 14.66s/it]
100%|██████████| 30/30 [08:47<00:00, 17.57s/it]


{'eval_loss': 0.0504133515059948, 'eval_runtime': 14.4605, 'eval_samples_per_second': 1.383, 'eval_steps_per_second': 0.207, 'epoch': 3.0}
{'train_runtime': 526.9629, 'train_samples_per_second': 0.455, 'train_steps_per_second': 0.057, 'train_loss': 0.2532611529032389, 'epoch': 3.0}


ValidationError: Unable to open proto file: model. Please check if it is a valid proto. 

In [None]:
import torch
import onnxruntime
from onnxruntime.quantization import quantize_dynamic, QuantType

# Convert the PyTorch model to ONNX
onnx_model_path = "model.onnx"
dummy_input = torch.tensor(tokenizer.encode("This is a dummy input", add_special_tokens=True)).unsqueeze(0)
torch.onnx.export(
    model,
    dummy_input,
    onnx_model_path,
    input_names=['input_ids'],
    output_names=['output'],
    dynamic_axes={'input_ids': {0: 'batch_size', 1: 'sequence_length'}, 'output': {0: 'batch_size', 1: 'sequence_length'}}
)

# Quantize the ONNX model using ONNX Runtime
quantized_onnx_model_path = "gpt2_quantized.onnx"
quantize_dynamic(onnx_model_path, quantized_onnx_model_path, weight_type=QuantType.QUInt8)
print("Model quantized and saved as ONNX format.")

# Load the quantized ONNX model
ort_session = onnxruntime.InferenceSession(quantized_onnx_model_path)

def generate_blog_post(prompt):
    # Tokenize the input prompt
    input_ids = tokenizer.encode(prompt, add_special_tokens=True, return_tensors="pt")
    
    # Adjust the input tensor dimensions
    max_length = 512
    input_ids = torch.nn.functional.pad(input_ids, (0, max_length - input_ids.shape[1]), value=tokenizer.pad_token_id)[:,:max_length]
    
    # Run inference using the quantized ONNX model
    ort_inputs = {ort_session.get_inputs()[0].name: input_ids.cpu().numpy()}
    ort_outputs = ort_session.run(None, ort_inputs)
    
    # Ensure the output is cast to integers
    ort_outputs_int = ort_outputs[0].astype(int)
    
    # Decode the generated text
    generated_text = tokenizer.decode(ort_outputs_int[0], skip_special_tokens=True)
    
    return generated_text

# Example usage
prompt = "How artificial intelligence is changing the world"
generated_blog_post = generate_blog_post(prompt)
print("Generated Blog Post:")
print(generated_blog_post)


IndexError: index out of range in self

Inferencing



In [None]:
import torch
import onnxruntime
from transformers import GPT2Tokenizer, GPT2Model
from onnxruntime.quantization import quantize_dynamic, QuantType

# Load model and tokenizer
model = GPT2Model.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Verify the tokenizer's vocabulary size
vocab_size = tokenizer.vocab_size

# Function to validate token IDs
def validate_token_ids(input_ids, vocab_size):
    if torch.any(input_ids >= vocab_size):
        raise ValueError(f"Token IDs must be within the range [0, {vocab_size - 1}]. Found out-of-range token ID.")

# Create a dummy input and validate it
dummy_input = torch.tensor(tokenizer.encode("This is a dummy input", add_special_tokens=True)).unsqueeze(0)
validate_token_ids(dummy_input, vocab_size)

# Convert the PyTorch model to ONNX
onnx_model_path = "model.onnx"
torch.onnx.export(
    model,
    dummy_input,
    onnx_model_path,
    input_names=['input_ids'],
    output_names=['output'],
    dynamic_axes={'input_ids': {0: 'batch_size', 1: 'sequence_length'}, 'output': {0: 'batch_size', 1: 'sequence_length'}}
)

# Quantize the ONNX model using ONNX Runtime
quantized_onnx_model_path = "gpt2_quantized.onnx"
quantize_dynamic(onnx_model_path, quantized_onnx_model_path, weight_type=QuantType.QUInt8)
print("Model quantized and saved as ONNX format.")

# Load the quantized ONNX model
ort_session = onnxruntime.InferenceSession(quantized_onnx_model_path)

def generate_blog_post(prompt):
    # Tokenize the input prompt
    input_ids = tokenizer.encode(prompt, add_special_tokens=True, return_tensors="pt")
    
    # Validate token IDs
    validate_token_ids(input_ids, vocab_size)
    
    # Adjust the input tensor dimensions
    max_length = 512
    input_ids = torch.nn.functional.pad(input_ids, (0, max_length - input_ids.shape[1]), value=tokenizer.pad_token_id)[:,:max_length]
    
    # Run inference using the quantized ONNX model
    ort_inputs = {ort_session.get_inputs()[0].name: input_ids.cpu().numpy()}
    ort_outputs = ort_session.run(None, ort_inputs)
    
    # Ensure the output is cast to integers
    ort_outputs_int = ort_outputs[0].astype(int).flatten()
    
    # Decode the generated text
    generated_text = tokenizer.decode(ort_outputs_int, skip_special_tokens=True)
    
    return generated_text

# Example usage
prompt = "How artificial intelligence is changing the world"
generated_blog_post = generate_blog_post(prompt)
print("Generated Blog Post:")
print(generated_blog_post)





Model quantized and saved as ONNX format.


TypeError: sequence item 6: expected str instance, NoneType found

In [None]:
input_ids = tokenizer.encode(prompt, add_special_tokens=True, return_tensors="pt")
input_ids 

tensor([[ 2437, 11666,  4430,   318,  5609,   262,   995]])

In [None]:
def generate_blog_post(prompt):
    # Tokenize the input prompt
    input_ids = tokenizer.encode(prompt, add_special_tokens=True, return_tensors="pt")
    
    # Adjust the input tensor dimensions
    max_length = 512
    input_ids = torch.nn.functional.pad(input_ids, (0, max_length - input_ids.shape[1]), value=tokenizer.pad_token_id)[:,:max_length]
    
    # Run inference using the quantized ONNX model
    ort_inputs = {ort_session.get_inputs()[0].name: input_ids.cpu().numpy()}
    ort_outputs = ort_session.run(None, ort_inputs)
    
    # Ensure the output is cast to integers and flatten the array
    ort_outputs_int = ort_outputs[0].astype(int).flatten()
    
    # Decode the generated text, handling None values
    decoded_tokens = [tokenizer.decode([token_id], skip_special_tokens=True) if token_id is not None else "" for token_id in ort_outputs_int]
    generated_text = "".join(decoded_tokens)
    
    return generated_text

# Example usage
prompt = "How artificial intelligence is changing the world"
generated_blog_post = generate_blog_post(prompt)
print("Generated Blog Post:")
print(generated_blog_post)


TypeError: sequence item 0: expected str instance, NoneType found

In [None]:
# import pandas as pd
# from nltk.tokenize import word_tokenize
# import re

# def preprocess_text(text):
#     text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
#     text = re.sub(r'\W', ' ', text)   # Remove non-word characters
#     text = text.lower()               # Convert to lowercase
#     tokens = word_tokenize(text)      # Tokenize text
#     return ' '.join(tokens)

# # Example usage
# sample_text = "The stock market is booming in 2023!"
# cleaned_text = preprocess_text(sample_text)
# print(cleaned_text)


In [None]:
# from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments

# # Load pre-trained model and tokenizer
# model_name = "gpt2"
# model = GPT2LMHeadModel.from_pretrained(model_name)
# tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# # Tokenize and preprocess data
# def tokenize_function(examples):
#     return tokenizer(examples['text'], padding='max_length', truncation=True)

# # Prepare dataset
# train_data = pd.DataFrame(news_data)  # Assuming news_data is a list of news articles
# train_data
# train_data['content'][0]

In [None]:
# train_data['text'] = train_data['description'].apply(preprocess_text)
# train_data


In [None]:

# # Fine-tune model
# training_args = TrainingArguments(
#     output_dir="./results",
#     evaluation_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=8,
#     num_train_epochs=3,
# )

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_data,
#     tokenizer=tokenizer,
# )

# trainer.train()

# # Quantize model using ONNX Runtime
# from onnxruntime.quantization import quantize_dynamic, QuantType

# onnx_model_path = "gpt2_quantized.onnx"
# model.save_pretrained("./model")
# tokenizer.save_pretrained("./model")
# quantize_dynamic("./model", onnx_model_path, weight_type=QuantType.QUInt8)
