In [3]:
import pandas as pd
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [17]:
import csv

# Open and read the CSV file using the csv module
csv_file_path = "Nasdaq_stocks.csv"
rows = []

with open(csv_file_path, 'r', encoding='utf-8', errors='ignore') as csvfile:
    csvreader = csv.reader(csvfile)
    for row in csvreader:
        rows.append(row)

# Create a DataFrame from the list of rows
stocks = pd.DataFrame(rows[1:], columns=rows[0])

stocks.shape

(51024, 13)

In [20]:
stocks_data = stocks[['Date','Article_title','Stock_symbol']]
# Convert 'date' column to datetime type
stocks_data.loc[:, 'Date'] = pd.to_datetime(stocks_data['Date'], utc=True)

# Filter data for the years 2015 to 2023
start_date = pd.Timestamp('2019-01-01',tz="UTC")
end_date = pd.Timestamp('2023-12-31',tz="UTC")

stocks_data = stocks_data[(stocks_data['Date'] >= start_date) & (stocks_data['Date'] <= end_date)]
stocks_data.loc[:, 'Date'] = stocks_data['Date'].astype('str')
# Extract date component as string 'YYYY-MM-DD' using string manipulation
stocks_data.loc[:, 'Date'] = stocks_data['Date'].str.split().str[0]
stocks_data.shape

(46498, 3)

In [21]:
stocks_data = stocks_data.rename(columns={'Article_title': 'headlines'})
stocks_data["Stock_symbol"].value_counts()

Stock_symbol
GOOG    9883
NVDA    9533
AAPL    9338
INTC    9007
MSFT    8737
Name: count, dtype: int64

In [22]:
# create preprocess_text function
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

# Get a list of English stopwords
#stopwords = set(stopwords.words('english'))
def preprocess_text(text):

    # Tokenize the text
    tokens = word_tokenize(text.lower())
    # Remove stop words
    filtered_tokens = [token for token in tokens if token not in stopwords.words('english')]

    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    # Join the tokens back into a string
    processed_text = ' '.join(lemmatized_tokens)
    return processed_text

# apply the function stocks_data
stocks_data['headlines'] = stocks_data['headlines'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [23]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
import gc

In [24]:

# Load pre-trained DistilBERT model and tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
# Function to encode headlines
def encode_headlines(df):
    inputs = tokenizer(df['headlines'].tolist(), padding=True, truncation=True, return_tensors="pt")
    return inputs

In [26]:
# Function to perform sentiment analysis
def predict_sentiment(encoded_inputs):
    with torch.no_grad():
        outputs = model(**encoded_inputs)
        predictions = torch.argmax(outputs.logits, dim=1)
    return predictions

# Function to map labels to sentiment
def map_to_sentiment(labels):
    sentiment_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
    return [sentiment_map[label.item()] for label in labels]

In [27]:
# Function to encode headlines and perform sentiment analysis on a batch
def process_batch(df):
    encoded_inputs = tokenizer(df['headlines'].tolist(), padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**encoded_inputs)
        predictions = torch.argmax(outputs.logits, dim=1)
    return predictions

# Iterate over the DataFrame in chunks and process each chunk separately
batch_size = 1000  # Adjust according to your available memory
sentiments = []

# Determine the number of chunks
num_chunks = len(stocks_data) // batch_size + 1

for i in range(num_chunks):
    start_idx = i * batch_size
    end_idx = (i + 1) * batch_size
    chunk = stocks_data.iloc[start_idx:end_idx]
    predictions = process_batch(chunk)
    sentiments.extend(predictions)

# Map labels to sentiment
sentiments = map_to_sentiment(sentiments)

# Add the sentiments to your dataframe
stocks_data['Sentiment'] = sentiments

# Print the dataframe with sentiments
file_path = 'stocks_data_with_sentiment.csv'

stocks_data.sample(10)
# Write DataFrame to CSV
stocks_data.to_csv(file_path, index=False)


In [13]:
from concurrent.futures import ThreadPoolExecutor

# Function to process a single batch
def process_batch(df):
    encoded_inputs = tokenizer(df['headlines'].tolist(), padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**encoded_inputs)
        predictions = torch.argmax(outputs.logits, dim=1)
    return predictions

# Iterate over the DataFrame in chunks and process each chunk separately in parallel
batch_size = 1000  # Adjust according to your available memory
sentiments = []

# Determine the number of chunks
num_chunks = len(stocks_data) // batch_size + 1

# Process chunks in parallel
with ThreadPoolExecutor() as executor:
    futures = [executor.submit(process_batch, stocks_data.iloc[i * batch_size: (i + 1) * batch_size]) for i in range(num_chunks)]
    for future in futures:
        predictions = future.result()
        sentiments.append(predictions)

# Concatenate predictions
sentiments = torch.cat(sentiments)

# Map labels to sentiment
sentiments = map_to_sentiment(sentiments)

# Add the sentiments to your dataframe
stocks_data['Sentiment'] = sentiments

# Print the dataframe with sentiments
file_path = 'stocks_data_with_sentiment.csv'

stocks_data.sample(10)
# Write DataFrame to CSV
stocks_data.to_csv(file_path, index=False)


In [30]:
stocks_data["Sentiment"].value_counts()

Sentiment
Neutral     44083
Negative     2415
Name: count, dtype: int64

In [28]:
stocks_data.tail()

Unnamed: 0,Date,headlines,Stock_symbol,Sentiment
50715,2019-01-02,'s expect micron ( mu ) stock 2019,INTC,Neutral
50716,2019-01-02,amd : top tech pick 2018 & p 500 's top perfor...,INTC,Neutral
50717,2019-01-02,microsoft ( msft ) debut surface go india via ...,INTC,Neutral
50718,2019-01-02,stock market future dive ; dow jones chipmaker...,INTC,Neutral
50719,2019-01-01,dow jones future : 'let 's make deal ' trump f...,INTC,Neutral
