In [90]:
#!pip install textblob
!pip install torch

Collecting torch
  Using cached torch-2.5.1-cp311-cp311-manylinux1_x86_64.whl.metadata (28 kB)
Collecting filelock (from torch)
  Using cached filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Using cached nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Using cached nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (

In [3]:
import glob
import math
import csv
import torch
import numpy as np
import json
from textblob import TextBlob
import shutil
import os
from datetime import datetime, timedelta
from scipy import stats
import matplotlib.pyplot as plt

# Adding original files

In [10]:
# Copy every raw price CSV into the working input directory so later cells can
# rewrite the copies without touching the original dataset.
source_dir = '../Dataset/price/raw/'
destination_dir = 'preprocessed-data/input-data/'

# shutil.copy2 does not create missing parent directories; make sure the target
# exists so the cell also works on a fresh checkout.
os.makedirs(destination_dir, exist_ok=True)

for filename in os.listdir(source_dir):
    if filename.endswith('.csv'):
        source_file = os.path.join(source_dir, filename)
        destination_file = os.path.join(destination_dir, filename)

        # copy2 preserves file metadata (timestamps) in addition to the content.
        shutil.copy2(source_file, destination_file)

print("All files copied successfully!")

All files copied successfully!


# Adding twitter sentiment and target value
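For each stock and each day, the tweet messages are analyzed and the day is labelled 1 (positive), 0 (neutral) or -1 (negative), depending on which class carries the largest engagement weight (favorites + retweets + 1 per tweet). A cumulative sum of the daily labels is added as well. Note: an earlier description mentioned a penalty of 0.5 for neutral messages, but the code below applies no such penalty; neutral days simply contribute 0 to the cumulative value.
Part of the code is adapted from this notebook: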

https://github.com/mrizwan18/Stock-Prediction-usingTwitter-sentimental-analysis-and-CNN/blob/master/Stock-Predictor.ipynb

In [1]:
def date_range(start_date, end_date):
    """Return every calendar day between two dates, both ends included.

    Parameters
    ----------
    start_date, end_date : str
        Dates formatted as ``YYYY-MM-DD``.

    Returns
    -------
    list of str
        Consecutive days formatted as ``YYYY-MM-DD``; empty when ``end_date``
        lies before ``start_date``.

    Raises
    ------
    ValueError
        If either argument does not match the ``YYYY-MM-DD`` format.
    """
    fmt = "%Y-%m-%d"
    first_day = datetime.strptime(start_date, fmt)
    last_day = datetime.strptime(end_date, fmt)

    # Both values are midnight timestamps, so the difference is a whole number of days.
    span_in_days = (last_day - first_day).days
    return [
        (first_day + timedelta(days=offset)).strftime(fmt)
        for offset in range(span_in_days + 1)
    ]

def majority_sentiment(weights):
    """Turn per-class weight totals into a single daily label.

    Parameters
    ----------
    weights : sequence of numbers
        Totals ordered as ``[positive, negative, neutral]``.

    Returns
    -------
    int
        ``1`` (positive), ``-1`` (negative) or ``0`` (neutral) for the class with
        the largest total. Ties go to the earlier class in the order above. When
        all totals are zero (no tweets at all) the day is neutral.
    """
    # np.argmax of all zeros is index 0, which would label a day without any
    # tweets as positive; treat that case as neutral instead.
    if not any(weights):
        return 0
    return (1, -1, 0)[int(np.argmax(weights))]


def score_tweet_file(path):
    """Sum engagement weights per sentiment class for one file of tweets.

    The file is read as one JSON object per line. Each tweet is weighted by
    ``favorite_count + retweet_count + 1`` and assigned to a class by the sign of
    its TextBlob polarity.

    Returns
    -------
    list of int
        ``[positive, negative, neutral]`` weight totals.
    """
    totals = [0, 0, 0]
    # JSON text is UTF-8; do not depend on the platform default encoding.
    with open(path, "r", encoding="utf-8") as infile:
        for line in infile:
            if not line.strip():
                continue  # tolerate blank lines instead of failing in json.loads
            record = json.loads(line)
            weight = record["favorite_count"] + record["retweet_count"] + 1
            polarity = TextBlob(record["text"]).sentiment[0]
            if polarity > 0:
                totals[0] += weight
            elif polarity < 0:
                totals[1] += weight
            else:
                totals[2] += weight
    return totals


# Step 1: one (date, label) pair per tweet file, grouped by stock folder.
# The date is taken from the file name up to the first ".".
daily_labels = {}
for stock_dir in glob.glob("../Dataset/tweet/raw/*"):
    stock_name = os.path.basename(stock_dir)
    daily_labels[stock_name] = [
        (os.path.basename(file).split(".")[0], majority_sentiment(score_tweet_file(file)))
        for file in glob.glob(f"{stock_dir}/*")
    ]

# Step 2: fill calendar gaps with neutral days, accumulate, and persist.
# stock_sentiments maps stock -> {date: {'Sentiment': s, 'Cum_Sentiment': y}}.
stock_sentiments = {}
for stock_name, sentiments in daily_labels.items():
    if not sentiments:
        # Folder without tweet files: keep the stock known, but with no dates.
        stock_sentiments[stock_name] = {}
        continue

    sentiments.sort(key=lambda x: x[0])

    first_date = sentiments[0][0]
    last_date = sentiments[-1][0]
    all_dates = date_range(first_date, last_date)

    sentiment_dict = dict(sentiments)
    completed_sentiments = []

    # Cum_Sentiment is a plain running sum of the daily labels; days without a
    # tweet file count as neutral (0).
    y = 0
    for date in all_dates:
        s = sentiment_dict.get(date, 0)
        y += s
        completed_sentiments.append((date, s, y))

    stock_sentiments[stock_name] = {date: {'Sentiment': s, 'Cum_Sentiment': y} for date, s, y in completed_sentiments}

    output_file = f"preprocessed-data/sentiment/{stock_name}.txt"
    # Create the output folder on demand so the cell works on a fresh checkout.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, "w") as outfile:
        for date, s, y in completed_sentiments:
            outfile.write(f"{date} {s} {y}\n")

print("Sentiment values added")

NameError: name 'input_stocks' is not defined

In [67]:
# Columns appended to every price row, in the order build_feature_rows emits them.
EXTRA_COLUMNS = [
    "Sentiment",
    "Cum_sentiment",
    "Seven_days_count",
    "Thirty_days_count",
    "One_year_count",
    "7_day_SMA",
    "30_day_SMA",
    "365_day_SMA",
    "Daily_Return",
    "Target",
]

# Look-back lengths, counted in CSV rows (not calendar days).
WINDOWS = (7, 30, 365)


def build_feature_rows(rows, date_index, open_index, sentiment_data):
    """Yield each price row extended with sentiment and price-derived features.

    Parameters
    ----------
    rows : iterable of list
        Data rows of a price CSV (header already consumed).
    date_index, open_index : int
        Positions of the date and open-price fields within a row.
    sentiment_data : dict
        ``{date: {'Sentiment': int, 'Cum_Sentiment': int}}`` for one stock.

    Yields
    ------
    list
        The original row followed by the values for ``EXTRA_COLUMNS``:
        sentiment, cumulative sentiment, number of up-moves within each window,
        simple moving average of the open price for each window (``None`` until
        the window is full), the open-to-open return and the up/down target
        (both ``None`` on the first usable row).

    Rows whose open price is missing or not numeric are skipped and do not
    count towards any window.
    """
    longest = max(WINDOWS)

    # Price rows dated after the last tweet date keep the final running total
    # rather than dropping back to 0. The string comparison below assumes dates
    # are ISO formatted (YYYY-MM-DD) on both sides - TODO confirm for the price CSVs.
    if sentiment_data:
        last_sentiment_date = max(sentiment_data)
        final_cum_sentiment = sentiment_data[last_sentiment_date]['Cum_Sentiment']
    else:
        last_sentiment_date = None
        final_cum_sentiment = 0

    open_prices = []  # most recent open prices, at most `longest` entries
    up_moves = []     # most recent targets (1 = up, 0 = not up), at most `longest`
    previous_open_price = None

    for row in rows:
        try:
            date_str = row[date_index]
            current_open_price = float(row[open_index])
        except (ValueError, IndexError):
            # Non-numeric price or a short/blank row: nothing to compute.
            continue

        open_prices.append(current_open_price)
        del open_prices[:-longest]
        moving_averages = [
            sum(open_prices[-window:]) / window if len(open_prices) >= window else None
            for window in WINDOWS
        ]

        if previous_open_price is None:
            daily_return = None
            target = None
        else:
            daily_return = (current_open_price - previous_open_price) / previous_open_price
            target = 1 if current_open_price > previous_open_price else 0
            up_moves.append(target)
            del up_moves[:-longest]
        previous_open_price = current_open_price

        # How many times the price has gone up during the last week, month or year.
        up_counts = [up_moves[-window:].count(1) for window in WINDOWS]

        sentiment_info = sentiment_data.get(date_str)
        if sentiment_info is not None:
            sentiment = sentiment_info['Sentiment']
            cum_sentiment = sentiment_info['Cum_Sentiment']
        elif last_sentiment_date is not None and date_str > last_sentiment_date:
            sentiment = 0
            cum_sentiment = final_cum_sentiment
        else:
            sentiment = 0
            cum_sentiment = 0

        yield row + [sentiment, cum_sentiment, *up_counts, *moving_averages, daily_return, target]


output_dir = "preprocessed-data/input-data/"

for file in glob.glob("../Dataset/price/raw/*.csv"):
    name = os.path.basename(file)
    stock_name = name.split(".")[0]

    # Only stocks that have sentiment data get an enriched file.
    if stock_name not in stock_sentiments:
        continue

    # newline='' on both files, as the csv module requires for correct handling
    # of line endings and quoted newlines.
    with open(file, "r", newline='') as infile:
        reader = csv.reader(infile)
        header = next(reader, None)
        if header is None:
            continue  # empty file: leave any existing output untouched

        # list.index raises a clear ValueError when a required column is missing.
        date_index = header.index('Date')
        open_index = header.index('Open')

        os.makedirs(output_dir, exist_ok=True)
        with open(os.path.join(output_dir, name), "w", newline='') as outfile:
            writer = csv.writer(outfile)
            writer.writerow(header + EXTRA_COLUMNS)
            # None values are written by csv.writer as empty fields.
            writer.writerows(
                build_feature_rows(reader, date_index, open_index, stock_sentiments[stock_name])
            )

print('Added target and sentiment values to CSV files')

Added target and sentiment values to CSV files
