In [None]:
import numpy
import csadata.twitter as twitter
import csadata.crypto as crypto
from datetime import datetime, timezone

The following script loads a small sample dataset of both Dogecoin price and Tweet data and joins them into a single
data set. The Tweet data is organized in a directory structure with a directory time interval of 1 day and a file time
interval of 15 minutes. The Dogecoin price data is given as candlestick data, covering time intervals of 15 minutes
each. Both data sets contain data from 2021/06/01 until 2021/06/06.
The resulting data set indicates for each Tweet if the price in the upcoming 15-minute interval increases or decreases.

In [None]:
# Time granularity of the Tweet data layout, in seconds
dir_interval = 60 * 60 * 24 # 24 hour directory interval
file_interval = 60 * 15 # 15 minutes file interval

# Bounds of the sample data set as UTC Unix timestamps in whole seconds
start = int(datetime(2021, 6, 1, 0, 0, 0, tzinfo=timezone.utc).timestamp()) # 06/01/2021 00:00:00 UTC
end = int(datetime(2021, 6, 6, 23, 59, 59, tzinfo=timezone.utc).timestamp()) # 06/06/2021 23:59:59 UTC

crypto_path = None # Insert the path to the Dogecoin price data CSV file or its directory

# Load price data; the bounds are passed scaled by 1000 (presumably the loader expects milliseconds - confirm)
price_data = crypto.load_crypto_price_data(crypto.PRICE_DOGECOIN_USDT, crypto.INTERVAL_15MINUTE, start * 1000,
                                           end * 1000, crypto_path)

# Open time of every candle except the last one (which has no successor), scaled down by 1000 to match the
# second-based Tweet timestamps
open_times = (price_data[:-1, crypto.IDX_OPEN_TIME] / 1000).astype(int)

# Label per candle: 1 if the subsequent candle closes higher than the current one, 0 otherwise.
# NOTE: the comparison must be "next > current"; the reverse would label falling prices as 1.
# Assumes the rows of price_data are in chronological order - confirm against the loader.
close_prices = price_data[:, crypto.IDX_CLOSE]
price_rises = (close_prices[1:] > close_prices[:-1]).astype(int)

# Two-column array: [open time, label]
price_changes = numpy.column_stack((open_times, price_rises))

# Use a dictionary for faster lookups when assigning price changes to single Tweets.
# Keys are interval open times, values are the 0/1 labels (both as plain Python ints).
price_change_dict = dict(zip(open_times.tolist(), price_rises.tolist()))

The following step counts all Tweets in the given data set and initializes the corresponding data arrays.

In [None]:
twitter_path = None # Insert the path to the Tweet data root directory

# First pass over the Tweet data: only count the entries so the arrays can be allocated up front
with twitter.TweetCSVReader(path=twitter_path, start=start, end=end) as csv_reader:
    num_tweets = sum(1 for _ in csv_reader)

# Feature matrix (one row per Tweet, 7 columns) and label vector (one entry per Tweet), zero-initialized
X = numpy.zeros((num_tweets, 7))
y = numpy.zeros(num_tweets)

We then load the actual Tweets and assign values to the data arrays. The value of
`X` is a `num_tweets`-by-7 NumPy array in which each row corresponds to a single data point (Tweet), with columns for
the like, reply, retweet and quote counts as well as for the negativity, neutrality and positivity sentiment values.
The value of `y` is a NumPy array of length `num_tweets` where each value indicates whether the Dogecoin price in the
subsequent time interval closes lower (0) or higher (1) than in the current interval.

In [None]:
# Second pass over the Tweet data: fill the feature matrix and the label vector.
# Tweets without a known price change are skipped, so rows are written compactly via a separate
# row counter instead of the Tweet's position in the reader; otherwise skipped Tweets would leave
# all-zero rows with label 0 behind, which are indistinguishable from real samples.
num_labeled = 0

with twitter.TweetCSVReader(path=twitter_path, start=start, end=end) as csv_reader:
    for tweet in csv_reader:
        # Look up price change based on the base time of the current time interval
        base_time = int(tweet.time // file_interval * file_interval)
        price_change = price_change_dict.get(base_time)

        if price_change is None:
            # End of whole time interval (no subsequent time interval): no label available
            continue

        X[num_labeled, 0:4] = [tweet.like_count, tweet.reply_count, tweet.retweet_count, tweet.quote_count]
        # X[num_labeled, 4:] = numpy.array([negativity, neutrality, positivity]) -> Assign sentiment values for this Tweet
        y[num_labeled] = price_change
        num_labeled += 1

# Drop the unused rows at the end that belong to the skipped Tweets and keep num_tweets in sync
# with the actual number of rows in X and y
X = X[:num_labeled]
y = y[:num_labeled]
num_tweets = num_labeled

At this point we can use `X` and `y` to train the respective classifiers.