In [1]:
# Input CSVs: preprocessed tweets and preprocessed USD prices.
# NOTE(review): the /content/drive/My Drive prefix looks like a Colab Google Drive
# mount — confirm the drive is mounted before running the notebook.
TWEETS_FILE = "/content/drive/My Drive/project/dataset/preprocessed/tweets/dataset_model.csv"
USD_FILE = "/content/drive/My Drive/project/dataset/preprocessed/usd_prices/usd_prices_model.csv"
# Output CSVs: destinations for the train and test splits written at the end of the notebook.
OUTPUT_TRAIN_FILE = "/content/drive/My Drive/project/dataset/model_data_train.csv"
OUTPUT_TEST_FILE = "/content/drive/My Drive/project/dataset/model_data_test.csv"

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [3]:
# Read both CSVs and keep only the underlying NumPy arrays; later cells address
# columns by position rather than by name.
tweets, usd_prices = (
  pd.read_csv(csv_path).values for csv_path in (TWEETS_FILE, USD_FILE)
)

# One shape per line, tweets first.
print(tweets.shape, usd_prices.shape, sep="\n")

(21831, 3)
(1377969, 5)


In [4]:
# Drop tweets whose column-1 timestamp is before the cutoff, then order the
# remainder by that timestamp.
is_recent = tweets[:, 1] >= 1484931600
tweets = tweets[is_recent]
chronological = tweets[:, 1].argsort()
tweets = tweets[chronological]

# Merge consecutive tweets into groups: a tweet joins the current group when it
# arrives no more than 5 minutes (5*60 in timestamp units) after the previous
# tweet; otherwise it opens a new group. Each group records the concatenated
# text plus the timestamps of its first and last tweet.
tweet_groups = []
last_tweet_time = 0
for tweet in tweets:
  timestamp, text = tweet[1], tweet[2]
  continues_group = timestamp <= last_tweet_time + 5*60
  if not continues_group:
    # Gap too large: start a fresh group with this tweet.
    tweet_groups.append({"text": text, "start": timestamp, "end": timestamp})
  else:
    # Close enough to the previous tweet: extend the latest group.
    current_group = tweet_groups[-1]
    current_group["text"] += " " + text
    current_group["end"] = timestamp
  last_tweet_time = timestamp

In [5]:
# Display how many tweet groups the grouping step produced.
len(tweet_groups)

17452

In [6]:
def max_by_abs(a, b):
  """Return whichever of ``a`` and ``b`` has the larger absolute value.

  The sign of the winner is preserved. ``a`` is returned only when its
  magnitude is strictly greater; on a tie ``b`` is returned.
  """
  if abs(a) > abs(b):
    return a
  return b

In [7]:
# Keep price rows whose column-0 timestamp is at or after the cutoff, then sort
# the remaining rows by that timestamp in ascending order.
price_is_recent = usd_prices[:, 0] >= 1484931600
usd_prices = usd_prices[price_is_recent]
price_order = usd_prices[:, 0].argsort()
usd_prices = usd_prices[price_order]

In [8]:
# Build [text, label] samples. For each tweet group, take the price rows whose
# timestamp lies between the group's start and the next group's start (both
# ends inclusive). The label is the largest-magnitude difference between any
# later row's column 1 / column 2 value and the first row's average of those two
# columns. The last group is not used as a sample because it has no following
# group to close its window.

# usd_prices is sorted by column 0 in the preceding cell, so each window is a
# contiguous run of rows. Binary search finds its bounds in O(log n) instead of
# building a boolean mask over the whole price array for every group.
price_times = usd_prices[:, 0].astype(np.float64)
assert np.all(price_times[:-1] <= price_times[1:]), "usd_prices must be sorted by column 0"

final_dataset = []
for i in range(len(tweet_groups)-1):
  group = tweet_groups[i]
  next_group = tweet_groups[i+1]

  # side="left" keeps rows equal to the group start; side="right" keeps rows
  # equal to the next group's start, matching the inclusive >= / <= bounds.
  first_row = np.searchsorted(price_times, group["start"], side="left")
  end_row = np.searchsorted(price_times, next_group["start"], side="right")
  prices = usd_prices[first_row:end_row]
  if prices.shape[0] == 0:
    # No data available
    continue

  # Reference value: average of columns 1 and 2 of the first row in the window.
  start = ( prices[0, 1] + prices[0, 2] ) / 2

  # Signed difference with the largest absolute value over the remaining rows;
  # stays 0 when the window holds only the reference row.
  max_diff = 0
  for j in range(1, prices.shape[0]):
    max_diff = max_by_abs(max_diff, prices[j, 1] - start)
    max_diff = max_by_abs(max_diff, prices[j, 2] - start)

  final_dataset.append([
    group["text"],
    # Labels are stored scaled by 1e6 (rationale for the factor is not shown in this notebook).
    max_diff * 1000000.0
  ])

In [9]:
# Shuffle the samples and hold out 20% of them as the test set; the fixed
# random_state makes the split repeatable across runs.
train_dataset, test_dataset = train_test_split(
  final_dataset,
  test_size=0.2,
  shuffle=True,
  random_state=963,
)

In [10]:
# Wrap each split in a two-column DataFrame: the grouped tweet text and its label.
train_df, test_df = (
  pd.DataFrame(split_rows, columns=["text", "labels"])
  for split_rows in (train_dataset, test_dataset)
)

In [11]:
# Summary statistics of the numeric label column, train split first, then test.
print(train_df.describe(), test_df.describe(), sep="\n")

             labels
count  10811.000000
mean      -4.531496
std     1803.494132
min   -25435.000000
25%     -540.000000
50%       90.000000
75%      555.000000
max    15115.000000
             labels
count   2703.000000
mean      -7.941176
std     1765.325704
min   -18445.000000
25%     -562.500000
50%      -50.000000
75%      545.000000
max    11775.000000


In [12]:
# Persist both splits as CSV without the DataFrame index column.
for split_frame, destination in (
  (train_df, OUTPUT_TRAIN_FILE),
  (test_df, OUTPUT_TEST_FILE),
):
  split_frame.to_csv(destination, index=False)