In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
required_stocks = {
    "albemarle": "alb",
    "ganfeng": "gnenf",
    "livent": "lthm",
    "lithium americas": "lac",
    "lg chem": "051910",
    "toshiba corp": "tosyy",
    "panasonic": "pcrfy",
    "samsung": "005930"
}

In [3]:
all_timestamps = []
all_headlines = []
all_tickers = []

In [4]:
DATA_DIR = os.path.join("NewsData", "Kaggle")

### Process dataset 1

In [5]:
print("Dataset size:", len(all_tickers))

FILENAME = "analyst_ratings_processed.csv"
dataset = pd.read_csv(os.path.join(DATA_DIR, FILENAME))

TIMESTAMP_HEADER = 'date'
HEADLINE_HEADER = 'title'
TICKER_HEADER = 'stock'

dataset[HEADLINE_HEADER] = dataset[HEADLINE_HEADER].str.lower()
dataset[TICKER_HEADER] = dataset[TICKER_HEADER].str.lower()

Dataset size: 0


In [6]:
for stock_name, stock_ticker in required_stocks.items():
    stock_df = dataset[dataset['title'].str.contains(stock_name) | dataset['stock'].str.contains(stock_ticker)]

    all_timestamps.extend(stock_df[TIMESTAMP_HEADER].tolist())
    all_headlines.extend(stock_df[HEADLINE_HEADER].tolist())
    all_tickers.extend(stock_df[TICKER_HEADER].tolist())

In [7]:
print(len(all_timestamps), len(all_headlines), len(all_tickers))

2345 2345 2345


### Process dataset 2

In [8]:
print("Dataset size:", len(all_tickers))

FILENAME = "abcnews-date-text.csv"
dataset = pd.read_csv(os.path.join(DATA_DIR, FILENAME))

TIMESTAMP_HEADER = 'publish_date'
HEADLINE_HEADER = 'headline_text'

dataset[HEADLINE_HEADER] = dataset[HEADLINE_HEADER].str.lower()

Dataset size: 2345


In [9]:
for stock_name, stock_ticker in required_stocks.items():
    stock_df = dataset[
        dataset[HEADLINE_HEADER].str.contains(stock_name)
        | dataset[HEADLINE_HEADER].str.contains(" " + stock_ticker)
        | dataset[HEADLINE_HEADER].str.contains(stock_ticker + " ")]

    all_timestamps.extend(stock_df[TIMESTAMP_HEADER].tolist())
    all_headlines.extend(stock_df[HEADLINE_HEADER].tolist())
    all_tickers.extend([stock_ticker]*stock_df.shape[0])


### Process dataset 3

In [10]:
print("Dataset size:", len(all_tickers))

FILENAME = "RedditNews.csv"
dataset = pd.read_csv(os.path.join(DATA_DIR, FILENAME))

TIMESTAMP_HEADER = 'Date'
HEADLINE_HEADER = 'News'

dataset[HEADLINE_HEADER] = dataset[HEADLINE_HEADER].str.lower()

Dataset size: 5447


In [11]:
for stock_name, stock_ticker in required_stocks.items():
    stock_df = dataset[
        dataset[HEADLINE_HEADER].str.contains(stock_name)
        | dataset[HEADLINE_HEADER].str.contains(" " + stock_ticker)
        | dataset[HEADLINE_HEADER].str.contains(stock_ticker + " ")]

    all_timestamps.extend(stock_df[TIMESTAMP_HEADER].tolist())
    all_headlines.extend(stock_df[HEADLINE_HEADER].tolist())
    all_tickers.extend([stock_ticker]*stock_df.shape[0])

### Process dataset 4

In [12]:
print("Dataset size:", len(all_tickers))

FILENAME = "us_equities_news_dataset.csv"
dataset = pd.read_csv(os.path.join(DATA_DIR, FILENAME))

TIMESTAMP_HEADER = 'release_date'
HEADLINE_HEADER = 'title'
TICKER_HEADER = 'ticker'

dataset[HEADLINE_HEADER] = dataset[HEADLINE_HEADER].str.lower()
dataset[TICKER_HEADER] = dataset[TICKER_HEADER].str.lower()

Dataset size: 5727


In [13]:
for stock_name, stock_ticker in required_stocks.items():
    stock_df = dataset[dataset[HEADLINE_HEADER].str.contains(stock_name) | dataset[TICKER_HEADER].str.contains(stock_ticker)]

    all_timestamps.extend(stock_df[TIMESTAMP_HEADER].tolist())
    all_headlines.extend(stock_df[HEADLINE_HEADER].tolist())
    all_tickers.extend(stock_df[TICKER_HEADER].tolist())

### Process dataset 5

In [14]:
print("Dataset size:", len(all_tickers))

FILENAME = "raw_partner_headlines.csv"
dataset = pd.read_csv(os.path.join(DATA_DIR, FILENAME))

TIMESTAMP_HEADER = 'date'
HEADLINE_HEADER = 'headline'
TICKER_HEADER = 'stock'

dataset[HEADLINE_HEADER] = dataset[HEADLINE_HEADER].str.lower()
dataset[TICKER_HEADER] = dataset[TICKER_HEADER].str.lower()

Dataset size: 6471


In [15]:
for stock_name, stock_ticker in required_stocks.items():
    stock_df = dataset[dataset[HEADLINE_HEADER].str.contains(stock_name) | dataset[TICKER_HEADER].str.contains(stock_ticker)]

    all_timestamps.extend(stock_df[TIMESTAMP_HEADER].tolist())
    all_headlines.extend(stock_df[HEADLINE_HEADER].tolist())
    all_tickers.extend(stock_df[TICKER_HEADER].tolist())

### Create final dataset

In [16]:
INDEX_HEADER = 'idx'
SOURCE_HEADER = 'source'

final_df = pd.DataFrame(list(zip(all_timestamps, all_headlines, all_tickers)), columns =[TIMESTAMP_HEADER, HEADLINE_HEADER, TICKER_HEADER])
print("Before deduplication", final_df.shape)
# Remove duplicate rows
final_df = final_df.drop_duplicates()
print("After deduplication", final_df.shape)

Before deduplication (9917, 3)
After deduplication (9858, 3)


In [17]:
final_df.head()

Unnamed: 0,date,headline,stock
0,2018-04-03 12:15:00-04:00,stocks that made new 52-wk lows today include:...,ads
1,2020-05-29 07:15:00-04:00,34 stocks moving in friday's pre-market session,alb
2,2020-05-28 10:06:00-04:00,"deutsche bank maintains hold on albemarle, rai...",alb
3,2020-05-26 10:42:00-04:00,shares of several basic material companies are...,alb
4,2020-05-20 11:54:00-04:00,shares of several basic materials companies ar...,alb


In [18]:
final_df.to_csv('final_data.csv')