In [1]:
import pymongo
import pandas as pd
import numpy as np
import re
import time
import datetime
import nltk
from datetime import datetime, timedelta
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from bson.objectid import ObjectId

In [2]:
# Connect to the local MongoDB server and grab the collections used in this notebook.
client = pymongo.MongoClient(host='localhost', port=27017)
db = client['db']
threads = db['threads']
companies = db['companylist']
matches = db['matches']
chunks = db['chunks']

# Fetch the lexicon for the VADER sentiment analyzer imported above.
nltk.download('vader_lexicon')

# Load every thread whose 'Label' field is not 0 into a DataFrame.
labelled_threads = threads.find({'Label': {'$ne': 0}})
df = pd.DataFrame.from_records(labelled_threads)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/user/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [3]:
sir = SentimentIntensityAnalyzer()

# Count the threads where the sign of VADER's compound score contradicts the
# stored label: a negative score on a 'bullish' thread, or a positive score on
# a 'bearish' one. Each disagreement is printed as "<score>:<label>".
count = 0

# Walk the column values directly instead of df.loc[i, ...] with
# i in range(len(df)), so the loop does not rely on the index being 0..n-1.
# (The former one-off probe of row 77 was dropped: its result was discarded
# and it raised KeyError on frames with fewer than 78 rows.)
for title, body, label in zip(df['Title'], df['Body'], df['Label']):
    # `sent` intentionally stays bound after the loop; a later cell builds a
    # flair Sentence from it.
    sent = title + ' ' + body
    score = sir.polarity_scores(sent)['compound']
    if (score < 0 and label == 'bullish') or (score > 0 and label == 'bearish'):
        print(str(score) + ':' + label)
        count = count + 1

-0.9077:bullish
-0.7453:bullish
-0.4522:bullish
-0.9222:bullish
0.9274:bearish
0.8258:bearish
0.1027:bearish
-0.6037:bullish
-0.9495:bullish
0.8022:bearish
-0.5267:bullish
-0.5994:bullish
0.3716:bearish
0.8689:bearish
-0.743:bullish
0.4791:bearish
-0.608:bullish
0.2263:bearish
-0.8371:bullish
-0.8185:bullish
0.6326:bearish
0.9801:bearish
-0.0874:bullish
-0.953:bullish
-0.25:bullish
-0.5574:bullish
0.5978:bearish
0.9976:bearish
-0.8658:bullish
-0.4246:bullish
-0.9578:bullish
-0.3182:bullish
-0.1027:bullish
-0.5829:bullish
-0.2313:bullish
-0.8578:bullish
0.7506:bearish
0.7063:bearish
-0.5789:bullish
-0.4939:bullish
0.8266:bearish
-0.3533:bullish
-0.4767:bullish
-0.3249:bullish
-0.1927:bullish
0.4404:bearish
0.323:bearish
-0.7814:bullish
0.296:bearish
-0.7025:bullish
0.9917:bearish
-0.5106:bullish
-0.599:bullish
0.8943:bearish
0.8957:bearish
-0.6369:bullish
0.9042:bearish
-0.5106:bullish
0.8236:bearish
0.9208:bearish
-0.1027:bullish
0.6739:bearish
0.9169:bearish
-0.3972:bullish
-0.1655:bu

In [4]:
import flair
from flair.models import TextClassifier
from flair.data import Sentence

def cleanText(text):
    """Return `text` with every http:// or https:// link replaced by one space.

    A link is matched from its scheme up to (not including) the next
    whitespace character.
    """
    # Raw string: in a plain literal "\S" is an invalid escape sequence, which
    # newer Python versions warn about when the cell is compiled.
    return re.sub(r"https?://\S+", ' ', text)

In [5]:
# Load flair's pre-trained 'en-sentiment' model.
classifier = TextClassifier.load('en-sentiment')

# NOTE: `sent` is whatever the VADER loop in the earlier cell left behind,
# i.e. the title + body of the last thread it visited.
sentence = Sentence(sent)
classifier.predict(sentence)

# The prediction is attached to `sentence`; end the cell with its labels so
# the result is actually displayed instead of being silently discarded.
sentence.labels

2020-10-29 21:27:11,966 loading file /home/user/.flair/models/sentiment-en-mix-distillbert_3.1.pt


In [6]:
# Assemble the (Label, Text) frame: Text is each thread's title and body
# joined by a single space, with links stripped by cleanText.
thread_text = df['Title'] + ' ' + df['Body']
labelDf = pd.DataFrame({
    'Label': df['Label'],
    'Text': thread_text.apply(cleanText),
})

In [7]:
# Prefix every label with '__label__' (presumably the FastText-style marker
# the flair corpus reader expects — confirm against the flair docs).
# The value is derived from df['Label'] rather than from labelDf['Label']
# itself, so re-running this cell cannot stack a second '__label__' prefix.
# Column assignment aligns on the index, so this stays correct even if
# labelDf has already been shuffled.
labelDf['Label'] = '__label__' + df['Label'].astype(str)

In [8]:
# Shuffle the rows before the train/test/dev split. The fixed random_state
# makes the shuffle, and therefore the split written out below, reproducible
# on a fresh kernel run.
labelDf = labelDf.sample(frac = 1, random_state = 42)

In [9]:
# Write three headerless, tab-separated files: the first 80% of the rows go
# to train.csv, the next 10% to test.csv and the remainder to dev.csv.
n_rows = len(labelDf)
train_end = int(n_rows*0.8)
test_end = int(n_rows*0.9)
csv_options = dict(sep = '\t', index = False, header = False)

labelDf.iloc[:train_end].to_csv('train.csv', **csv_options)
labelDf.iloc[train_end:test_end].to_csv('test.csv', **csv_options)
labelDf.iloc[test_end:].to_csv('dev.csv', **csv_options)

In [10]:
from flair.datasets import ClassificationCorpus
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentLSTMEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from pathlib import Path
# Read the three split files from the current directory as a flair corpus.
corpus = ClassificationCorpus(
    Path('./'),
    train_file='train.csv',
    dev_file='dev.csv',
    test_file='test.csv',
)

# Token-level embeddings: GloVe plus the fast forward/backward flair models,
# fed into an LSTM document embedding.
word_embeddings = [
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward-fast'),
    FlairEmbeddings('news-backward-fast'),
]
document_embeddings = DocumentLSTMEmbeddings(
    word_embeddings,
    hidden_size=512,
    reproject_words=True,
    reproject_words_dimension=256,
)

# Single-label classifier over the label set found in the corpus.
# NOTE: this rebinds `classifier`, which an earlier cell used for the
# pre-trained 'en-sentiment' model.
label_dictionary = corpus.make_label_dictionary()
classifier = TextClassifier(
    document_embeddings,
    label_dictionary=label_dictionary,
    multi_label=False,
)

# Train for at most 10 epochs; './' is handed to the trainer as its first
# argument (presumably the output directory — confirm in the flair docs).
trainer = ModelTrainer(classifier, corpus)
trainer.train('./', max_epochs=10)

2020-10-29 21:27:14,220 Reading data from .
2020-10-29 21:27:14,222 Train: train.csv
2020-10-29 21:27:14,226 Dev: dev.csv
2020-10-29 21:27:14,229 Test: test.csv
2020-10-29 21:27:16,153 Computing label dictionary. Progress:
100%|██████████| 971/971 [00:06<00:00, 158.21it/s]2020-10-29 21:27:23,515 [b'neutral', b'bullish', b'bearish']

2020-10-29 21:27:23,556 ----------------------------------------------------------------------------------------------------
2020-10-29 21:27:23,558 Model: "TextClassifier(
  (document_embeddings): DocumentLSTMEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('glove')
      (list_embedding_1): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
      (list_embedding_2): FlairEmbeddings(
        (lm): LanguageMod