In [152]:
import pymongo
import pandas as pd
import numpy as np
import re
import time
import datetime
import nltk
from datetime import datetime, timedelta
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import bson

In [395]:
client = pymongo.MongoClient('localhost', 27017)
db = client.db
threads = db.threads
companies = db.companylist
matches = db.matches
chunks = db.chunks

df = pd.DataFrame.from_records(threads.find({'Label' :{'$ne': 0}}))
companydf = pd.DataFrame.from_records(companies.find())

In [348]:
def getContinuousChunks(text):
    chunked = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text)))
    continuousChunkdf = pd.DataFrame(columns = ['id', 'Named Entity', 'Label'])
    currentChunk = []
    currentLabel = []
    for i in chunked:
        if type(i) == nltk.tree.Tree:
            currentChunk.append(" ".join([token for token, pos in i.leaves()]))
            currentLabel.append(i.label())
        if currentChunk:
            namedEntity = " ".join(currentChunk)
            label = " ".join(currentLabel)
            if namedEntity not in continuousChunkdf['Named Entity']:
                d = {'id': 0, 'Named Entity': namedEntity, 'Label': label}
                continuousChunkdf = continuousChunkdf.append(d, ignore_index = True)
                currentChunk = []
                currentLabel = []
        else:
            continue
    return continuousChunkdf

def mapResults(result, leftNames, rightNames, threadID):
    matchdf = pd.DataFrame(0, index = np.arange(len(result.nonzero()[0])), columns = ['id','Left', 'Right', 'Similarity'])
    for i in range(len(result.nonzero()[0])):
        matchdf.loc[i, 'Left'] = leftNames[result.nonzero()[0][i]]
        matchdf.loc[i, 'Right'] = rightNames[result.nonzero()[1][i]]
        matchdf.loc[i, 'Similarity'] = result[result.nonzero()[0][i]][result.nonzero()[1][i]]
    matchdf['id'] = threadID
    return matchdf.drop_duplicates(subset = 'Right')

In [390]:
companyNames = companydf['Name'].unique()
companySymbols = companydf['Symbol'].unique()
nameVectorizer = TfidfVectorizer(min_df = 1)
symbolVectorizer = TfidfVectorizer(min_df = 1)
companyMatrix = nameVectorizer.fit_transform(companyNames)
symbolMatrix = symbolVectorizer.fit_transform(companySymbols)
matchdf = pd.DataFrame(columns = ['id', 'Left', 'Right', 'Similarity'])
nerdf = pd.DataFrame(columns = ['id', 'Named Entity', 'Label'])

for i in range(len(df['Body'])):
    sent = df['Title'][i] + '. ' + df['Body'][i]
    threadID = df['_id'][i]
    chunkdf = getContinuousChunks(sent)
    chunkdf['id'] = threadID
    nerdf = nerdf.append(chunkdf)

    if len(chunkdf.loc[chunkdf['Label'] == 'ORGANIZATION']) > 0:
        nerNameMatrix = nameVectorizer.transform(chunkdf.loc[chunkdf['Label'] == 'ORGANIZATION','Named Entity'])
        nerSymbolMatrix = symbolVectorizer.transform(chunkdf.loc[chunkdf['Label'] == 'ORGANIZATION','Named Entity'])
        nameResult = cosine_similarity(nerNameMatrix, companyMatrix)
        nameResult[nameResult < 0.8] = 0
        symbolResult = cosine_similarity(nerSymbolMatrix, symbolMatrix)
        symbolResult[symbolResult < 1] = 0
        namedf = mapResults(nameResult, chunkdf['Named Entity'], companyNames, threadID)
        symboldf = mapResults(symbolResult, chunkdf['Named Entity'], companySymbols, threadID)
        matchdf = matchdf.append(namedf)
        matchdf = matchdf.append(symboldf)
    stocklist = {key: None for key in set(namedf['Right'].append(symboldf['Right']))}
    df.loc[df['_id'] == threadID, 'Stocks'] = [stocklist]


In [391]:
matchdf

Unnamed: 0,id,Left,Right,Similarity
0,5f766db0c41ae878a718020e,Vornado Realty Trust,Kimco Realty Corporation,0.961022
0,5f766db0c41ae878a718020e,Vornado Realty Trust,SPG,1.000000
1,5f766db0c41ae878a718020e,Vornado Realty Trust,SPG^J,1.000000
0,5f766db0c41ae878a7180210,SPCE Virgin Galactic,"Virgin Galactic Holdings, Inc.",0.948405
1,5f766db0c41ae878a7180210,SPCE,VG Acquisition Corp.,0.826580
...,...,...,...,...
0,5f7b395a6d0682ca2381a86f,TTM Squeeze,"TTM Technologies, Inc.",0.863814
0,5f7b395a6d0682ca2381a86f,AMD Technicals,AMD,1.000000
1,5f7b395a6d0682ca2381a86f,TTM Squeeze,TTM,1.000000
0,5f7b395a6d0682ca2381a870,France Spain,SQQQ,1.000000


In [392]:
nerdf

Unnamed: 0,id,Named Entity,Label
0,5f766db0c41ae878a718020e,New York,GPE
1,5f766db0c41ae878a718020e,Vornado Realty Trust,PERSON
2,5f766db0c41ae878a718020e,Cali Realty CLI,PERSON
3,5f766db0c41ae878a718020e,AvalonBay,ORGANIZATION
4,5f766db0c41ae878a718020e,Boston Properties,PERSON
...,...,...,...
0,5f7b395a6d0682ca2381a876,CVAC,ORGANIZATION
1,5f7b395a6d0682ca2381a876,Oblivion,GPE
2,5f7b395a6d0682ca2381a876,CVAC,ORGANIZATION
3,5f7b395a6d0682ca2381a876,MINIMUM,ORGANIZATION


In [393]:
df

Unnamed: 0,_id,Inserted Date,Title,Body,Tag,Label,Stocks
0,5f766db0c41ae878a718020e,2020-10-01 23:18:55.840,List of companies that are fucked next year,For those who aren t aware new york is going i...,DD,bearish,"{'Kimco Realty Corporation': None, 'SPG^J': No..."
1,5f766db0c41ae878a7180210,2020-10-01 23:18:55.840,VIRGIN GALACTIC ROCKET SHIPS ARE COOL PT $180 ...,VIRGIN GALACTIC ROCKET SHIPS ARE COOL PT $180...,DD,bullish,"{'SPCE': None, 'VG': None, 'LOVE': None, 'NYC'..."
2,5f766db0c41ae878a7180212,2020-10-01 23:18:55.840,Updated S P500 Technical Analysis Bears Are St...,Last week I shared my S P500 forecast and brok...,Technicals,bullish,"{'MMS': None, 'IMO': None, 'CAN': None}"
3,5f766db0c41ae878a7180215,2020-10-01 23:18:55.840,Procter Gamble to the fucking MOON DD inside,Procter Gamble 10/27/2020 Q1 Earnings Play NY...,DD,bullish,"{'Unilever PLC': None, 'TSLA': None, 'NEW': No..."
4,5f766db0c41ae878a7180217,2020-10-01 23:18:55.840,WHY IS NO ONE TALKING ABOUT BABA TODAY,This sub introduced me to BABA. I am BABA gang...,Discussion,bullish,{'BABA': None}
...,...,...,...,...,...,...,...
79,5f7787fc1d41cb76fd11304d,2020-10-02 20:05:16.466,Snowflake lockup period,Hi Snowflake has gone up a lot since IPO. I as...,Discussion,bearish,{}
80,5f7787fc1d41cb76fd11304e,2020-10-02 20:05:16.466,CHWY go go go,CHWY LISTEN UP RETARDS chwy was moon for the p...,YOLO,bullish,{'CHWY': None}
81,5f7b395a6d0682ca2381a86f,2020-10-05 15:18:50.943,AMD going to $100 This Month,I m looking at AMD Technicals https //imgur.co...,Technicals,bullish,"{'TTM Technologies, Inc.': None, 'TTM': None, ..."
82,5f7b395a6d0682ca2381a870,2020-10-05 15:18:50.943,Actual Second Wave,Back in early summer this year the US had a se...,DD,bearish,{'SQQQ': None}


In [394]:
bulk = threads.initialize_ordered_bulk_op()
for _, row in df.iterrows():
    _id = row['_id']
    newVal = row['Stocks']
    bulk.find({'_id': _id}).update_one({'$set':{'Stocks':newVal}})
bulk.execute()

{'writeErrors': [],
 'writeConcernErrors': [],
 'nInserted': 0,
 'nUpserted': 0,
 'nMatched': 84,
 'nModified': 84,
 'nRemoved': 0,
 'upserted': []}

In [None]:
for _, row in matchdf.iterrows():
    _id = row['id']
    newVal = {'Left': row['Left'], 'Right': row['Right'], 'Similarity': row['Similarity']}
    bulk.find({'_id': _id}).update_one({'$set': newVal})
bulk.execute()

In [399]:
for _, row in nerdf.iterrows():
    _id = row['id']
    newVal = {'Named Entity': row['Named Entity'], 'Label': row['Label']}
    bulk.find({'_id': _id}).update_one({'$set': newVal})
bulk.execute()

Unnamed: 0,id,Left,Right,Similarity
0,5f766db0c41ae878a718020e,Vornado Realty Trust,Kimco Realty Corporation,0.961022
0,5f766db0c41ae878a718020e,Vornado Realty Trust,SPG,1.000000
1,5f766db0c41ae878a718020e,Vornado Realty Trust,SPG^J,1.000000
0,5f766db0c41ae878a7180210,SPCE Virgin Galactic,"Virgin Galactic Holdings, Inc.",0.948405
1,5f766db0c41ae878a7180210,SPCE,VG Acquisition Corp.,0.826580
...,...,...,...,...
0,5f7b395a6d0682ca2381a86f,TTM Squeeze,"TTM Technologies, Inc.",0.863814
0,5f7b395a6d0682ca2381a86f,AMD Technicals,AMD,1.000000
1,5f7b395a6d0682ca2381a86f,TTM Squeeze,TTM,1.000000
0,5f7b395a6d0682ca2381a870,France Spain,SQQQ,1.000000
