In [152]:
import pymongo
import pandas as pd
import numpy as np
import re
import time
import datetime
import nltk
from datetime import datetime, timedelta
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import bson

In [153]:
client = pymongo.MongoClient('localhost', 27017)
db = client.db
threads = db.threads
companies = db.companylist

df = pd.DataFrame.from_records(threads.find({'Label' :{'$ne': 0}}))
companydf = pd.DataFrame.from_records(companies.find())

In [189]:
def getContinuousChunks(text):
    chunked = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text)))
    continuousChunk = []
    currentChunk = []
    for i in chunked:
        if type(i) == nltk.tree.Tree and i.label() in {'ORGANIZATION'}:
            currentChunk.append(" ".join([token for token, pos in i.leaves()]))
        if currentChunk:
            namedEntity = " ".join(currentChunk)
            if namedEntity not in continuousChunk:
                continuousChunk.append(namedEntity)
                currentChunk = []
        else:
            continue
    return continuousChunk

def mapResults(result, leftNames, rightNames, threadID):
    matchdf = pd.DataFrame(0, index = np.arange(len(result.nonzero()[0])), columns = ['_id','Left', 'Right', 'Similarity'])
    for i in range(len(result.nonzero()[0])):
        matchdf.loc[i, 'Left'] = leftNames[result.nonzero()[0][i]]
        matchdf.loc[i, 'Right'] = rightNames[result.nonzero()[1][i]]
        matchdf.loc[i, 'Similarity'] = result[result.nonzero()[0][i]][result.nonzero()[1][i]]
    matchdf['_id'] = threadID
    return matchdf.drop_duplicates(subset = 'Right')

In [192]:
companyNames = companydf['Name'].unique()
companySymbols = companydf['Symbol'].unique()
nameVectorizer = TfidfVectorizer(min_df = 1)
symbolVectorizer = TfidfVectorizer(min_df = 1)
companyMatrix = nameVectorizer.fit_transform(companyNames)
symbolMatrix = symbolVectorizer.fit_transform(companySymbols)
matchdf = pd.DataFrame(columns = ['_id', 'Left', 'Right', 'Similarity'])

for i in range(len(df['Body'])):
    sent = df['Title'][i] + '. ' + df['Body'][i]
    threadID = df['_id'][i]
    ner = list(set(getContinuousChunks(sent)))

    if len(ner) > 0:
        nerNameMatrix = nameVectorizer.transform(pd.Series(ner))
        nerSymbolMatrix = symbolVectorizer.transform(pd.Series(ner))
        nameResult = cosine_similarity(nerNameMatrix, companyMatrix)
        nameResult[nameResult < 0.8] = 0
        symbolResult = cosine_similarity(nerSymbolMatrix, symbolMatrix)
        symbolResult[symbolResult < 1] = 0
        matchdf = matchdf.append(mapResults(nameResult, ner, companyNames, threadID)).sort_values('Similarity', ascending = False)
        matchdf = matchdf.append(mapResults(symbolResult, ner, companySymbols, threadID)).sort_values('Similarity', ascending = False)

In [167]:
df.loc[df['_id'] == threadID, 'Stocks']

83    0
Name: Stocks, dtype: int64

In [193]:
matchdf

Unnamed: 0,_id,Left,Right,Similarity
0,5f767e4e2dd138c701623f9e,Palantir Technologies Inc,Palantir Technologies Inc.,1.000000
0,5f767e4e2dd138c701623fbb,LULU,LULU,1.000000
3,5f766db0c41ae878a718022e,JPM JPM,JPM^G,1.000000
0,5f767e4e2dd138c701623fbc,SSL,SSL,1.000000
7,5f766db0c41ae878a718022d,PCG PCG,PCG^G,1.000000
...,...,...,...,...
0,5f767e4e2dd138c701623fa6,Truist,Truist Financial Corporation,0.848358
0,5f767e4e2dd138c701623fb0,FB,FB Financial Corporation,0.848358
1,5f767e4e2dd138c701623fb7,Worthington,"Worthington Industries, Inc.",0.841008
0,5f766db0c41ae878a7180210,SpaceX VG,VG Acquisition Corp.,0.826580
