In [152]:
import pymongo
import pandas as pd
import numpy as np
import re
import time
import datetime
import nltk
from datetime import datetime, timedelta
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import bson

In [153]:
client = pymongo.MongoClient('localhost', 27017)
db = client.db
threads = db.threads
companies = db.companylist

df = pd.DataFrame.from_records(threads.find({'Label' :{'$ne': 0}}))
companydf = pd.DataFrame.from_records(companies.find())

In [154]:
def getContinuousChunks(text):
    chunked = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text)))
    continuousChunk = []
    currentChunk = []
    for i in chunked:
        if type(i) == nltk.tree.Tree and i.label() in {'ORGANIZATION'}:
            currentChunk.append(" ".join([token for token, pos in i.leaves()]))
        if currentChunk:
            namedEntity = " ".join(currentChunk)
            if namedEntity not in continuousChunk:
                continuousChunk.append(namedEntity)
                currentChunk = []
        else:
            continue
    return continuousChunk

def mapResults(results, leftNames, rightNames, threadID):
    matchdf = pd.DataFrame(0, index = np.arange(len(result.nonzero()[0])), columns = ['_id','Left', 'Right', 'Similarity'])
    for i in range(len(result.nonzero()[0])):
        matchdf.loc[i, 'Left'] = leftNames[result.nonzero()[0][i]]
        matchdf.loc[i, 'Right'] = rightNames[result.nonzero()[1][i]]
        matchdf.loc[i, 'Similarity'] = result[result.nonzero()[0][i]][result.nonzero()[1][i]]
    matchdf['_id'] = threadID
    return matchdf

In [155]:
companyNames = companydf['Name'].unique()
companySymbols = companydf['Symbol'].unique()
vectorizer = TfidfVectorizer(min_df = 1)
companyMatrix = vectorizer.fit_transform(companyNames)
symbolMatrix = vectorizer.fit_transform(companySymbols)
matchdf = pd.DataFrame(columns = ['_id', 'Left', 'Right', 'Similarity'])

for i in range(len(df['Body'])):
    sent = df['Title'][i] + '. ' + df['Body'][i]
    threadID = df['_id'][i]
    ner = list(set(getContinuousChunks(sent)))

    if len(ner) > 0:
        nerMatrix = vectorizer.transform(pd.Series(ner))
        result = cosine_similarity(nerMatrix, symbolMatrix)
        result[result < 0.8] = 0
        matchdf = matchdf.append(mapResults(result, ner, companySymbols, threadID)).sort_values('Similarity', ascending = False)

In [156]:
matchdf

Unnamed: 0,_id,Left,Right,Similarity
0,5f766db0c41ae878a718020e,SPG Kimco Realty,SPG,1.000000
2,5f766db0c41ae878a7180233,KODK,KODK,1.000000
24,5f766db0c41ae878a718022d,PCG,PCG^D,1.000000
31,5f766db0c41ae878a718022d,PCG ENDED,PCG^B,1.000000
1,5f766db0c41ae878a7180212,IMO,IMO,1.000000
...,...,...,...,...
0,5f7b395a6d0682ca2381a870,SQQQ,SQQQ,1.000000
1,5f7b395a6d0682ca2381a876,CVAC MINIMUM,CVAC,1.000000
5,5f766db0c41ae878a7180228,FVAC,FVAC.WS,0.842509
2,5f766db0c41ae878a7180228,FAII,FAII.WS,0.842509


In [151]:
sent = df['Title'][5] + '. ' + df['Body'][5]
threadID = df['_id'][i]
ner = list(set(getContinuousChunks(sent)))

if len(ner) > 0:
    nerMatrix = vectorizer.transform(pd.Series(ner))
    result = cosine_similarity(nerMatrix, symbolMatrix)
    ##result[result < 0.8] = 0
mapResults(result, ner, companySymbols, threadID).sort_values('Similarity', ascending = False)

Unnamed: 0,_id,Left,Right,Similarity
0,5f7b395a6d0682ca2381a876,AMD VOL,AMD,1.0
1,5f7b395a6d0682ca2381a876,AMD GPU,AMD,1.0
2,5f7b395a6d0682ca2381a876,AMD,AMD,1.0
3,5f7b395a6d0682ca2381a876,AMD AMD,AMD,1.0
4,5f7b395a6d0682ca2381a876,AMD CPU,AMD,1.0
5,5f7b395a6d0682ca2381a876,AMD AMD AMD,AMD,1.0
6,5f7b395a6d0682ca2381a876,AMD IMPORTANT,AMD,1.0
7,5f7b395a6d0682ca2381a876,CPU AMD,AMD,1.0


In [146]:
ner

['ITM',
 'AMD VOL',
 'AMD GPU',
 'CPU',
 'PS5',
 'RDNA',
 'GPU',
 'NVIDIA',
 'AMD',
 'OTM',
 'XBOX',
 'AMD AMD',
 'AMD CPU',
 'Intel',
 'AMD AMD AMD',
 'AMD IMPORTANT',
 'CPU AMD']