In [109]:
import pymongo
import pandas as pd
import numpy as np
import re
import time
import datetime
import nltk
from datetime import datetime, timedelta
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import bson

In [17]:
client = pymongo.MongoClient('localhost', 27017)
db = client.db
threads = db.threads
companies = db.companylist

df = pd.DataFrame.from_records(threads.find({'Label' :{'$ne': 0}}))
companydf = pd.DataFrame.from_records(companies.find())

In [87]:
def getContinuousChunks(text):
    chunked = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text)))
    continuousChunk = []
    currentChunk = []
    for i in chunked:
        if type(i) == nltk.tree.Tree and i.label() in {'ORGANIZATION'}:
            currentChunk.append(" ".join([token for token, pos in i.leaves()]))
        if currentChunk:
            namedEntity = " ".join(currentChunk)
            if namedEntity not in continuousChunk:
                continuousChunk.append(namedEntity)
                currentChunk = []
        else:
            continue
    return continuousChunk
def ngrams(string, n=2):
    string = re.sub(r'[,-./]|\sBD',r'', string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]

def mapResults(results, leftNames, rightNames, threadID):
    matchdf = pd.DataFrame(0, index = np.arange(len(result.nonzero()[0])), columns = ['_id','Left', 'Right', 'Similarity'])
    for i in range(len(result.nonzero()[0])):
        matchdf.loc[i, 'Left'] = leftNames[result.nonzero()[0][i]]
        matchdf.loc[i, 'Right'] = rightNames[result.nonzero()[1][i]]
        matchdf.loc[i, 'Similarity'] = result[result.nonzero()[0][i]][result.nonzero()[1][i]]
    matchdf['_id'] = threadID
    return matchdf

In [129]:
companyNames = companydf['Name'].unique()
vectorizer = TfidfVectorizer(min_df = 1)
companyMatrix = vectorizer.fit_transform(companyNames)
matchdf = pd.DataFrame(columns = ['_id', 'Left', 'Right', 'Similarity'])

for i in range(len(df['Body'])):
    sent = df['Title'][i] + '. ' + df['Body'][i]
    threadID = df['_id'][i]
    ner = list(set(getContinuousChunks(sent)))
    if len(ner) > 0:
        nerMatrix = vectorizer.transform(pd.Series(ner))
        result = cosine_similarity(nerMatrix, companyMatrix)
        result[result < 0.8] = 0
        matchdf = matchdf.append(mapResults(result, ner, companyNames, threadID)).sort_values('Similarity', ascending = False)

In [130]:
matchdf

Unnamed: 0,_id,Left,Right,Similarity
0,5f767e4e2dd138c701623f9e,Palantir Technologies Inc,Palantir Technologies Inc.,1.0
0,5f76ab9a5252806f931cb6ed,JNJ Johnson,Johnson & Johnson,1.0
0,5f767e4e2dd138c701623fc6,Celsius Holdings,"Celsius Holdings, Inc.",0.985512
2,5f7787fc1d41cb76fd113042,PayPal Holdings,"PayPal Holdings, Inc.",0.985512
0,5f7787fc1d41cb76fd113037,NIO,NIO Inc.,0.982853
0,5f767e4e2dd138c701623fb8,Tesla,"Tesla, Inc.",0.982853
0,5f766db0c41ae878a7180234,K12,K12 Inc,0.982853
0,5f766db0c41ae878a718021f,Intuit,Intuit Inc.,0.982853
0,5f766db0c41ae878a7180226,NIO,NIO Inc.,0.982853
0,5f767e4e2dd138c701623fb2,ROKU,"Roku, Inc.",0.982853


In [124]:
getContinuousChunks(df.loc[df['_id'] == bson.ObjectId('5f767e4e2dd138c701623fa6'), 'Body'][33])

['BABA Gang',
 'BABA',
 'Truist',
 'RBC',
 'Merrill',
 'BABA BABA',
 'BABA PRC',
 'BABA Ant Group Cainiao Taobao',
 'IPO',
 'BABA Ant Group',
 'IPOs',
 'US',
 'BABA PRC PRC']

In [100]:
df['_id'][0]

ObjectId('5f766db0c41ae878a718020e')

0                           10x Genomics, Inc.
1                                    111, Inc.
2       1347 Property Insurance Holdings, Inc.
3       1347 Property Insurance Holdings, Inc.
4                     180 Degree Capital Corp.
                         ...                  
7296                               Zuora, Inc.
7297                            Zymeworks Inc.
7298             Zynerba Pharmaceuticals, Inc.
7299                               Zynex, Inc.
7300                                Zynga Inc.
Name: Name, Length: 7301, dtype: object

In [128]:
len(companydf['Name'].unique())

6020