In [29]:
import pymongo
import pandas as pd
import re
import time
import datetime
from datetime import datetime, timedelta
import nltk

In [30]:
# Open a connection to the MongoDB server on localhost:27017 and grab handles
# to the two collections this notebook reads from.
client = pymongo.MongoClient(host='localhost', port=27017)
db = client['db']
threads = db['threads']
companies = db['companylist']

In [31]:
# Load every document of the `threads` collection whose 'Label' field is not
# equal to 0 into a DataFrame.
# NOTE(review): what Label == 0 means is not visible in this notebook —
# presumably "unlabelled / not relevant"; confirm against the labelling code.
df = pd.DataFrame.from_records(threads.find({'Label' :{'$ne': 0}}))

In [36]:
# Pick the 'Body' text of the row labelled 5 as a sample document to experiment on.
my_sent = df.loc[5, 'Body']

In [37]:
def get_continuous_chunks(text, label='ORGANIZATION'):
    """Return the distinct named entities of type ``label`` found in ``text``.

    The text is tokenized, POS-tagged and chunked with NLTK; every chunk
    subtree whose label equals ``label`` is turned into a single string by
    joining its leaf tokens with spaces.

    Parameters
    ----------
    text : str
        Raw text to scan.
    label : str, optional
        Chunk label to keep. Defaults to ``'ORGANIZATION'``.

    Returns
    -------
    list of str
        Entity strings in order of first appearance, without duplicates.
    """
    chunked = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text)))
    entities = []
    seen = set()
    for node in chunked:
        # Plain (token, pos) tuples and subtrees with another label are skipped.
        if not (isinstance(node, nltk.tree.Tree) and node.label() == label):
            continue
        named_entity = " ".join(token for token, pos in node.leaves())
        # Each subtree is handled on its own. The previous version only cleared
        # its buffer when the entity was new, so a repeated entity leaked into
        # the next one and produced strings such as "AMD NVIDIA".
        if named_entity not in seen:
            seen.add(named_entity)
            entities.append(named_entity)
    return entities

In [38]:
# Run the ORGANIZATION extractor on the sample post.
# NOTE: `matches` is rebound to the sparse similarity matrix in the later
# timing cell, so this list is only available for inspection until then.
matches = get_continuous_chunks(my_sent)

In [39]:
# Show the raw sample text so the extracted entities can be checked by eye.
my_sent

'Hopefully my name isn t too biased here. But I will reiterate what people talk about any time AMD is mentioned in the past two months. 1 AMD s CPU leaked yesterday. And to say the least it looks promising. Article below as I will not go into depth on this. In summary Intel has had a better CPU for years but this looks like its changing. If priced similarly to previous CPU s AMD will no doubt be the leader in this price range. https //www.techradar.com/news/amd ryzen 7 5800x leak shows a powerhouse gaming cpu that could embarrass intels core i9 10900k 2 AMD/NVIDIA battle. For a long time NVIDIA has been the leader in the GPU battle. With the horrid launch of the 30xx series and recent leaks about AMD s GPU price to performance its looking more and more like AMD will take market share from NVIDIA. Many retailers report not even getting 3000 series cards Huge driver issues making games unplayable Scalpers/bots taking all the available cards and reselling 3 AMD is selling both XBOX and PS

In [40]:
# Load every document of the `companylist` collection (no filter) into a DataFrame.
companydf = pd.DataFrame.from_records(companies.find())

In [41]:
# Inspect the 'Name' column; these strings are what gets vectorised below.
companydf['Name']

0                           10x Genomics, Inc.
1       1347 Property Insurance Holdings, Inc.
2       1347 Property Insurance Holdings, Inc.
3                     180 Degree Capital Corp.
4                      1-800-FLOWERS.COM, Inc.
                         ...                  
5716                               Zumiez Inc.
5717                            Zymeworks Inc.
5718             Zynerba Pharmaceuticals, Inc.
5719                               Zynex, Inc.
5720                                Zynga Inc.
Name: Name, Length: 5721, dtype: object

In [52]:
def ngrams(string, n=3):
    """Split ``string`` into overlapping character n-grams.

    Before splitting, the characters ``,``, ``-``, ``.`` and ``/`` are removed,
    as is any occurrence of ``BD`` preceded by a whitespace character.

    Parameters
    ----------
    string : str
        Text to split.
    n : int, optional
        Length of each n-gram (default 3).

    Returns
    -------
    list of str
        Every window of ``n`` consecutive characters of the cleaned text, left
        to right; empty when the cleaned text is shorter than ``n``.
    """
    cleaned = re.sub(r'[,-./]|\sBD', r'', string)
    # Zipping the text against itself shifted by 0..n-1 characters yields one
    # tuple per window; zip stops at the shortest (most shifted) copy.
    shifted_copies = (cleaned[offset:] for offset in range(n))
    return list(map(''.join, zip(*shifted_copies)))
    

In [57]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Series of company names; its row order defines the row order of the matrix below.
company_names = companydf['Name']
# Use the `ngrams` function above as the analyzer, so each name is represented
# by its character n-grams rather than by whole words.
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
# Sparse TF-IDF matrix: one row per entry of `company_names`.
tf_idf_matrix = vectorizer.fit_transform(company_names)



In [58]:
import numpy as np
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct

def awesome_cossim_top(A, B, ntop, lower_bound=0):
    """Multiply two sparse matrices through the ``sparse_dot_topn`` kernel.

    Both operands are converted to CSR, output buffers with room for ``ntop``
    entries per row of ``A`` are allocated, and the compiled routine
    ``ct.sparse_dot_topn`` is asked to fill them.
    NOTE(review): the kernel is opaque from here — judging by its arguments it
    presumably keeps, for each row of ``A @ B``, at most ``ntop`` values above
    ``lower_bound``; confirm against the sparse_dot_topn documentation.

    Parameters
    ----------
    A, B : scipy sparse matrices
        Left and right operands (anything exposing ``.tocsr()``).
    ntop : int
        Number of result slots reserved per row of ``A``.
    lower_bound : number, optional
        Threshold forwarded to the kernel (default 0).

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape ``(A.shape[0], B.shape[1])`` built from the buffers
        the kernel filled.
    """
    # The kernel reads raw CSR buffers; tocsr() is free for CSR inputs.
    left, right = A.tocsr(), B.tocsr()
    n_rows = left.shape[0]
    n_cols = right.shape[1]

    index_type = np.int32
    capacity = n_rows * ntop

    out_indptr = np.zeros(n_rows + 1, dtype=index_type)
    out_indices = np.zeros(capacity, dtype=index_type)
    out_data = np.zeros(capacity, dtype=left.dtype)

    def as_index(buffer):
        # Index buffers are handed over as int32 arrays.
        return np.asarray(buffer, dtype=index_type)

    ct.sparse_dot_topn(
        n_rows, n_cols,
        as_index(left.indptr), as_index(left.indices), left.data,
        as_index(right.indptr), as_index(right.indices), right.data,
        ntop,
        lower_bound,
        out_indptr, out_indices, out_data)

    return csr_matrix((out_data, out_indices, out_indptr), shape=(n_rows, n_cols))

In [59]:
import time

# Compare the TF-IDF matrix with its own transpose and time the call:
# 10 result slots per company name, 0.8 as the lower bound.
t1 = time.time()
matches = awesome_cossim_top(
    tf_idf_matrix,
    tf_idf_matrix.transpose(),
    ntop=10,
    lower_bound=0.8,
)
t = time.time() - t1
print("SELFTIMED:", t)

SELFTIMED: 0.23461389541625977


In [60]:
def get_matches_df(sparse_matrix, name_vector, top=100):
    """Turn a sparse similarity matrix into a table of matching name pairs.

    Parameters
    ----------
    sparse_matrix : scipy sparse matrix
        Entry ``(i, j)`` is the similarity between item ``i`` and item ``j``.
    name_vector : sequence of str (list, array or pandas Series)
        Names, in the same order as the rows/columns of ``sparse_matrix``.
        Looked up by position.
    top : int or None, optional
        Maximum number of pairs to return, taken in stored order. A falsy
        value (``None`` or ``0``) returns every non-zero pair. Default 100.

    Returns
    -------
    pandas.DataFrame
        Columns ``left_side``, ``right_side`` and ``similairity`` (the
        spelling is kept because callers select the column by this name).

    Raises
    ------
    ValueError
        If ``top`` is negative.
    """
    if top and top < 0:
        raise ValueError("top must be non-negative")

    # Take rows, columns and values from one COO view so they stay aligned.
    # Explicitly stored zeros are dropped, mirroring ``nonzero()``; indexing
    # ``.data`` separately would shift the scores whenever such zeros exist.
    coo = sparse_matrix.tocoo()
    keep = coo.data != 0
    rows, cols, scores = coo.row[keep], coo.col[keep], coo.data[keep]

    # Never ask for more pairs than exist (previously an IndexError).
    nr_matches = min(top, scores.size) if top else scores.size

    names = np.asarray(name_vector, dtype=object)
    return pd.DataFrame({'left_side': names[rows[:nr_matches]],
                         'right_side': names[cols[:nr_matches]],
                         'similairity': scores[:nr_matches].astype(float)})
        

In [63]:
# Build the pair table from the first 1000 stored matches and drop pairs whose
# score is (numerically) 1, i.e. a name matched with itself or an identical name.
matches_df = (
    get_matches_df(matches, company_names, top=1000)
    .loc[lambda pairs: pairs['similairity'] < 0.99999]  # Remove all exact matches
)
# Seeded so the preview is the same on every run; capped so a short table
# does not make `sample` raise.
matches_df.sample(min(20, len(matches_df)), random_state=0)

Unnamed: 0,left_side,right_side,similairity
266,AllianzGI Convertible & Income Fund,AllianzGI Convertible & Income Fund II,0.968951
286,AllianzGI Equity & Convertible Income Fund,AllianzGI Convertible & Income Fund,0.800869
32,Aberdeen Global Dynamic Dividend Fund,Aberdeen Total Dynamic Dividend Fund,0.849906
381,"Ameri Holdings, Inc.",Everi Holdings Inc.,0.81048
276,AllianzGI Convertible & Income Fund II,AllianzGI Convertible & Income Fund,0.968951
727,ARYA Sciences Acquisition Corp II,ARYA Sciences Acquisition Corp III,0.969687
221,"Akero Therapeutics, Inc.","Spero Therapeutics, Inc.",0.804962
378,"Ameri Holdings, Inc.",Everi Holdings Inc.,0.81048
38,Aberdeen Total Dynamic Dividend Fund,Aberdeen Global Dynamic Dividend Fund,0.849906
281,AllianzGI Convertible & Income Fund II,AllianzGI Convertible & Income Fund,0.968951
