In [None]:
# On Google Colab: run this cell first, restart the runtime (kernel), and only then run the import cell below
!python -m spacy download en_core_web_lg

In [30]:
import joblib
import pandas as pd
import spacy
from tqdm import tqdm
# Load the en_core_web_lg English pipeline once; the cells below call `nlp`
# both on the sample paragraph and on every document in the corpus.
nlp = spacy.load("en_core_web_lg")

In [91]:
# SAMPLE: run the spaCy pipeline (`nlp`, loaded in the import cell) on a single
# paragraph and list every entity it finds together with its label.

# The paragraph is assembled from adjacent string literals inside parentheses.
# A backslash continuation inside one literal would embed the leading
# indentation of each following source line in the text handed to the model.
sample_text = (
    'Windows 7 users should be aware by now that while a fair number of flaws addressed this month '
    'by Microsoft affect Windows 7 systems, this operating system is no longer being supported with '
    'security updates (unless you’re an enterprise taking advantage of Microsoft’s paid extended '
    'security updates program, which is available to Windows 7 Professional and Windows 7 enterprise '
    'users). Perhaps most troubling of these (CVE-2020-1301) is a remote code execution bug in SMB '
    'capabilities built into Windows 7 and Windows Server 2008 systems — both operating systems that '
    'Microsoft stopped supporting with security updates in January 2020. Microsoft Office and Excel get '
    'several updates this month. Microsoft today released software patches to plug at least 129 security '
    'holes in its Windows operating systems and supported software, by some accounts a record number of '
    'fixes in one go for the software giant'
)

# Process the paragraph using the spaCy model.
doc = nlp(sample_text)

# Size the first column to the longest entity text so that the header and all
# rows line up (a fixed width is overflowed by entities such as
# "Windows 7 Professional").
name_width = max([len('Entity')] + [len(ent.text) for ent in doc.ents])

# Header row, followed by a blank line.
print('{:<{width}}  {}\n'.format('Entity', 'Type', width=name_width))

# One row per entity: its text `ent.text` and its label `ent.label_`.
for ent in doc.ents:
    print('{:<{width}}  {}'.format(ent.text, ent.label_, width=name_width))

Entity        Type

7                     CARDINAL
this month            DATE
Microsoft             ORG
7                     CARDINAL
Microsoft             ORG
Windows 7 Professional  ORG
CVE-2020-1301         CARDINAL
SMB                   PRODUCT
Windows 7             PRODUCT
2008                  DATE
Microsoft             ORG
January 2020          DATE
Microsoft Office      ORG
Excel                 PRODUCT
this month            DATE
Microsoft             ORG
today                 DATE
at least 129          CARDINAL
Windows               PRODUCT


 
***
### Start here for a fresh run: entity extraction over text stored in a DataFrame column

In [40]:
# Read the saved frame from disk and take its `text` column, re-indexed from 0
# so that the index labels match each document's position in the series.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load files that come from a trusted source.
final = joblib.load('final_df3.pkl')
text = final['text'].reset_index(drop=True)
text

0       Friday Squid Blogging: Fishing for Jumbo Squid...
1       Interesting research: "Identifying Unintended ...
2       Analyzing IoT Security Best Practices\n\nNew r...
3       COVID-19 Risks of Flying\n\nI fly a lot. Over ...
4       Really interesting research: "An examination o...
                              ...                        
5120    — Zoom Video Communications Inc. has been lamb...
5121    — Miro is a company in the right place at the ...
5122    The Royal Bolton Hospital is among a growing n...
5123    Within a week, Blair’s tweet got the attention...
5124    The news: A group of experts has produced a pl...
Name: text, Length: 5125, dtype: object

In [80]:
# Run NER over every document and collect one record per entity mention:
#   entity_name - the entity text (ent.text)
#   entity_type - the spaCy label (ent.label_)
#   doc_index   - position of the source document in `text`
#
# Records are accumulated in a plain list and converted to a DataFrame once at
# the end. Appending to the DataFrame row by row copies the whole frame on
# every call, which makes the loop quadratic in the number of entities, and
# DataFrame.append no longer exists in pandas 2.0 and later.
entity_records = []

# nlp.pipe streams the texts through the pipeline in batches and yields the
# Doc objects in input order, so the enumerate index is the document position.
# tqdm cannot infer a length from a generator, hence the explicit total.
for n, doc in enumerate(tqdm(nlp.pipe(text), total=len(text))):
    entity_records.extend(
        {'entity_name': ent.text, 'entity_type': ent.label_, 'doc_index': n}
        for ent in doc.ents
    )

# Passing `columns` keeps the three columns (in this order) even when no
# entity was found at all.
df = pd.DataFrame(entity_records, columns=['entity_name', 'entity_type', 'doc_index'])

100%|████████████████████████████████████████████████████████████████████████████| 5125/5125 [3:00:59<00:00,  2.12s/it]


In [84]:
# df currently holds one row per entity mention, so the same
# (entity_name, entity_type) pair can occur several times for one document.
# Drop the repeats so that each pair is counted at most once per document.
df = df.drop_duplicates()

# For every (entity_name, entity_type) pair, count the documents it appears
# in, then order the pairs from most to least frequent and renumber the rows.
entities = (
    df.groupby(['entity_name', 'entity_type'])
    .size()
    .reset_index(name='COUNT')
    .sort_values(by='COUNT', ascending=False)
    .reset_index(drop=True)
)

In [86]:
# Inspect the entity table after de-duplication:
# one row per (entity_name, entity_type, doc_index).
df

Unnamed: 0,entity_name,entity_type,doc_index
0,Friday,DATE,0
1,Jumbo Squid,PRODUCT,0
2,Posted,PRODUCT,0
3,"June 26, 2020",DATE,0
4,3:57 PM •,TIME,0
...,...,...,...
226454,one,CARDINAL,5124
226456,759786,CARDINAL,5124
226457,covid-19,DATE,5124
226458,this morning,TIME,5124


In [87]:
# Document counts per (entity_name, entity_type), most frequent first.
entities

Unnamed: 0,entity_name,entity_type,COUNT
0,one,CARDINAL,1633
1,first,ORDINAL,1595
2,two,CARDINAL,1353
3,today,DATE,944
4,US,GPE,700
...,...,...,...
54412,ISS Supply,ORG,1
54413,ISSA,ORG,1
54414,ITAM Forum,ORG,1
54415,ITER,ORG,1


In [88]:
# Keep only the rows whose label is PERSON; row order (by COUNT) is unchanged.
# NOTE(review): some frequent hits in the saved output (e.g. 'Full Bio',
# 'Learn') look like page boilerplate rather than people - worth filtering.
entities[entities['entity_type'] == "PERSON"]

Unnamed: 0,entity_name,entity_type,COUNT
32,Pierluigi Paganini,PERSON,272
36,Full Bio,PERSON,265
58,Threatpost,PERSON,201
63,Trump,PERSON,178
68,Learn,PERSON,168
...,...,...,...
54391,Ian Burkhart,PERSON,1
54392,Ian C. Bates,PERSON,1
54393,Ian Callum,PERSON,1
54394,Ian Goetz,PERSON,1


In [79]:
# Print the names of all PERSON entities as one flat list, in the row order
# of `entities`.
# NOTE(review): the saved output carries execution count In [79], which is
# earlier than the cell that builds `entities` (In [84]); its ordering does
# not match the PERSON table from In [88], so re-run before relying on it.
print(entities.query('entity_type == "PERSON"')['entity_name'].tolist())

['Assange', 'Threatpost', 'Full Bio', 'Mirai', 'Tropic Trooper', 'Lucifer', 'Wynonna', 'Drake', 'Schuchman', 'Earth Empusa', 'Williams', 'Zoox', 'Learn', 'Pierluigi Paganini', 'Magecart', 'Julian Assange', 'Jessica Davis', 'Trump', 'Tekya', 'Gabriel', 'Johnson', 'Steve Whittle', 'Valimail', 'Satori', 'Aaron Sterritt', 'Docker', 'Burkov', 'Titus', 'Logan Shwydiuk', 'Maor', 'Tesla', 'Tudor', 'Zuckerberg', 'Android', 'Kenneth Currin Schuchman', 'Govshteyn', 'Holt', 'Golang', 'Hussey', 'George Floyd', 'Ken the Bin', 'Cybercriminals', 'Jim Salter', 'bin', 'Marcu', 'Netsential', 'Chelsea Manning', 'Bieber', 'Leming', 'Lee', 'Mulan', 'Naro', 'MyBookie', 'Alexander', 'Earp', 'Okiru', 'Vinnik', 'Trevor Mahlmann', 'Brown', 'Brown-Philpot', 'Nicole', 'Abramoff', 'shopify', 'Markey', 'Jenkins', 'LulzSec', 'Laupp', 'Tags', 'Messenger', 'Miller', 'Hsu', 'Black Hat', 'Zoom', 'Dingle', 'DDoSecrets', 'Donald Trump', 'Wynonna Earp', 'Crozier', 'Wu', 'Yaniv', 'advancedKeyAlias', 'Constantin Sorinel Marcu

In [82]:
# Number of rows in df.
# NOTE(review): the saved output is from In [82], i.e. before the
# drop_duplicates() cell (In [84]) ran, so it presumably counts rows prior to
# de-duplication. Re-run top to bottom to refresh it.
len(df)

226460

In [83]:
# Display df.
# NOTE(review): the saved output is from In [83], before drop_duplicates()
# (In [84]); it still shows row 226455, which is absent from the In [86]
# display of the same frame.
df

Unnamed: 0,entity_name,entity_type,doc_index
0,Friday,DATE,0
1,Jumbo Squid,PRODUCT,0
2,Posted,PRODUCT,0
3,"June 26, 2020",DATE,0
4,3:57 PM •,TIME,0
...,...,...,...
226455,US,GPE,5124
226456,759786,CARDINAL,5124
226457,covid-19,DATE,5124
226458,this morning,TIME,5124
