In [59]:
import nltk
import pandas as pd
from collections import Counter

###### Reading the data

In [14]:
# Load the pickled news DataFrame from the working directory.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted source.
df = pd.read_pickle("news_cat.pkl")

In [15]:
# Preview the first rows to see which columns were loaded.
df.head()

Unnamed: 0,crawled,language,text,title
0,2018-01-30T23:03:51.004+02:00,english,by Abhishek K Global Telehandler Market 2023 D...,Global Telehandler Market 2023 Demand by Segme...
1,2018-01-30T23:06:46.024+02:00,english,favorite this post 2014 Caterpillar 314E LCR h...,2014 Caterpillar 314E LCR
2,2018-01-30T23:18:35.023+02:00,english,By: MAX NISEN The Amazon health care threat ha...,"Amazon, Berkshire, JPMorgan health announcemen..."
3,2018-01-30T23:20:54.012+02:00,english,QR Code Link to This Post MONTHLY PUBLIC AUCTI...,2005 Caterpillar CB534D Tandem Vibratory Rolle...
4,2018-01-30T23:28:30.000+02:00,english,QR Code Link to This Post 2007 CATERPILLAR D4G...,2007 CATERPILLAR D4G LGP CAB SCREEN/SWEEPS - O...


In [16]:
# List the distinct values found in the `language` column.
df["language"].unique()

array(['english'], dtype=object)

All news articles are in English

###### Filtering out rows whose text does not contain Caterpillar

In [17]:
# (rows, columns) of the frame before any filtering is applied.
df.shape

(100, 4)

In [18]:
# Keep only the rows whose article body mentions "Caterpillar".
# regex=False matches the name as a literal substring; na=False counts a
# missing body as "no match" so the boolean mask never contains NaN (which
# would make the row selection raise).
df = df[df['text'].str.contains("Caterpillar", regex=False, na=False)]
df.shape

(88, 4)

###### Filtering out rows whose title does not contain Caterpillar

In [20]:
# Keep only the rows whose title mentions "Caterpillar".
# regex=False matches the name as a literal substring; na=False counts a
# missing title as "no match" so the boolean mask never contains NaN (which
# would make the row selection raise).
df = df[df.title.str.contains("Caterpillar", regex=False, na=False)]
df.shape

(41, 4)

###### Performing Named Entity Recognition on the text (only chunks labelled ORGANIZATION are kept)

In [53]:
# Collect the ORGANIZATION chunks that NLTK finds in every article body.
# `entities` is (re)created here and holds one string per detected chunk.
entities = []
for article in df.text.tolist():
    entities_text = []
    for sentence in nltk.sent_tokenize(article):
        tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
        tree = nltk.ne_chunk(tagged_tokens, binary=False)
        # Only labelled subtrees are entity chunks; plain (word, tag) tuples
        # have no `label` attribute and are skipped.
        entities_text.extend(
            ' '.join(leaf[0] for leaf in node)  # space-join multi-token entities
            for node in tree
            if hasattr(node, 'label') and node.label() == 'ORGANIZATION'
        )
    entities.extend(entities_text)

###### Performing Named Entity Recognition on the title (only chunks labelled ORGANIZATION are kept)

In [55]:
# Run the same ORGANIZATION extraction over each headline. The hits are
# appended to the existing `entities` list, so re-running this cell without
# rebuilding `entities` first adds the title entities a second time.
for headline in df.title.tolist():
    entities_title = []
    for sentence in nltk.sent_tokenize(headline):
        tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
        tree = nltk.ne_chunk(tagged_tokens, binary=False)
        # Only labelled subtrees are entity chunks; plain (word, tag) tuples
        # have no `label` attribute and are skipped.
        entities_title.extend(
            ' '.join(leaf[0] for leaf in node)  # space-join multi-token entities
            for node in tree
            if hasattr(node, 'label') and node.label() == 'ORGANIZATION'
        )
    entities.extend(entities_title)

###### Printing the most common organization names (including Caterpillar)

In [60]:
# Tally how often each extracted entity string occurs (text and titles
# combined) and list them from most to least frequent.
Counter(entities).most_common()

[('Caterpillar Inc.', 96),
 ('Caterpillar', 80),
 ('NYSE', 63),
 ('CAT', 46),
 ('Cat', 36),
 ('Company', 27),
 ('SEC', 23),
 ('Resource Industries', 20),
 ('Transportation', 20),
 ('Securities', 19),
 ('Exchange Commission', 19),
 ('Construction Industries', 19),
 ('Energy', 19),
 ('Financial Products', 19),
 ('Investment', 16),
 ('Lincolnian Online', 12),
 ('LLC', 12),
 ('News', 11),
 ('Ratings', 11),
 ('Motley Fool', 10),
 ('Capital Group', 8),
 ('CFO Bradley', 8),
 ('VIOLATION', 8),
 ('EPS', 8),
 ('Bank', 8),
 ('FMR', 8),
 ('WFG', 7),
 ('Vetr', 7),
 ('Credit Suisse Group', 6),
 ('Capital', 6),
 ('PAAR', 6),
 ('OSF', 6),
 ('Ledger Gazette', 6),
 ('COPYRIGHT', 5),
 ('Security National Trust', 5),
 ('NOT', 5),
 ('FY17', 5),
 ('Commercial', 4),
 ('National Bank', 4),
 ('Thomson Reuters', 4),
 ('UBS Group', 4),
 ('Reviewer', 4),
 ('Partners', 4),
 ('Jennison Associates', 4),
 ('Thalmann Financial Services Inc.', 4),
 ('YCharts', 4),
 ('FREE', 3),
 ('BMO', 3),
 ('Capital Markets', 3),
 ('