## NLP Class Assignment 5

In [1]:
import pandas as pd
import numpy as np
import nltk
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 500)

In [2]:
import multiprocessing 
num_processors = multiprocessing.cpu_count()

In [3]:
nlp = spacy.load('en_core_web_sm')
stopwords = set(nltk.corpus.stopwords.words('english'))

#### Read news data

In [4]:
news_path = 'https://storage.googleapis.com/msca-bdp-data-open/news/nlp_a_5_news.json'
news_df = pd.read_json(news_path, orient='records', lines=True)

print(f'Sample contains {news_df.shape[0]:,.0f} news articles')
news_df.head(2)

Sample contains 10,012 news articles


Unnamed: 0,url,date,language,title,text
0,http://kokomoperspective.com/obituaries/jon-w-horton/article_b6ba8e1e-cb9c-11eb-9868-fb11b88b9778.html,2021-06-13,en,Jon W. Horton | Obituaries | kokomoperspective.com,Jon W. Horton | Obituaries | kokomoperspective.comYou have permission to edit this article. EditCloseSign Up Log In Dashboard LogoutMy Account Dashboard Profile Saved items LogoutCOVID-19Click here for the latest local news on COVID-19HomeAbout UsContact UsNewsLocalOpinionPoliticsNationalStateAgricultureLifestylesEngagements/Anniversaries/WeddingsAutosEntertainmentHealthHomesOutdoorsSportsNFLNCAAVitalsObituariesAutomotivee-EditionCouponsGalleries74°...
1,https://auto.economictimes.indiatimes.com/news/auto-components/birla-precision-to-ramp-up-capacity-to-tap-emerging-opportunities-in-india/81254902,2021-02-28,en,"Birla Precision to ramp up capacity to tap emerging opportunities in India, Auto News, ET Auto","Birla Precision to ramp up capacity to tap emerging opportunities in India, Auto News, ET Auto We have updated our terms and conditions and privacy policy Click ""Continue"" to accept and continue with ET AutoAccept the updated privacy & cookie policyDear user, ET Auto privacy and cookie policy has been updated to align with the new data regulations in European Union. Please review and accept these changes below to continue using the website.You can see our privacy policy & our cookie ..."


#### Read Tweets data

In [5]:
tweets_path = 'https://storage.googleapis.com/msca-bdp-data-open/tweets/nlp_a_5_tweets.json'
tweets_df = pd.read_json(tweets_path, orient='records', lines=True)
print(f'Sample contains {tweets_df.shape[0]:,.0f} tweets')
tweets_df.head(2)

Sample contains 10,105 tweets


Unnamed: 0,id,lang,date,name,retweeted,text
0,1534565117614084096,en,2022-06-08,Low Orbit Tourist 🌍📷,,"Body &amp; Assembly - Halewood - United Kingdom\n🌍53.3504,-2.8352296,402m\n\nHalewood Body &amp; Assembly is a Jaguar Land Rover factory in Halewood, England, and forms the major part of the Halewood complex which is shared with Ford who manufacture transmissions at the site. [Wikipedia] https://t.co/LPmCnZIaVt"
1,1534565743429394439,en,2022-06-08,CompleteCar.ie,RT,"Land Rover Ireland has announced that the new Range Rover Sport starts at €114,150, now on @completecar:\n\nhttps://t.co/TjGUkL3FYr https://t.co/QdVaEiJkjO"


#### Data Preprocessing 

In [6]:
news_df.isna().sum()

url         0
date        0
language    0
title       0
text        0
dtype: int64

In [7]:
tweets_df.isna().sum()

id           0
lang         0
date         0
name         0
retweeted    0
text         0
dtype: int64

In [8]:
# Making sure that there are only english language articles and tweets
print(news_df['language'].nunique(), ',',  tweets_df['lang'].nunique())

1 , 1


In [9]:
# Dropping redundant columns
news_df = news_df[['title', 'text']]
tweets_df = tweets_df[['text']]

In [10]:
# Clean text 
# It is necessary to preserve the text to identify of relevant entities
# Hence, I did not remove any stop words
def clean(df, col):
    df[col] = df[col].str.strip()
    pattern = r'\||\n|(@\w+.*?)|(http\w\S+.*?)|(#\w+)'
    df[col] = df[col].apply(lambda x: re.sub(pattern, ' ', x)) 

In [11]:
clean(news_df,'title')

In [12]:
clean(news_df,'text')

In [13]:
clean(tweets_df,'text')

In [14]:
# Top company names using NER-NTLK, no segmentation
def ner_nltk(df, col):
    org = []
    for text in df[col]:
        ne_chunks = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text)))
        for chunk in ne_chunks:
            if hasattr(chunk, 'label') and chunk.label() == 'ORGANIZATION':
                org.extend([c for c in chunk])

    org_counts = {}
    for o in org:
        if o[0] in org_counts:
            org_counts[o[0]] += 1
        else:
            org_counts[o[0]] = 1

    sorted_orgs = sorted(org_counts.items(), key=lambda x: x[1], reverse=True)
    top_orgs = sorted_orgs[:20]

    return top_orgs

In [15]:
ner_nltk(news_df,'title')

[('News', 675),
 ('Star', 292),
 ('Auto', 236),
 ('Online', 214),
 ('Mail', 213),
 ('Daily', 192),
 ('AutoSpies', 144),
 ('CoventryLive', 137),
 ('Business', 122),
 ('Automotive', 106),
 ('BMW', 95),
 ('Express', 94),
 ('NewsBreak', 92),
 ('Car', 92),
 ('Shropshire', 90),
 ('GMC', 87),
 ('ET', 77),
 ('UK', 77),
 ('Volkswagen', 74),
 ('Land', 66)]

In [16]:
# Using a sample of text (10% of the whole corpus), since the running time is really long
# Since the running time is very long, I did not combine title and text
news_df_sample = news_df.sample(n = 1000, random_state = 420)
ner_nltk(news_df_sample,'text')

[('LA', 1357),
 ('NYC', 1286),
 ('News', 1078),
 ('Princess', 1012),
 ('MailOnline', 925),
 ('Prince', 912),
 ('Kate', 893),
 ('VERY', 732),
 ('Queen', 626),
 ('UK', 623),
 ('Duke', 613),
 ('Royal', 599),
 ('Awards', 598),
 ('Of', 531),
 ('House', 517),
 ('US', 494),
 ('COVID', 482),
 ('Land', 458),
 ('THE', 431),
 ('Duchess', 429)]

In [17]:
ner_nltk(tweets_df,'text')

[('Land', 3441),
 ('Rover', 2796),
 ('BMW', 394),
 ('Discovery', 393),
 ('Motors', 314),
 ('General', 300),
 ('Jaguar', 294),
 ('LAND', 261),
 ('eBay', 223),
 ('UK', 204),
 ('Duke', 192),
 ('Duchess', 171),
 ('SHAMELESS', 157),
 ('Defender', 146),
 ('SUV', 136),
 ('Range', 119),
 ('Services', 114),
 ('Health', 106),
 ('Invictus', 94),
 ('ROVER', 89)]

In [18]:
# Top company names using NER-NTLK, with segmentation
def sent_tokenizer(df, col):
    new_col_name = col + '_sent_tokens'
    df[new_col_name] = df[col].apply(nltk.sent_tokenize)

def ner_nltk_sent(df, col):
    ORG=[]
    for row in df[col]:
        for sent in row:
            for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
                if hasattr(chunk, 'label') and chunk.label() == 'ORGANIZATION':
                    ORG.extend([c for c in chunk])
                    
    org_counts = {}
    for org in ORG:
        if org[0] in org_counts:
            org_counts[org[0]] += 1
        else:
            org_counts[org[0]] = 1
    
    sorted_orgs = sorted(org_counts.items(), key=lambda x: x[1], reverse=True)
    top_orgs = sorted_orgs[:20]
    
    return top_orgs


In [19]:
sent_tokenizer(news_df,'title')
ner_nltk_sent(news_df,'title_sent_tokens')

[('News', 675),
 ('Star', 289),
 ('Online', 238),
 ('Mail', 237),
 ('Auto', 236),
 ('Daily', 192),
 ('AutoSpies', 144),
 ('CoventryLive', 137),
 ('Business', 122),
 ('Automotive', 106),
 ('BMW', 96),
 ('Express', 94),
 ('NewsBreak', 92),
 ('Car', 92),
 ('Shropshire', 90),
 ('GMC', 87),
 ('ET', 77),
 ('UK', 77),
 ('Volkswagen', 74),
 ('Land', 66)]

In [20]:
sent_tokenizer(news_df,'text')
sent_tokenizer(news_df_sample,'text')
ner_nltk_sent(news_df_sample ,'text_sent_tokens')

[('LA', 1356),
 ('NYC', 1286),
 ('News', 1084),
 ('Princess', 1019),
 ('MailOnline', 925),
 ('Prince', 913),
 ('Kate', 868),
 ('VERY', 730),
 ('Queen', 627),
 ('UK', 623),
 ('Duke', 620),
 ('Royal', 600),
 ('Awards', 598),
 ('Of', 531),
 ('House', 523),
 ('US', 491),
 ('COVID', 483),
 ('Land', 465),
 ('Duchess', 416),
 ('Philip', 415)]

In [21]:
sent_tokenizer(tweets_df,'text')
ner_nltk_sent(tweets_df,'text_sent_tokens')

[('Land', 3448),
 ('Rover', 2846),
 ('Discovery', 394),
 ('BMW', 390),
 ('Motors', 314),
 ('Jaguar', 302),
 ('General', 300),
 ('LAND', 262),
 ('eBay', 223),
 ('UK', 204),
 ('Duke', 192),
 ('Duchess', 171),
 ('SHAMELESS', 157),
 ('Defender', 146),
 ('SUV', 136),
 ('Range', 119),
 ('Services', 114),
 ('Health', 106),
 ('Invictus', 94),
 ('ROVER', 90)]

In [22]:
# Top company names using NER spaCy, no segmentation
def ner_spacy(df, col):
    entities = []
    labels = []
    for doc in nlp.pipe(df[col], batch_size=100):
        entities.extend([ent.text for ent in doc.ents if ent.label_ == 'ORG'])
        labels.extend([ent.label_ for ent in doc.ents if ent.label_ == 'ORG'])
    ent_df = pd.DataFrame({'Entities':entities, 'Labels':labels})
    ent_gpd = ent_df.groupby('Entities').count().sort_values(by='Labels', ascending=False).head(20)
    
    return ent_gpd

In [23]:
ner_spacy(news_df,'title')

Unnamed: 0_level_0,Labels
Entities,Unnamed: 1_level_1
Ford,265
Hyundai,207
Star News,191
Chevrolet,165
Toyota,162
Honda,147
Shropshire Star,108
Automotive News,108
BMW,108
Express & Star,103


In [24]:
ner_spacy(tweets_df,'text')

Unnamed: 0_level_0,Labels
Entities,Unnamed: 1_level_1
Land Rover,1041
Jaguar Land Rover,940
eBay,477
BMW,383
General Motors,292
"Mercedes-Benz, Citroen",285
Jaguar,175
Ford,116
Audi,99
Volvo,93


In [25]:
ner_spacy(tweets_df,'text')

Unnamed: 0_level_0,Labels
Entities,Unnamed: 1_level_1
Land Rover,1041
Jaguar Land Rover,940
eBay,477
BMW,383
General Motors,292
"Mercedes-Benz, Citroen",285
Jaguar,175
Ford,116
Audi,99
Volvo,93


In [26]:
# Top company names using NER spaCy, with segmentation
def ner_spacy_sent(df, col):
    entities = []
    labels = []
    for row in df[col]:
        doc = nlp(' '.join(row))  # join the tokens of the row into a single string
        for ent in doc.ents:
            if ent.label_ == 'ORG':
                entities.append(ent.text)
                labels.append(ent.label_)
    ent_df = pd.DataFrame({'Entities': entities, 'Labels': labels})
    ent_gpd = ent_df.groupby('Entities').count().sort_values(by='Labels', ascending=False).head(20)
    return ent_gpd

In [29]:
temp = ner_spacy_sent(news_df, 'title_sent_tokens') 

# create an empty DataFrame
results_org_df = pd.DataFrame()  
results_org_df = results_org_df.append(temp)
temp

Unnamed: 0_level_0,Labels
Entities,Unnamed: 1_level_1
Ford,265
Hyundai,207
Star News,191
Chevrolet,165
Toyota,162
Honda,147
Automotive News,108
BMW,108
Shropshire Star,106
Express & Star,103


In [31]:
temp = ner_spacy_sent(news_df_sample,'text_sent_tokens')

results_org_df = results_org_df.append(temp)
temp

Unnamed: 0_level_0,Labels
Entities,Unnamed: 1_level_1
MailOnline,864
Ford,643
COVID-19,642
Toyota,527
Instagram,405
Hyundai,400
Honda,391
Trump,376
BMW,369
Amazon,366


In [32]:
temp = ner_spacy_sent(tweets_df,'text_sent_tokens')

results_org_df = results_org_df.append(temp)
temp 

Unnamed: 0_level_0,Labels
Entities,Unnamed: 1_level_1
Land Rover,1049
Jaguar Land Rover,951
eBay,476
BMW,383
General Motors,292
"Mercedes-Benz, Citroen",285
Jaguar,175
Ford,116
Audi,100
Volvo,94


In [33]:
# Top location names using NER-NTLK
def ner_nltk_loc(df, col):
    loc_counts = {}
    for text in df[col]:
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text))):
            if isinstance(chunk, nltk.tree.Tree) and chunk.label() == 'GPE':
                loc = ' '.join([token[0] for token in chunk])
                loc_counts[loc] = loc_counts.get(loc, 0) + 1
    
    top_20_locs = sorted(loc_counts.items(), key=lambda x: x[1], reverse=True)[:20]
    
    return top_20_locs

In [34]:
ner_nltk_loc(news_df,'title')

[('Sale', 1951),
 ('British', 231),
 ('Prince', 144),
 ('New', 139),
 ('Winnipeg', 137),
 ('India', 126),
 ('Toronto', 118),
 ('London', 113),
 ('Roadshow', 110),
 ('Cambridge', 92),
 ('North York', 72),
 ('Driven', 68),
 ('U.S.', 67),
 ('News', 63),
 ('Calgary', 63),
 ('China', 58),
 ('Daily', 57),
 ('Taiwan', 57),
 ('Mississauga', 55),
 ('Land', 52)]

In [35]:
ner_nltk_loc(news_df_sample,'text')

[('London', 1160),
 ('Los Angeles', 941),
 ('British', 777),
 ('New York City', 774),
 ('New York', 570),
 ('India', 520),
 ('West', 450),
 ('Australia', 419),
 ('Miami', 416),
 ('California', 407),
 ('American', 380),
 ('China', 365),
 ('Malibu', 364),
 ('U.S.', 359),
 ('Facebook', 347),
 ('Britain', 343),
 ('Mexico', 339),
 ('Australian', 328),
 ('Paris', 328),
 ('Sydney', 323)]

In [36]:
ner_nltk_loc(tweets_df,'text')

[('Land', 1470),
 ('Russia', 180),
 ('British', 155),
 ('Jaguar', 124),
 ('Sussex', 119),
 ('India', 92),
 ('Zimbabwe', 86),
 ('New', 84),
 ('Ad', 83),
 ('Russian', 77),
 ('Audi', 70),
 ('Cambridge', 68),
 ('Car', 68),
 ('Britain', 64),
 ('Meghan', 64),
 ('Paracetamol', 64),
 ('LAND', 63),
 ('UPDATE', 54),
 ('Indian', 53),
 ('Netherlands', 52)]

In [37]:
# Top location names using spaCy
def ner_spacy_loc(df, col):
    entities = [ent.text for i in df[col] for ent in nlp(i).ents if ent.label_ == 'GPE']
    entity_counts = Counter(entities)
    top_20_locs = entity_counts.most_common(20)
    return top_20_locs

In [38]:
temp = ner_spacy_loc(news_df,'title') 

# create an empty DataFrame
results_loc_df = pd.DataFrame()  
results_loc_df = results_loc_df.append(temp)
temp

[('Carpages.ca', 1962),
 ('Ontario', 1265),
 ('British Columbia', 198),
 ('UK', 195),
 ('Manitoba', 181),
 ('Winnipeg', 137),
 ('India', 121),
 ('Toronto', 118),
 ('Alberta', 116),
 ('London', 112),
 ('Cambridge', 90),
 ('US', 82),
 ('Saskatchewan', 78),
 ('North York', 72),
 ('Calgary', 64),
 ('U.S.', 64),
 ('China', 59),
 ('Taiwan', 54),
 ('Kitchener', 44),
 ('Australia', 44)]

In [39]:
temp = ner_spacy_loc(news_df_sample,'text')

results_loc_df = results_loc_df.append(temp)
temp

[('LA', 1872),
 ('London', 1258),
 ('UK', 1221),
 ('US', 1039),
 ('Los Angeles', 997),
 ('New York City', 739),
 ('Hollywood', 595),
 ('India', 550),
 ('Miami', 476),
 ('Meghan', 475),
 ('New York', 467),
 ('Australia', 463),
 ('California', 437),
 ('West Hollywood', 432),
 ('Beverly Hills', 423),
 ('Sydney', 419),
 ('China', 383),
 ('Britain', 370),
 ('Mexico', 366),
 ('Malibu', 354)]

In [None]:
temp = ner_spacy_loc(tweets_df,'text')

results_loc_df = results_loc_df.append(temp)
temp

## Conclusions

Note: The NER-NLTK and SpaCy models take a significant amount of time to run on the entire corpus of text data. Therefore, to optimize the processing time, a sample of 10% of the corpus was used instead of the entire corpus. Additionally, to further save time, the analysis was performed separately on the titles and text data, instead of combining them into one dataset. This allowed for more efficient processing and analysis.

In [52]:
df1, df2, df3 = np.array_split(results_org_df, 3)

# display the three dataframes side by side
from IPython.display import display_html

html_str = ''
for df in [df1, df2, df3]:
    html_str += df.to_html()
display_html(html_str.replace('table', 'table style='display:inline''), raw=True)

Unnamed: 0_level_0,Labels
Entities,Unnamed: 1_level_1
Ford,265
Hyundai,207
Star News,191
Chevrolet,165
Toyota,162
Honda,147
Automotive News,108
BMW,108
Shropshire Star,106
Express & Star,103

Unnamed: 0_level_0,Labels
Entities,Unnamed: 1_level_1
MailOnline,864
Ford,643
COVID-19,642
Toyota,527
Instagram,405
Hyundai,400
Honda,391
Trump,376
BMW,369
Amazon,366

Unnamed: 0_level_0,Labels
Entities,Unnamed: 1_level_1
Land Rover,1049
Jaguar Land Rover,951
eBay,476
BMW,383
General Motors,292
"Mercedes-Benz, Citroen",285
Jaguar,175
Ford,116
Audi,100
Volvo,94


Note: The above tables are displayed in order: Entities in News Titles, News Text, Tweets

Based on the tables above, we can conclude that the NER-NLTK model doesn't perform well in identifying organizations as it misidentifies a lot of other words such as News, LA, NYC, etc. However, for tweets, the model performs slightly better as it correctly identifies companies such as Land Rover, BMW, and Jaguar. Additionally, the SpaCy model performs much better than NER-NLTK in correctly identifying company names with few exceptions. The model's performance is also better when sentence segmentation is used compared to only word tokenization since the former helps in identifying the context in most cases.

Hence, **spaCy with Segmentation is the best performing model.**

In terms of company mentions, for news article titles, the most frequently mentioned company is Ford, followed by Hyundai, Chevrolet, Toyota, and Honda. In the case of news article text, the most frequently mentioned company is Ford, followed by Toyota, Hyundai, Honda, and BMW. For tweets, the most frequently mentioned company is Land Rover, followed by Jaguar, BMW, General Motors, Mercedes-Benz, and Ford.

In [54]:
df1, df2, df3 = np.array_split(results_loc_df, 3)

# display the three dataframes side by side
from IPython.display import display_html

html_str = ''
for df in [df1, df2, df3]:
    html_str += df.to_html()
display_html(html_str.replace('table', 'table style='display:inline''), raw=True)

Unnamed: 0,0,1
0,Carpages.ca,1962
1,Ontario,1265
2,British Columbia,198
3,UK,195
4,Manitoba,181
5,Winnipeg,137
6,India,121
7,Toronto,118
8,Alberta,116
9,London,112

Unnamed: 0,0,1
0,LA,1872
1,London,1258
2,UK,1221
3,US,1039
4,Los Angeles,997
5,New York City,739
6,Hollywood,595
7,India,550
8,Miami,476
9,Meghan,475

Unnamed: 0,0,1
0,UK,342
1,Russia,184
2,India,95
3,Meghan,78
4,Kibaki,76
5,Britain,69
6,Jamaica,59
7,Netherlands,45
8,💸,42
9,Zimbabwe,41


Note: The above tables are displayed in order: Entities in News Titles, News Text, Tweets

Based on the results, it can be observed that the NER-SpaCy model outperforms the NER-NLTK model in accurately identifying locations. The NER-NLTK model produces a lot of false positive results, such as identifying 'Land' as a location, when in fact it refers to the company Land Rover. In contrast, the SpaCy model shows very few exceptions in identifying locations correctly.

Hence, **spaCy with Segmentation is the best performing model.** 

In terms of the most frequently mentioned locations, the titles of news articles frequently mention Ontario as the top location, followed by British Columbia, Uk, and Manitoba. For the text in news articles, LA is the most frequently mentioned location, followed by London, UK, and US. Lastly, for tweets, UK is the most frequently mentioned location, followed by Russia, India, and Britain.