In [219]:
#import the libraries

import pandas as pd
import gensim #the library for Topic Modelling
from gensim.models.ldamulticore import LdaMulticore
from gensim import corpora,models
import pyLDAvis.gensim #LDA visualization Library

from nltk.corpus import stopwords
import string
from nltk.stem.wordnet import WordNetLemmatizer

import warnings
warnings.simplefilter('ignore')

In [220]:
#Extracting named entity from an article

from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url):
    res = requests.get(url)
    html = res.text
    soup = BeautifulSoup(html, 'html5lib')
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

In [221]:
# features NER, POS tagging, dependency parsing, word vectors

import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

In [222]:
#Extracting article from web
ny_bb = url_to_string('https://www.blacklinegrp.com/blog/issues-facing-the-manufacturing-industry-2020')
article = nlp(ny_bb)
#len(article.ents)
article.ents

(10,
 2020,
 height="1,
 width="1,
 style="display:none,
 alt=,
 763,
 The BLG Way,
 10,
 2020,
 26,
 2020,
 2020,
 six months,
 Australian,
 10,
 2020,
 Manufacturing Practice Leader,
 John Madsen,
 John,
 over 40 years,
 the Black Line Group,
 first,
 1,
 March,
 the National Association of Manufacturers,
 NAM,
 78%,
 53%,
 operations 35.5%,
 John,
 about two,
 earlier this year,
 First,
 three,
 2,
 2020,
 The Reshoring Institute,
 the coming years,
 ”How,
 last fall,
 97%,
 the Reshoring Institute,
 U.S.,
 John,
 7,
 3,
 U.S.,
 Today,
 Broaden Your Search,
 Today,
 Retention,
 John,
 Saturday,
 6,
 4,
 Every year,
 every year,
 John,
 seven,
 2020,
 OSHA Insurance Right-to-,
 John,
 first,
 six,
 Navigating Changing Manufacturing Industry Laws,
 One,
 9,
 5,
 two,
 2020:1,
 CapacityHaving,
 John,
 7 Things to Consider When Your Manufacturing Facility,
 the R&D Tax Credit,
 ManufacturingMany,
 John identifies,
 four,
 second,
 the R&D Tax Credit,
 9,
 6,
 First,
 2020,
 some year-en

Cleaning of data

In [223]:
# Cleaning of data
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

def clean(text):
    #word = ' '.join([word for word in text.lower().split() if not word.isdigit()]) 
    stop_free = ' '.join([word for word in text.lower().split() if word not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = ' '.join([lemma.lemmatize(word) for word in punc_free.split()])
    return normalized.split()

In [224]:
# Get labels counter
articles = clean(str(word))
labels = [x.label_ for x in article.ents]
Counter(labels)

Counter({'CARDINAL': 36,
         'DATE': 22,
         'WORK_OF_ART': 6,
         'NORP': 1,
         'ORG': 14,
         'PERSON': 19,
         'ORDINAL': 5,
         'PERCENT': 4,
         'GPE': 5,
         'MONEY': 2,
         'EVENT': 2,
         'LAW': 4,
         'PRODUCT': 1,
         'FAC': 1})

In [225]:
items = [x.text for x in article.ents]
Counter(items).most_common()

[('John', 14),
 ('2020', 9),
 ('U.S.', 5),
 ('10', 4),
 ('the R&D Tax Credit', 4),
 ('9', 3),
 ('the Black Line Group', 2),
 ('first', 2),
 ('First', 2),
 ('three', 2),
 ('2', 2),
 ('3', 2),
 ('Today', 2),
 ('6', 2),
 ('height="1', 1),
 ('width="1', 1),
 ('style="display:none', 1),
 ('alt=', 1),
 ('763', 1),
 ('The BLG Way', 1),
 ('26', 1),
 ('six months', 1),
 ('Australian', 1),
 ('Manufacturing Practice Leader', 1),
 ('John Madsen', 1),
 ('over 40 years', 1),
 ('1', 1),
 ('March', 1),
 ('the National Association of Manufacturers', 1),
 ('NAM', 1),
 ('78%', 1),
 ('53%', 1),
 ('operations 35.5%', 1),
 ('about two', 1),
 ('earlier this year', 1),
 ('The Reshoring Institute', 1),
 ('the coming years', 1),
 ('”How', 1),
 ('last fall', 1),
 ('97%', 1),
 ('the Reshoring Institute', 1),
 ('7', 1),
 ('Broaden Your Search', 1),
 ('Retention', 1),
 ('Saturday', 1),
 ('4', 1),
 ('Every year', 1),
 ('every year', 1),
 ('seven', 1),
 ('OSHA Insurance Right-to-', 1),
 ('six', 1),
 ('Navigating Chan

In [226]:
# get the problems for the manufacture industries

sentences = [x for x in article.sents]

print(sentences[16])
print("\n")
print(sentences[17])
print("\n")
print(sentences[23])
print("\n")
print(sentences[25])
print("\n")
print(sentences[26])
print("\n")
print(sentences[27])
print("\n")
print(sentences[36])
print("\n")
print(sentences[37])


Murder hornets and locust swarms.


And the biggest of all, a global pandemic, rightfully continues to make headlines.


His first-hand knowledge of manufacturing makes him uniquely qualified to help manufacturers, whether he’s solving industry challenges or finding qualified R&D Tax Credits in their business.


Responding to COVID-19 According to a March Survey of the National Association of Manufacturers (NAM): 78% of manufacturers expect that the pandemic will have a financial impact on their business 53% of manufacturers anticipate a change in operations 35.5% of manufacturers are facing supply chain disruptions


No surprise, the effects of COVID-19 have impacted manufacturing, top to bottom.


It’s hard, even with a business plan, to adequately address the unpredictable and rapid variables of the outbreak: quarantining, restricting travel options, closing schools, disrupting supply chains, etc.


As early as last fall (well before the coronavirus), 97% of executives said they’d c

In [227]:
# Visualize spaCy's guess at the syntactic structure of a sentence
displacy.render(nlp(str(sentences)), jupyter=True, style='ent')