In [28]:
import pandas as pd
from tqdm import tqdm
from os import getcwd, path as base_path
from typing import List

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import numpy as np

In [29]:
from sys import path
PROJECT_PATH = getcwd()[:getcwd().find("notebooks")][:-1]
path.append(f"{PROJECT_PATH}\\src")

In [30]:
PROJECT_NAME = "wsj_tweet_scrapping"
PROJECT_PATH = getcwd()[: getcwd().find(PROJECT_NAME) + len(PROJECT_NAME)]
DATA_PATH = f"{PROJECT_PATH}\\data"
EXCEL_PATH = f"{DATA_PATH}\\excel"

In [31]:
from_year = 2011
to_year = 2020
file_name = f"{EXCEL_PATH}\\wsj_logistics_news_companies_identified_{from_year}-{to_year}.xlsx"
news_df = pd.read_excel(file_name)
news_df.dropna(inplace=True)

In [32]:
news_df.head()

Unnamed: 0,date,tweet,url,news,news_cleaned,companies_identified
0,2020-09-24,Today’s newsletter - Cleaning Vehicle Emissi...,https://on.wsj.com/2EuJDdb,\n\tNews and analysis on the world of logisti...,California is taking an aggressive step to ove...,"['Alejandro Lazo', 'Amazon', 'Bpost’s North Am..."
1,2020-09-16,FedEx's Christmas in July; Probing Nikola Cl...,https://on.wsj.com/2Fsewzz,\n\tNews and analysis on the world of logisti...,FedEx is turning the surge in e-commerce deman...,"['A.P. Moller-Maersk', 'Asda', 'D.G. Yuengling..."
2,2020-09-14,Shipping’s E-Commerce Drive; Short-Selling N...,https://on.wsj.com/3iv2qDW,\n\tNews and analysis on the world of logisti...,Container shipping lines are increasingly tryi...,"['Amazon', 'Arm Holdings', 'CMA CGM', 'Europea..."
3,2020-09-10,Brexit’s New Alarms; Railroad Offloads Bidde...,https://on.wsj.com/3inRU1m,\n\tNews and analysis on the world of logisti...,Fears over the fallout from a “hard” Brexit ar...,"['Air Transport Services Group', 'Albany Times..."
4,2020-09-23,Airlines Turn to Freight; Driving Robot Truc...,https://on.wsj.com/32TvO1a,\n\tNews and analysis on the world of logisti...,Just four of the world’s 30 largest passenger ...,"['Asiana Airlines', 'Beverage Daily', 'Brookfi..."


# Identifying Disruption Keywords in News

In [33]:
disruption_keywords = ["Theft",
                        "Close",
                        "Closing stores",
                        "Closing operations",
                        "Bankrupt",
                        "Challenge",
                        "Supplier",
                        "Warehouse",
                        "warehousing",
                        "Delivery",
                        "Recall",
                        "Manufacturing error",
                        "Restructure",
                        ["Budget", "cuts"],
                        "Strike",
                        "Pandemic",
                        "Loss",
                        "Failure",
                        "Demand surge",
                        "Trade war",
                        "Natural disaster",
                        "Embargo",
                        "Hacking",
                        "Cyber attack",
                        "Quality management",
                        "Distribution",
                        "planning",
                        "Shutdown",
                        "catastrophe",
                        "complication",
                        "delay",
                        "breakdown",
                        "bullwhip",
                        "inventory shortage",
                        "halt",
                        "unable",
                        "inability",
                        "disruption",
                        "blockade",
                        "miscommunication",
                        "lack of coordination",
                        "loss of information",
                        "unintended",
                        "transport issue",
                        "network issue",
                        "loss of workers",
                        "workforce",
                        "oversupply",
                        "discontinue",
                        "infrastructure",
                        "consequence",
                        "Shortage",
                       "Theft", 
                       "Close", 
                       "closing stores",
                       "closing operations", 
                       "Bankrupt",
                       "bankruptcy",
                       ["Challenges", "suppliers"],
                       "Warehouse", "warehousing",
                       "Delivery",
                       "Recall",
                       "Manufacturing error",
                       "Restructure",
                       "Budget cuts",
                       "Strike",
                       "Pandemic",
                       "Loss",
                       "Failure",
                       "Demand surge",
                       "Trade war",
                       "Natural disasters",
                       "Embargo",
                       "Hacking",
                       "cyber attacks",
                       "Quality management",
                       "Distribution", 
                       "planning"]
l = []
for _ in disruption_keywords:
    if _ not in l:
        l.append(_)
disruption_keywords = l
del l

In [34]:
lemmatizer = WordNetLemmatizer() # wordnet lemmatizer object

In [35]:
# lemmatizing disruption keywords
lemmatized_disruption_keywords = [] 
for word in disruption_keywords:
    join = False
    if not isinstance(word, list):
        join = True
        word = word.split()
    word = [lemmatizer.lemmatize(word_i, pos="v") for word_i in word]
    if join:
        word = " ".join(word)
    lemmatized_disruption_keywords.append(word)        

In [36]:
for i in range(len(disruption_keywords)):
    print(f"{disruption_keywords[i]} -> {lemmatized_disruption_keywords[i]}")

Theft -> Theft
Close -> Close
Closing stores -> Closing store
Closing operations -> Closing operations
Bankrupt -> Bankrupt
Challenge -> Challenge
Supplier -> Supplier
Warehouse -> Warehouse
warehousing -> warehouse
Delivery -> Delivery
Recall -> Recall
Manufacturing error -> Manufacturing error
Restructure -> Restructure
['Budget', 'cuts'] -> ['Budget', 'cut']
Strike -> Strike
Pandemic -> Pandemic
Loss -> Loss
Failure -> Failure
Demand surge -> Demand surge
Trade war -> Trade war
Natural disaster -> Natural disaster
Embargo -> Embargo
Hacking -> Hacking
Cyber attack -> Cyber attack
Quality management -> Quality management
Distribution -> Distribution
planning -> plan
Shutdown -> Shutdown
catastrophe -> catastrophe
complication -> complication
delay -> delay
breakdown -> breakdown
bullwhip -> bullwhip
inventory shortage -> inventory shortage
halt -> halt
unable -> unable
inability -> inability
disruption -> disruption
blockade -> blockade
miscommunication -> miscommunication
lack of co

In [43]:
def identify_disruption_keywords_in_news(news : str,
                                         lemmatizer : WordNetLemmatizer, 
                                         lemmatized_disruption_keywords : List):
    
    """
    Identify disruption words in the news.
    
    Returns
    -------
    indexes of disruptive keywords.
    """
    news = news.lower()
    news = news.split()
    lemmatized_news = [lemmatizer.lemmatize(word, pos="v") for word in news]
    
    def find_idx(l, word):
        try:
            idx = l.index(word)
        except:
            idx = -1
        return idx
    
    idxs = {}
    for keyword in lemmatized_disruption_keywords:
        if not isinstance(keyword, list):
            keyword = keyword.lower()
            flag = find_idx(lemmatized_news, keyword)
            if flag != -1:
                idxs[keyword] = flag
        else:  # if keyword is of type list (check presence of multiple words)
            flags = []
            for word in keyword:
                word = word.lower()
                flags.append(find_idx(lemmatized_news, word))
            if -1 not in flags:
                idxs[str(keyword)] = flags
                
    return idxs      

In [45]:
# identifying disruption keyword after lemmatizing cleaned news
disruption_keywords_identified = []
for i in tqdm(range(len(news_df))):
    disruption_keywords_identified.append(
        identify_disruption_keywords_in_news(news_df["news_cleaned"].iloc[i], 
                                             lemmatizer=lemmatizer, 
                                             lemmatized_disruption_keywords = lemmatized_disruption_keywords))

100%|█████████████████████████████████████████████████████████████████████████████| 2473/2473 [00:15<00:00, 155.45it/s]


In [47]:
disruption_keywords_identified[1300]

{'close': 801,
 'challenge': 425,
 'delivery': 779,
 'loss': 333,
 'distribution': 902,
 'plan': 920,
 'delay': 654,
 'infrastructure': 136,
 'bankruptcy': 805}

In [48]:
news_df["disruption_keywords_identified"] = [[word for word in disruption_keywords_identified[i].keys()] for i in range(len(disruption_keywords_identified))]
news_df["does_disruption_keywords_exist"] = [1 if len(list_words) > 0 else 0 for list_words in news_df["disruption_keywords_identified"]]

In [49]:
news_df.head()

Unnamed: 0,date,tweet,url,news,news_cleaned,companies_identified,disruption_keywords_identified,does_disruption_keywords_exist
0,2020-09-24,Today’s newsletter - Cleaning Vehicle Emissi...,https://on.wsj.com/2EuJDdb,\n\tNews and analysis on the world of logisti...,California is taking an aggressive step to ove...,"['Alejandro Lazo', 'Amazon', 'Bpost’s North Am...","[challenge, supplier, delivery, pandemic, plan...",1
1,2020-09-16,FedEx's Christmas in July; Probing Nikola Cl...,https://on.wsj.com/2Fsewzz,\n\tNews and analysis on the world of logisti...,FedEx is turning the surge in e-commerce deman...,"['A.P. Moller-Maersk', 'Asda', 'D.G. Yuengling...","[close, supplier, delivery, restructure, plan,...",1
2,2020-09-14,Shipping’s E-Commerce Drive; Short-Selling N...,https://on.wsj.com/3iv2qDW,\n\tNews and analysis on the world of logisti...,Container shipping lines are increasingly tryi...,"['Amazon', 'Arm Holdings', 'CMA CGM', 'Europea...","[challenge, supplier, delivery, restructure, s...",1
3,2020-09-10,Brexit’s New Alarms; Railroad Offloads Bidde...,https://on.wsj.com/3inRU1m,\n\tNews and analysis on the world of logisti...,Fears over the fallout from a “hard” Brexit ar...,"['Air Transport Services Group', 'Albany Times...","[close, supplier, pandemic, distribution, plan...",1
4,2020-09-23,Airlines Turn to Freight; Driving Robot Truc...,https://on.wsj.com/32TvO1a,\n\tNews and analysis on the world of logisti...,Just four of the world’s 30 largest passenger ...,"['Asiana Airlines', 'Beverage Daily', 'Brookfi...","[challenge, restructure, distribution, plan, w...",1


In [50]:
file_name = f"{EXCEL_PATH}\\wsj_logistics_news_companies_and_disruptive_keywords_identified_{from_year}-{to_year}.xlsx"
news_df.to_excel(f"{file_name}", index = False)

In [51]:
news_df.iloc[7]["news_cleaned"]

'A surprisingly robust peak season is taking shape at U.S. seaports. Container imports are flowing back into the U.S. in bigger volumes, the WSJ Logistics Report’s Costas Paris writes, as U.S. retailers rush to restock while consumer spending remains strong and shipping lines push capacity back into commercial trade lanes. The growth is accelerating at both the Atlantic and Pacific gateways but appears stronger recently on the West Coast, where retailers have faster access to domestic distribution channels primed for e-commerce. The surge comes after several retailers reported that stock shortages limited their sales growth as coronavirus-driven lockdowns wound down and shoppers opened their wallets. The demand is driving freight rates to new multi-year highs, boosting an unexpectedly strong financial year for shipping lines. Research group Sea Intelligence says sector-wide operating earnings reached $2.7 billion in the second quarter, and growing third-quarter volumes should add to th