In [129]:
%load_ext autoreload
%autoreload 2

In [192]:
import pandas as pd
from glob import glob
from os import getcwd, path
from tqdm import tqdm
import re

import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
from spacy import displacy

In [131]:
# Make the project's `src` directory importable from this notebook.
# `sys` is imported as a module on purpose: `from sys import path` would
# rebind the name `path` and hide `os.path` (imported in the imports cell),
# which the file-discovery cell relies on for `path.basename(...)`.
import sys
# Everything before "notebooks" in the working directory, minus the trailing
# path separator, is taken as the project root.
PROJECT_PATH = getcwd()[:getcwd().find("notebooks")][:-1]
sys.path.append(path.join(PROJECT_PATH, "src"))

In [141]:
from utils import (get_words_contractions, 
                   find_urls_in_text, 
                   get_stopwords)

In [12]:
# Locations of the scraped data, derived from the current working directory.
PROJECT_NAME = "wsj_tweet_scrapping"

# Keep the working directory up to and including the project folder name.
# NOTE(review): if PROJECT_NAME does not occur in the working directory,
# find() returns -1 and the slice below is not a meaningful path.
_cwd = getcwd()
_project_end = _cwd.find(PROJECT_NAME) + len(PROJECT_NAME)
PROJECT_PATH = _cwd[:_project_end]

DATA_PATH = f"{PROJECT_PATH}\\data"
EXCEL_PATH = f"{DATA_PATH}\\excel"

# Substring that identifies the workbooks of interest by file name.
FILE_FORMAT = "twint_wsj_logistics_news_"

In [14]:
# Every .xlsx workbook in the Excel folder whose file name contains FILE_FORMAT.
news_files = [
    workbook
    for workbook in glob(f"{EXCEL_PATH}\\*.xlsx")
    if FILE_FORMAT in path.basename(workbook)
]

In [33]:
# Collect the rows of every workbook, keeping only the first six columns.
# Each element of `all_data` is one row as a NumPy array.
all_data = []
for news_file in tqdm(news_files):
    all_data.extend(pd.read_excel(news_file).values[:, :6])

100%|██████████████████████████████████████████████████████████████████████████████████| 21/21 [00:00<00:00, 75.29it/s]


In [85]:
# Column names come from the first workbook. The rows in `all_data` were cut
# to their first six columns, so the header is cut the same way; otherwise a
# workbook with extra columns would make the DataFrame constructor fail on a
# column-count mismatch.
cols = pd.read_excel(news_files[0]).columns[:6]

# Build the combined frame once, ordered by date with a fresh 0..n-1 index.
news_df = (
    pd.DataFrame(all_data, columns=cols)
    .sort_values("date")
    .reset_index(drop=True)
)
news_df.head()

Unnamed: 0,data-item-id,data-conversation-id,date,tweet,url,text
0,1080446429087305984,1080446429087305984,2019-01-02,The captain of car carrier Sincerity Ace rep...,https://on.wsj.com/2CJAA4d,https://www.wsj.com/articles/mitsui-ship-is-ab...
1,1081220232008617984,1081220232008617984,2019-01-04,Today’s newsletter: China Bites Apple; Spill...,https://on.wsj.com/2F8CPja,https://www.wsj.com/articles/todays-logistics-...
2,1082297091622429056,1082297091622429056,2019-01-07,Today's newsletter: Hiring for Logistics; Th...,https://on.wsj.com/2CWrp0v,"News and analysis on the world of logistics, f..."
3,1082590862540984064,1082590862540984064,2019-01-08,Shipping industry in U.K. faces logistics ni...,https://on.wsj.com/2FeT6TX,https://www.wsj.com/articles/u-k-finds-a-fault...
4,1082971472342195968,1082971472342195968,2019-01-09,J.B. Hunt is buying a New Jersey-based home-...,https://on.wsj.com/2CXK0cG,https://www.wsj.com/articles/j-b-hunt-builds-u...


In [209]:
def clean_news(news, 
               remove_stopwords : bool = False, 
               process_contractions : bool = False, 
               remove_special_chars : bool = False,
               remove_all_special_chars : bool = False,
               remove_single_alpha_char_word : bool = False):
    """Normalise the raw text of one news item.

    Always applied, in this order:

    1. Known newsletter boilerplate sentences are replaced by a space.
    2. Every URL reported by ``find_urls_in_text`` for the *original* text
       is replaced by a space.
    3. The text is lower-cased.

    The optional steps then run in the order of the list below, and finally
    all runs of whitespace are collapsed to single spaces.

    Parameters
    ----------
    news : str
        Raw text to clean.
    remove_stopwords : bool
        Drop every whitespace-delimited token found in
        ``get_stopwords(lang="english")``.
    process_contractions : bool
        Replace every whitespace-delimited token that is a key of
        ``get_words_contractions()`` by its mapped value. Tokens are matched
        exactly, so a token with punctuation attached only matches if that
        exact token is a key.
    remove_special_chars : bool
        Replace every character outside ``a-z``, ``0-9``, space and
        ``. , ' ’`` by a space.
    remove_all_special_chars : bool
        Only consulted when ``remove_special_chars`` is True; it then also
        drops ``. , ' ’``. On its own this flag has no effect.
    remove_single_alpha_char_word : bool
        Drop every one-character token unless it is a digit.

    Returns
    -------
    str
        The cleaned, lower-cased, whitespace-normalised text.
    """
    # URLs are located before any rewriting so they are found intact.
    urls_to_remove = find_urls_in_text(news)
    
    remove_texts = ["Sign up: With one click, get this newsletter "
                    "delivered to your inbox.",
                    "News and analysis on the world of logistics, "
                    "from supply chain to transport and technology. "
                    "Trouble viewing this email? View in web browser", 
                   "Write to him at paul.page@wsj.com. "
                    "Follow the WSJ Logistics Report team: @PaulPage, "
                    "@jensmithWSJ, @EEPhillips_WSJ, @CostasParis. "
                    "Follow the WSJ Logistics Report on Twitter at @WSJLogistics."
                   ]
    
    # Boilerplate is matched case-sensitively, hence before lower-casing.
    for text in remove_texts:
        news = news.replace(text, " ")
        
    # Longest URL first: if one URL is a prefix of another, removing the
    # short one first would leave the tail of the long one in the text.
    for url in sorted(urls_to_remove, key=len, reverse=True):
        news = news.replace(url, " ")

    news = news.lower()

    if process_contractions:
        contractions = get_words_contractions()
        news = " ".join(contractions[word] if word in contractions else word
                        for word in news.split())
        
    if remove_stopwords:
        stopwords = get_stopwords(lang="english")
        news = " ".join(word for word in news.split() if word not in stopwords)
         
    if remove_special_chars:
        if remove_all_special_chars:
            chars_to_keep = "abcdefghijklmnopqrstuvwxyz1234567890 "
        else:
            chars_to_keep = "abcdefghijklmnopqrstuvwxyz1234567890 .,''’"
        # Disallowed characters become spaces (not deleted) so the words on
        # either side of them stay separate.
        news = "".join(char_ if char_ in chars_to_keep else " " for char_ in news)
    
    if remove_single_alpha_char_word:
        news = " ".join(word for word in news.split()
                        if word.isdigit() or len(word) > 1)

    # Collapse the padding introduced by the replacements above.
    news = " ".join(news.split())     

    return news

In [210]:
# Clean every article body: each value is coerced to str first, then passed
# to clean_news with contraction expansion and stop-word removal switched on.
clean_news_text = news_df["text"].map(str).apply(
    clean_news,
    process_contractions=True,
    remove_stopwords=True,
)

In [211]:
news_df["clean_news_text"] = clean_news_text 

In [212]:
news_df.head()

Unnamed: 0,data-item-id,data-conversation-id,date,tweet,url,text,clean_news_text
0,1080446429087305984,1080446429087305984,2019-01-02,The captain of car carrier Sincerity Ace rep...,https://on.wsj.com/2CJAA4d,https://www.wsj.com/articles/mitsui-ship-is-ab...,​a japanese-owned car carrier bound hawaii abl...
1,1081220232008617984,1081220232008617984,2019-01-04,Today’s newsletter: China Bites Apple; Spill...,https://on.wsj.com/2F8CPja,https://www.wsj.com/articles/todays-logistics-...,china’s economic stumbles reverberating strong...
2,1082297091622429056,1082297091622429056,2019-01-07,Today's newsletter: Hiring for Logistics; Th...,https://on.wsj.com/2CWrp0v,"News and analysis on the world of logistics, f...",› strong hiring transportation logistics secto...
3,1082590862540984064,1082590862540984064,2019-01-08,Shipping industry in U.K. faces logistics ni...,https://on.wsj.com/2FeT6TX,https://www.wsj.com/articles/u-k-finds-a-fault...,british policy makers calling shipping industr...
4,1082971472342195968,1082971472342195968,2019-01-09,J.B. Hunt is buying a New Jersey-based home-...,https://on.wsj.com/2CXK0cG,https://www.wsj.com/articles/j-b-hunt-builds-u...,j.b. hunt transport services inc. snapping ano...


In [213]:
news_df.iloc[100]["text"][:600]

'News and analysis on the world of logistics, from supply chain to transport and technology. Trouble viewing this email? View in web browser › The dry-bulk shipping sector at the heart of the global industrial economy is navigating stormy seas. Carriers stepped up their scrapping of the biggest bulk transport vessels in the first quarter, the WSJ Logistics Report’s Costas Paris writes, as shipping rates turned sharply downward in a business marked by heavy pricing volatility. Ship-broker BTIG says ship recycling was up 35% in the first quarter from a year ago, and most of the scrapped capacity '

In [214]:
news_df.iloc[100]["clean_news_text"][:600]

'› dry-bulk shipping sector heart global industrial economy navigating stormy seas. carriers stepped scrapping biggest bulk transport vessels first quarter, wsj logistics report’s costas paris writes, shipping rates turned sharply downward business marked heavy pricing volatility. ship-broker btig says ship recycling 35% first quarter year ago, scrapped capacity came capesize vessels—the biggest ships move iron, aluminum coal australia latin america china. bulk sector marked unforgiving economics, rates year production important brazil mining market diminished fallout vale s.a.’s dam disaster t'

In [157]:
# news_df.to_excel(f"{EXCEL_PATH}\\wsj-news-2019-2020.xlsx", index=False)

In [186]:
nlp = en_core_web_sm.load()

In [215]:
# Entities spaCy finds in the raw text of row 100, as (text, label) pairs.
doc = nlp(news_df.iloc[100]["text"])
raw_entities = [(ent.text, ent.label_) for ent in doc.ents]
print(raw_entities)

[('the first quarter', 'DATE'), ('the WSJ Logistics Report', 'ORG'), ('Costas', 'NORP'), ('Paris', 'GPE'), ('35%', 'PERCENT'), ('the first quarter from a year ago', 'DATE'), ('Australia', 'GPE'), ('Latin America', 'LOC'), ('China', 'GPE'), ('this year', 'DATE'), ('Brazil', 'GPE'), ('Vale S.A.’s', 'ORG'), ('JD.com', 'PERSON'), ('Chinese', 'NORP'), ('WSJ', 'ORG'), ('Shan Li', 'PERSON'), ('China', 'GPE'), ('JD.com', 'PERSON'), ('2', 'CARDINAL'), ('Alibaba Group Holding Ltd.', 'ORG'), ('China', 'GPE'), ('recent weeks', 'DATE'), ('more than 550', 'CARDINAL'), ('China', 'GPE'), ('the Nippon Steel & Sumitomo Metal Corp.', 'ORG'), ('Kashima', 'GPE'), ('Japan', 'GPE'), ('iPhone', 'ORG'), ('Asia', 'LOC'), ('Taiwan', 'GPE'), ('China', 'GPE'), ('Japanese', 'NORP'), ('Apple Inc.', 'ORG'), ('WSJ', 'ORG'), ('Takashi Mochizuki', 'PERSON'), ('Japan Display Inc.', 'ORG'), ('Japan', 'GPE'), ('Taiwanese', 'NORP'), ('TPK Holding Co.', 'ORG'), ('Japan', 'GPE'), ('Japan', 'GPE'), ('China', 'GPE'), ('Beijing'

In [216]:
# Same entity listing for the cleaned text of row 100, for comparison with
# the raw-text result. `doc` is left bound to this parse for later cells.
doc = nlp(news_df.iloc[100]["clean_news_text"])
clean_entities = [(ent.text, ent.label_) for ent in doc.ents]
print(clean_entities)

[('first quarter', 'DATE'), ('wsj logistics report', 'ORG'), ('costas', 'NORP'), ('paris', 'GPE'), ('35%', 'PERCENT'), ('first quarter year ago', 'DATE'), ('latin america', 'LOC'), ('china', 'GPE'), ('brazil', 'GPE'), ('fallout vale s.a.’s', 'ORG'), ('jd.com', 'PERSON'), ('chinese', 'NORP'), ('wsj', 'ORG'), ('li', 'PERSON'), ('china', 'GPE'), ('2', 'CARDINAL'), ('alibaba group holding ltd.', 'ORG'), ('recent weeks', 'DATE'), ('550', 'CARDINAL'), ('china', 'GPE'), ('nippon steel & sumitomo metal corp.', 'ORG'), ('japan', 'GPE'), ('bloomberg news', 'ORG'), ('asia', 'LOC'), ('taiwan', 'GPE'), ('china', 'GPE'), ('japanese', 'NORP'), ('apple inc.', 'ORG'), ('wsj', 'ORG'), ('takashi mochizuki', 'PERSON'), ('lifeline japan display inc.', 'ORG'), ('japan', 'GPE'), ('taiwanese', 'NORP'), ('tpk holding co.', 'ORG'), ('japan', 'GPE'), ('japan', 'GPE'), ('china', 'GPE'), ('beijing', 'GPE'), ('u.s.', 'GPE'), ('russian', 'NORP'), ('united co.', 'ORG'), ('$200 million', 'MONEY'), ('kentucky', 'GPE'),

In [217]:
displacy.render(nlp(news_df.iloc[100]["clean_news_text"]), jupyter=True, style='ent')

In [218]:
displacy.render(nlp(news_df.iloc[100]["text"]), jupyter=True, style='ent')

In [226]:
doc.ents[0].label_, doc.ents[0].text

('DATE', 'first quarter')

In [234]:
def identify_label(text, label_type):
    """Return the text of every entity in `text` whose label is in `label_type`.

    `text` is parsed with the notebook-level `nlp` pipeline. Entities are
    returned in the order the pipeline yields them, duplicates included.
    `label_type` is tested with `in`, so it is expected to be a container of
    label names such as ``["ORG"]``.
    """
    return [entity.text for entity in nlp(text).ents if entity.label_ in label_type]

In [231]:
news_df.columns

Index(['data-item-id', 'data-conversation-id', 'date', 'tweet', 'url', 'text',
       'clean_news_text'],
      dtype='object')

In [241]:
identify_label(news_df.iloc[2]["clean_news_text"], label_type=["ORG"])

['›',
 'digital commerce transforming big',
 'wsj logistics report',
 'better news',
 'wsj',
 'pullback 40%',
 'mack trucks inc.',
 'wsj',
 'boeing co.',
 'airbus',
 'wsj',
 'plane',
 'boeing airbus',
 '› u.s.',
 'wsj',
 'wsj',
 'wsj',
 'petrobras',
 'wsj',
 'charlotte russe inc.',
 'wsj',
 'associated press',
 'dredge',
 'retailer target corp.',
 'minneapolis star-tribune',
 'european union',
 'nikkei asian review',
 'volvo',
 'reuters',
 'global container fleet',
 'lloyd’s list',
 'head union',
 'lars kastrup',
 'mack-cali realty corp.',
 'westfair business journal',
 'paul page']