In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [8]:
import pandas as pd
from glob import glob
from os import getcwd, path as base_path
from tqdm import tqdm
import re

import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
from spacy import displacy

In [9]:
from sys import path

# Project root = the part of the working directory that precedes the
# "notebooks" folder (minus the trailing separator character).
# str.find returns -1 when the marker is missing, which would silently chop
# characters off the end of the path, so keep the working directory as-is
# in that case.
PROJECT_PATH = getcwd()
if "notebooks" in PROJECT_PATH:
    PROJECT_PATH = PROJECT_PATH[:PROJECT_PATH.find("notebooks")][:-1]

# Make the project's "src" folder importable. os.path.join uses the
# platform's own separator instead of a hard-coded backslash.
path.append(base_path.join(PROJECT_PATH, "src"))

In [10]:
from utils import (get_words_contractions, 
                   find_urls_in_text, 
                   get_stopwords)

In [11]:
PROJECT_NAME = "wsj_tweet_scrapping"

# Project root = working directory truncated right after the project folder
# name. When the name is not part of the working directory, str.find would
# return -1 and the slice would yield an arbitrary prefix, so the working
# directory itself is used instead.
PROJECT_PATH = getcwd()
if PROJECT_NAME in PROJECT_PATH:
    PROJECT_PATH = PROJECT_PATH[: PROJECT_PATH.find(PROJECT_NAME) + len(PROJECT_NAME)]

# os.path.join picks the platform's separator instead of a hard-coded "\\".
DATA_PATH = base_path.join(PROJECT_PATH, "data")
EXCEL_PATH = base_path.join(DATA_PATH, "excel")

# Substring that a workbook's file name must contain to be loaded.
FILE_FORMAT = "twint_wsj_logistics_news_"

In [12]:
# Collect the .xlsx workbooks in EXCEL_PATH whose file name contains
# FILE_FORMAT; any other workbook in that folder is ignored.
# glob returns paths in arbitrary order, so the list is sorted to make the
# load order (and the choice of news_files[0]) reproducible between runs.
news_files = sorted(
    file_
    for file_ in glob(base_path.join(EXCEL_PATH, "*.xlsx"))
    if FILE_FORMAT in base_path.basename(file_)
)

In [14]:
# Accumulate the rows of every workbook into one flat list; tqdm shows one
# progress tick per file.
all_data = []
for workbook_path in tqdm(news_files):
    content = pd.read_excel(workbook_path)
    # Keep only the first six columns of each row.
    all_data.extend(content.values[:, :6])

100%|██████████████████████████████████████████████████████████████████████████████████| 94/94 [00:01<00:00, 50.06it/s]


In [15]:
# Column labels are taken from the first workbook. The rows collected in
# all_data hold only the first six columns of each sheet, so the labels are
# sliced the same way; otherwise a workbook with extra columns would raise a
# length-mismatch error when the frame is built.
cols = pd.read_excel(news_files[0]).columns[:6]

# One frame for all articles, ordered chronologically with a fresh 0..n-1 index.
news_df = (
    pd.DataFrame(all_data, columns=cols)
    .sort_values("date")
    .reset_index(drop=True)
)
news_df.head()

Unnamed: 0,data-item-id,data-conversation-id,date,tweet,url,text
0,138783141312741408,138783141312741408,2011-11-21,Chinese Media Firm Hit by Muddy Waters on.ws...,http://on.wsj.com/vMZ1s2,Focus Media Holding Ltd. lashed back Tuesday a...
1,138950373275738096,138950373275738096,2011-11-22,China Pushes Clean-Energy Agenda on.wsj.com/...,http://on.wsj.com/ty6fGV,BEIJING—China plans to push for more funding f...
2,139191305766055904,139191305766055904,2011-11-22,PCCW Prices Telecom Trust IPO at Low End on....,http://on.wsj.com/sP81tk,HONG KONG—Fixed-line telecom operator PCCW Ltd...
3,138978095397998592,138978095397998592,2011-11-22,Solar Companies Swing to Loss on.wsj.com/sH5...,http://on.wsj.com/sH5ICa,Chinese solar-products companies JA Solar Hold...
4,138931945504636896,138931945504636896,2011-11-22,China Doubles Size of Hong Kong Currency Swa...,http://on.wsj.com/uza2y6,SHANGHAI—China and Hong Kong said Tuesday they...


In [16]:
def clean_news(news: str,
               remove_stopwords: bool = False,
               process_contractions: bool = False,
               remove_special_chars: bool = False,
               remove_all_special_chars: bool = False,
               remove_single_alpha_char_word: bool = False) -> str:
    """Normalise the text of one news article.

    Always applied, in this order: newsletter boilerplate sentences are
    removed, every URL reported by ``find_urls_in_text`` is removed, the text
    is lower-cased and, at the very end, runs of whitespace are collapsed to
    single spaces. The optional steps below run between lower-casing and the
    final whitespace collapse, in the order listed.

    Args:
        news: Raw article text.
        remove_stopwords: Drop every whitespace-separated token found in
            ``get_stopwords(lang="english")``.
        process_contractions: Replace every whitespace-separated token that
            is a key of ``get_words_contractions()`` with its mapped value.
        remove_special_chars: Replace every character other than a-z, 0-9,
            space, ``.``, ``,``, ``'`` and ``’`` with a space.
        remove_all_special_chars: Only has an effect when
            ``remove_special_chars`` is also True; then ``.``, ``,``, ``'``
            and ``’`` are replaced with a space as well.
        remove_single_alpha_char_word: Drop every one-character token that is
            not a digit (this includes lone punctuation such as ``&``).

    Returns:
        The cleaned, lower-cased text with single spaces between tokens and
        no leading or trailing whitespace.
    """
    # URLs are located on the untouched text, before anything is stripped.
    urls_to_remove = find_urls_in_text(news)

    remove_texts = ["Sign up: With one click, get this newsletter "
                    "delivered to your inbox.",
                    "News and analysis on the world of logistics, "
                    "from supply chain to transport and technology. "
                    "Trouble viewing this email? View in web browser",
                    "Write to him at paul.page@wsj.com. "
                    "Follow the WSJ Logistics Report team: @PaulPage, "
                    "@jensmithWSJ, @EEPhillips_WSJ, @CostasParis. "
                    "Follow the WSJ Logistics Report on Twitter at @WSJLogistics."
                    ]

    # Boilerplate is matched case-sensitively, so it must go before lower().
    for text in remove_texts:
        news = news.replace(text, " ")

    # Longest URL first: if one URL is a prefix of another, replacing the
    # short one first would leave the tail of the long one in the text.
    for url in sorted(urls_to_remove, key=len, reverse=True):
        news = news.replace(url, " ")

    news = news.lower()

    if process_contractions:
        contractions = get_words_contractions()
        news = " ".join(contractions[word] if word in contractions else word
                        for word in news.split())

    if remove_stopwords:
        stopwords = get_stopwords(lang="english")
        news = " ".join(word for word in news.split()
                        if word not in stopwords)

    if remove_special_chars:
        if remove_all_special_chars:
            chars_to_keep = "abcdefghijklmnopqrstuvwxyz1234567890 "
        else:
            chars_to_keep = "abcdefghijklmnopqrstuvwxyz1234567890 .,''’"
        # Disallowed characters become spaces (not ""), so the words on
        # either side of them stay separate tokens.
        news = "".join(char_ if char_ in chars_to_keep else " "
                       for char_ in news)

    if remove_single_alpha_char_word:
        news = " ".join(word for word in news.split()
                        if word.isdigit() or len(word) > 1)

    # Collapse whitespace left behind by the replacements above.
    return " ".join(news.split())

In [17]:
# Clean every article with contractions expanded and stop words removed.
# A missing article (NaN/None in the "text" column) is mapped to an empty
# string: str(NaN) would otherwise inject the literal token "nan" into the
# cleaned corpus.
clean_news_text = news_df["text"].apply(
    lambda x: clean_news("" if pd.isna(x) else str(x),
                         process_contractions=True,
                         remove_stopwords=True)
)

In [18]:
# Store the cleaned text as a new column next to the raw "text" column.
news_df["clean_news_text"] = clean_news_text 

In [19]:
# Preview the first rows to check the newly added clean_news_text column.
news_df.head()

Unnamed: 0,data-item-id,data-conversation-id,date,tweet,url,text,clean_news_text
0,138783141312741408,138783141312741408,2011-11-21,Chinese Media Firm Hit by Muddy Waters on.ws...,http://on.wsj.com/vMZ1s2,Focus Media Holding Ltd. lashed back Tuesday a...,focus media holding ltd. lashed back tuesday m...
1,138950373275738096,138950373275738096,2011-11-22,China Pushes Clean-Energy Agenda on.wsj.com/...,http://on.wsj.com/ty6fGV,BEIJING—China plans to push for more funding f...,beijing—china plans push funding clean-energy ...
2,139191305766055904,139191305766055904,2011-11-22,PCCW Prices Telecom Trust IPO at Low End on....,http://on.wsj.com/sP81tk,HONG KONG—Fixed-line telecom operator PCCW Ltd...,hong kong—fixed-line telecom operator pccw ltd...
3,138978095397998592,138978095397998592,2011-11-22,Solar Companies Swing to Loss on.wsj.com/sH5...,http://on.wsj.com/sH5ICa,Chinese solar-products companies JA Solar Hold...,chinese solar-products companies ja solar hold...
4,138931945504636896,138931945504636896,2011-11-22,China Doubles Size of Hong Kong Currency Swa...,http://on.wsj.com/uza2y6,SHANGHAI—China and Hong Kong said Tuesday they...,shanghai—china hong kong said tuesday doubled ...


In [37]:
# Export the combined frame; the file name carries the first and last year
# covered by the articles. The dates are parsed once and reduced with the
# vectorised min/max instead of parsing the column twice.
news_dates = pd.to_datetime(news_df["date"])
news_df.to_excel(
    base_path.join(
        EXCEL_PATH,
        f"wsj_logistics_news_{news_dates.min().year}-{news_dates.max().year}.xlsx",
    ),
    index=False,
)

In [20]:
# First 600 characters of the raw text of the article at position 100.
news_df["text"].iloc[100][:600]

"TOKYO—Las Vegas Sands Corp. Chief Executive Sheldon Adelson offered his views on the fight between Steve Wynn and his former Japanese partner, suggesting that Mr. Adelson's Las Vegas casino rival may have gone too far in ousting co-founder Kazuo Okada from Wynn Resorts Ltd. Asked to comment on allegations at the heart of dispute, Mr. Adelson said offering complimentary hotel rooms is common practice in the gambling industry. Wynn Resorts has said Mr. Okada's alleged attempts to win favors through such offers was illegal and... WSJ Membership Customer Service Tools & Features Ads More Dow Jones"

In [21]:
# First 600 characters of the cleaned text of the article at position 100.
news_df["clean_news_text"].iloc[100][:600]

"tokyo—las vegas sands corp. chief executive sheldon adelson offered views fight steve wynn former japanese partner, suggesting mr. adelson's las vegas casino rival may gone far ousting co-founder kazuo okada wynn resorts ltd. asked comment allegations heart dispute, mr. adelson said offering complimentary hotel rooms common practice gambling industry. wynn resorts said mr. okada's alleged attempts win favors offers illegal and... wsj membership customer service tools & features ads dow jones products"

In [157]:
# news_df.to_excel(f"{EXCEL_PATH}\\wsj-news-2019-2020.xlsx", index=False)

In [None]:
##

In [186]:
# Load the en_core_web_sm spaCy pipeline; the cells below call `nlp` on article
# text and read the recognised entities from the result.
nlp = en_core_web_sm.load()

In [215]:
# Entities recognised in the raw text of the article at position 100,
# printed as (entity text, entity label) pairs.
doc = nlp(news_df["text"].iloc[100])
print([(ent.text, ent.label_) for ent in doc.ents])

[('the first quarter', 'DATE'), ('the WSJ Logistics Report', 'ORG'), ('Costas', 'NORP'), ('Paris', 'GPE'), ('35%', 'PERCENT'), ('the first quarter from a year ago', 'DATE'), ('Australia', 'GPE'), ('Latin America', 'LOC'), ('China', 'GPE'), ('this year', 'DATE'), ('Brazil', 'GPE'), ('Vale S.A.’s', 'ORG'), ('JD.com', 'PERSON'), ('Chinese', 'NORP'), ('WSJ', 'ORG'), ('Shan Li', 'PERSON'), ('China', 'GPE'), ('JD.com', 'PERSON'), ('2', 'CARDINAL'), ('Alibaba Group Holding Ltd.', 'ORG'), ('China', 'GPE'), ('recent weeks', 'DATE'), ('more than 550', 'CARDINAL'), ('China', 'GPE'), ('the Nippon Steel & Sumitomo Metal Corp.', 'ORG'), ('Kashima', 'GPE'), ('Japan', 'GPE'), ('iPhone', 'ORG'), ('Asia', 'LOC'), ('Taiwan', 'GPE'), ('China', 'GPE'), ('Japanese', 'NORP'), ('Apple Inc.', 'ORG'), ('WSJ', 'ORG'), ('Takashi Mochizuki', 'PERSON'), ('Japan Display Inc.', 'ORG'), ('Japan', 'GPE'), ('Taiwanese', 'NORP'), ('TPK Holding Co.', 'ORG'), ('Japan', 'GPE'), ('Japan', 'GPE'), ('China', 'GPE'), ('Beijing'

In [216]:
# Same listing for the cleaned text of that article, so the two outputs can be
# compared. Note that `doc` is rebound here.
doc = nlp(news_df["clean_news_text"].iloc[100])
print([(ent.text, ent.label_) for ent in doc.ents])

[('first quarter', 'DATE'), ('wsj logistics report', 'ORG'), ('costas', 'NORP'), ('paris', 'GPE'), ('35%', 'PERCENT'), ('first quarter year ago', 'DATE'), ('latin america', 'LOC'), ('china', 'GPE'), ('brazil', 'GPE'), ('fallout vale s.a.’s', 'ORG'), ('jd.com', 'PERSON'), ('chinese', 'NORP'), ('wsj', 'ORG'), ('li', 'PERSON'), ('china', 'GPE'), ('2', 'CARDINAL'), ('alibaba group holding ltd.', 'ORG'), ('recent weeks', 'DATE'), ('550', 'CARDINAL'), ('china', 'GPE'), ('nippon steel & sumitomo metal corp.', 'ORG'), ('japan', 'GPE'), ('bloomberg news', 'ORG'), ('asia', 'LOC'), ('taiwan', 'GPE'), ('china', 'GPE'), ('japanese', 'NORP'), ('apple inc.', 'ORG'), ('wsj', 'ORG'), ('takashi mochizuki', 'PERSON'), ('lifeline japan display inc.', 'ORG'), ('japan', 'GPE'), ('taiwanese', 'NORP'), ('tpk holding co.', 'ORG'), ('japan', 'GPE'), ('japan', 'GPE'), ('china', 'GPE'), ('beijing', 'GPE'), ('u.s.', 'GPE'), ('russian', 'NORP'), ('united co.', 'ORG'), ('$200 million', 'MONEY'), ('kentucky', 'GPE'),

In [22]:
# Highlight the recognised entities inline in the cleaned text.
# Requires `nlp` to have been loaded already in this kernel session.
displacy.render(
    nlp(news_df["clean_news_text"].iloc[100]),
    style="ent",
    jupyter=True,
)

NameError: name 'nlp' is not defined

In [218]:
# Highlight the recognised entities inline in the raw text.
displacy.render(
    nlp(news_df["text"].iloc[100]),
    style="ent",
    jupyter=True,
)

In [226]:
# (label, text) of the first entity in whichever text `doc` was last built from.
doc.ents[0].label_, doc.ents[0].text

('DATE', 'first quarter')