# Cleaning Wall Street Journal Logistics News

In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before each
# cell executes, so edits to imported project modules are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from tqdm import tqdm
from os import getcwd, path as base_path
from spacy import displacy
from collections import Counter
import en_core_web_sm
from typing import List
import numpy as np

import io
import re

from typing import List

In [3]:
from sys import path
# Locate the project root (the directory that contains "notebooks") and expose
# its "src" folder on sys.path so project modules can be imported.
_cwd = getcwd()
_notebooks_at = _cwd.find("notebooks")
# str.find returns -1 when the marker is absent; slicing with that would silently
# chop characters off the path, so fall back to the working directory instead.
PROJECT_PATH = _cwd[:_notebooks_at].rstrip("\\/") if _notebooks_at > 0 else _cwd
_src_path = base_path.join(PROJECT_PATH, "src")  # portable separator instead of a hard-coded "\\"
if _src_path not in path:  # keep sys.path free of duplicates when the cell is re-run
    path.append(_src_path)

In [4]:
from utils import (get_words_contractions, 
                   find_urls_in_text, 
                   get_stopwords)

In [5]:
# Derive the data directories from the project root, found by looking for the
# project folder name inside the current working directory.
PROJECT_NAME = "wsj_tweet_scrapping"
_cwd = getcwd()
_name_at = _cwd.find(PROJECT_NAME)
# If the folder name is not part of the cwd (e.g. the repo was cloned under another
# name), str.find returns -1 and the slice would be garbage; use the cwd as-is then.
PROJECT_PATH = _cwd[: _name_at + len(PROJECT_NAME)] if _name_at != -1 else _cwd
DATA_PATH = base_path.join(PROJECT_PATH, "data")
EXCEL_PATH = base_path.join(DATA_PATH, "excel")

In [6]:
# Year range that identifies which scraped workbook to load (also used when saving).
from_year = 2011
to_year = 2020

# Read the scraped newsletters, drop every row with a missing value in any column,
# and renumber the index so it is contiguous again.
news_df = (
    pd.read_excel(base_path.join(EXCEL_PATH, f"wsj_logistics_news_{from_year}-{to_year}.xlsx"))
    .dropna()
    .reset_index(drop=True)
)

In [7]:
# Report how many articles remain after the NaN filter, then preview the frame.
article_count = news_df.shape[0]
print(f"Found {article_count} news articles.")
news_df.head()

Found 2473 news articles.


Unnamed: 0,date,tweet,url,news
0,2020-09-24,Today’s newsletter - Cleaning Vehicle Emissi...,https://on.wsj.com/2EuJDdb,\n\tNews and analysis on the world of logisti...
1,2020-09-16,FedEx's Christmas in July; Probing Nikola Cl...,https://on.wsj.com/2Fsewzz,\n\tNews and analysis on the world of logisti...
2,2020-09-14,Shipping’s E-Commerce Drive; Short-Selling N...,https://on.wsj.com/3iv2qDW,\n\tNews and analysis on the world of logisti...
3,2020-09-10,Brexit’s New Alarms; Railroad Offloads Bidde...,https://on.wsj.com/3inRU1m,\n\tNews and analysis on the world of logisti...
4,2020-09-23,Airlines Turn to Freight; Driving Robot Truc...,https://on.wsj.com/32TvO1a,\n\tNews and analysis on the world of logisti...


In [8]:
# Make sure every article body is a plain string before the text helpers run on it.
news_df["news"] = news_df["news"].astype(str)
# Discard a leftover "clean_news" column if the workbook has one; no-op otherwise.
news_df.drop(columns="clean_news", errors="ignore", inplace=True)

# Text Cleaning 

In [57]:
def identify_sentence_from_identifier(text : str, identifiers : List, split_at : str = ". "):
    """Return the sentences of ``text`` that contain at least one identifier.

    The text is split on ``split_at``. Each matching fragment is stripped of
    surrounding whitespace and returned once, in order of first appearance.
    Every fragment except the last was followed by the separator in the
    original text, so its closing "." is restored; the last fragment is
    returned as-is so that the result is still a substring of ``text`` and can
    be passed straight to ``str.replace``.

    Args:
        text: Text to search.
        identifiers: Substrings that mark a sentence as a match.
        split_at: Separator used to break ``text`` into sentences.

    Returns:
        List of unique matching sentences (empty when nothing matches).
    """
    fragments = text.split(split_at)
    last_index = len(fragments) - 1

    identified_sentences = []
    already_seen = set()
    for index, fragment in enumerate(fragments):
        # One hit is enough: a fragment matching several identifiers is still one sentence.
        if not any(identifier in fragment for identifier in identifiers):
            continue
        sentence = fragment.strip()
        if index != last_index:
            # The split consumed this fragment's terminating period; put it back.
            sentence += "."
        # De-duplicate on the value that is actually stored, not on the raw fragment.
        if sentence not in already_seen:
            already_seen.add(sentence)
            identified_sentences.append(sentence)
    return identified_sentences

In [58]:
def clean_news(news, 
               remove_stopwords : bool = False, 
               process_contractions : bool = False, 
               remove_special_chars : bool = False,
               remove_all_special_chars : bool = False,
               remove_single_alpha_char_word : bool = False):
    """Strip newsletter boilerplate from one article and normalise its whitespace.

    Steps, in order:
      1. remove every URL reported by ``find_urls_in_text``;
      2. replace newline / tab / carriage-return / non-breaking-space with a space;
      3. cut the article at the first "Sign up here." and at the first "Write to";
      4. remove sentences that contain a known boilerplate phrase;
      5. collapse runs of whitespace into single spaces.

    Args:
        news: Raw article text.
        remove_stopwords, process_contractions, remove_special_chars,
        remove_all_special_chars, remove_single_alpha_char_word: Accepted for
            backward compatibility only. The corresponding steps are disabled,
            so these flags currently have no effect on the result.

    Returns:
        The cleaned article text (original casing is preserved).
    """
    # Remove URLs, longest first, so a URL that is a prefix of a longer one
    # cannot leave the tail of the longer one behind. Empty matches are skipped
    # because str.replace("", " ") would insert a space between every character.
    for url in sorted(dict.fromkeys(find_urls_in_text(news)), key=len, reverse=True):
        if url:
            news = news.replace(url, " ")

    # Layout characters become plain spaces (collapsed at the end).
    for layout_char in ("\n", "\t", "\r", "\xa0"):
        news = news.replace(layout_char, " ")

    # Everything from these markers onwards is footer text. They are literal
    # phrases, so use str.find rather than a regex (the "." would be a wildcard).
    for marker in ("Sign up here.", "Write to"):
        cut_at = news.find(marker)
        if cut_at != -1:
            news = news[:cut_at]

    remove_sentences_with_identifiers = [
        "News and analysis on the world",
        "Trouble viewing this email?",
        "Enter News, Quotes",
        "Read more",
        "Get your supply chain ready for",
        "Explore insights on how to master",
        "Follow the WSJ Logistics Report",
        # These two entries used to be fused by a missing comma into a single
        # string ("...newsletterdelivered...") that could never match.
        "Sign up: With one click, get this newsletter",
        "delivered to your inbox",
        "Trouble viewing this email? View in web browser",
        "Write to him at paul.page@wsj.com. ",
        "is editor of WSJ",
        "Write to him",
    ]
    remove_sentences = identify_sentence_from_identifier(
        text=news, identifiers=remove_sentences_with_identifiers
    )
    # `or []` tolerates a helper that returns None when it finds nothing to split.
    for sentence in remove_sentences or []:
        news = news.replace(sentence, " ")

    # Collapse all whitespace runs left behind by the replacements above.
    news = " ".join(news.split())

    return news

# Minor, source-specific cleaning steps

In [168]:
# 1. Remove lines containing the word "PHOTO" (photographer name, etc)
tags_split = {"PHOTO" : "\n"} # {tag : split_identifier}
clean_texts = []
for text in news_df["news"]:
    for tag, line_separator in tags_split.items():
        # Sentences that mention the tag; `or []` tolerates a None result.
        tagged_sentences = identify_sentence_from_identifier(text=text, identifiers=[tag]) or []
        # Within those sentences, keep only the individual lines that carry the tag.
        tagged_lines = [line
                        for sentence in tagged_sentences
                        for line in sentence.split(line_separator)
                        if tag in line]
        # Strip inside the tag loop so every configured tag is applied, not just the last.
        for line in tagged_lines:
            text = text.replace(line, " ")
    clean_texts.append(text)

In [169]:
# to remove news channel from news (helps in filtering out news channel name from company names identified in news articles)
# A "channel" here is any parenthesised snippet, e.g. "(WSJ)"; only snippets that
# recur across enough articles are treated as channel names and removed.
MIN_CHANNEL_ARTICLES = 5  # minimum number of articles a snippet must appear in

# Count, for each parenthesised snippet, the number of articles it occurs in
# (set() makes each article contribute at most once per snippet).
channel_article_counts = Counter(
    channel
    for text in clean_texts
    for channel in set(re.findall(r"\(.*?\)", text))
)
# Most frequent first; ties in alphabetical order.
all_news_channels = sorted(channel_article_counts.items(), key=lambda item: (-item[1], item[0]))

main_news_channels = [channel for channel, count in all_news_channels if count >= MIN_CHANNEL_ARTICLES]

# removing news channel names from news articles
stripped_texts = []
for text in clean_texts:
    for news_channel in main_news_channels:
        text = text.replace(news_channel, " ")
    stripped_texts.append(text)
clean_texts = stripped_texts
del stripped_texts

# General major cleaning steps

In [172]:
# Run the general cleaner (default options) over every pre-processed article.
clean_texts = list(map(clean_news, clean_texts))

In [173]:
# Store the cleaned articles next to the raw ones; clean_texts was built by iterating
# news_df["news"], so it is in the same row order as the frame.
news_df["news_cleaned"] = clean_texts

In [174]:
# Preview the first rows to compare the raw "news" column with "news_cleaned".
news_df.head()

Unnamed: 0,date,tweet,url,news,news_cleaned
0,2020-09-24,Today’s newsletter - Cleaning Vehicle Emissi...,https://on.wsj.com/2EuJDdb,\n\tNews and analysis on the world of logisti...,California is taking an aggressive step to ove...
1,2020-09-16,FedEx's Christmas in July; Probing Nikola Cl...,https://on.wsj.com/2Fsewzz,\n\tNews and analysis on the world of logisti...,FedEx is turning the surge in e-commerce deman...
2,2020-09-14,Shipping’s E-Commerce Drive; Short-Selling N...,https://on.wsj.com/3iv2qDW,\n\tNews and analysis on the world of logisti...,Container shipping lines are increasingly tryi...
3,2020-09-10,Brexit’s New Alarms; Railroad Offloads Bidde...,https://on.wsj.com/3inRU1m,\n\tNews and analysis on the world of logisti...,Fears over the fallout from a “hard” Brexit ar...
4,2020-09-23,Airlines Turn to Freight; Driving Robot Truc...,https://on.wsj.com/32TvO1a,\n\tNews and analysis on the world of logisti...,Just four of the world’s 30 largest passenger ...


In [177]:
# Write the frame (raw and cleaned text) to a new workbook alongside the input file.
# os.path.join is used instead of a hard-coded "\\" so the path is valid on any OS.
file_name = base_path.join(EXCEL_PATH, f"wsj_logistics_news_cleaned_{from_year}-{to_year}.xlsx")
news_df.to_excel(file_name, index=False)