In [129]:
# Enable IPython's autoreload extension; mode 2 re-imports all loaded modules
# before each cell is executed, so edits to imported project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [126]:
import os
import re
import sys
from glob import glob
from os import getcwd, path

import pandas as pd
import spacy
from tqdm import tqdm


In [131]:
# Make the project's `src` folder importable.
# `sys.path` is reached through the `sys` module on purpose: `from sys import path`
# would rebind the name `path`, which the imports cell binds to `os.path` and
# which later cells call as `path.basename(...)`.
notebooks_idx = getcwd().find("notebooks")
# Project root = working directory up to (not including) the "notebooks" folder;
# fall back to the working directory itself when that folder is not in the path.
PROJECT_PATH = getcwd()[:notebooks_idx].rstrip("\\/") if notebooks_idx != -1 else getcwd()
SRC_PATH = os.path.join(PROJECT_PATH, "src")
# Guard against appending a duplicate entry every time the cell is re-run.
if SRC_PATH not in sys.path:
    sys.path.append(SRC_PATH)

In [141]:
from utils import (get_words_contractions, 
                   find_urls_in_text, 
                   get_stopwords)

In [12]:
PROJECT_NAME = "wsj_tweet_scrapping"
# Project root: the working directory truncated right after the project folder
# name. NOTE: assumes the notebook is launched from somewhere inside that folder;
# if the name is absent, str.find returns -1 and the slice is meaningless.
PROJECT_PATH = getcwd()[: getcwd().find(PROJECT_NAME) + len(PROJECT_NAME)]
# os.path.join uses the platform's separator (identical to the previous
# hard-coded "\\" on Windows, and valid on Linux/macOS as well).
DATA_PATH = os.path.join(PROJECT_PATH, "data")
EXCEL_PATH = os.path.join(DATA_PATH, "excel")

# Substring used to recognise the scraped news workbooks by file name.
FILE_FORMAT = "twint_wsj_logistics_news_"

In [14]:
# Every .xlsx workbook in the Excel folder whose file name contains FILE_FORMAT.
# `os.path` is referenced explicitly because the bare name `path` may have been
# rebound to `sys.path` by an earlier cell; sorted() makes the load order
# reproducible, since glob gives no ordering guarantee.
news_files = sorted(
    file_
    for file_ in glob(os.path.join(EXCEL_PATH, "*.xlsx"))
    if FILE_FORMAT in os.path.basename(file_)
)

In [33]:
# Read each workbook and collect its rows (first six columns only) in one flat list.
all_data = []
for news_file in tqdm(news_files):
    sheet_values = pd.read_excel(news_file).values
    # Extending with a 2-D array adds one row array per list element.
    all_data.extend(sheet_values[:, :6])

100%|██████████████████████████████████████████████████████████████████████████████████| 21/21 [00:00<00:00, 75.29it/s]


In [85]:
# Column labels come from the first workbook's header. Only the first six are
# kept so they always line up with the six-column row slices held in all_data.
cols = pd.read_excel(news_files[0]).columns[:6]
news_df = pd.DataFrame(all_data, columns=cols)
# Chronological order with a fresh 0..n-1 index.
news_df = news_df.sort_values("date").reset_index(drop=True)
news_df.head()

Unnamed: 0,data-item-id,data-conversation-id,date,tweet,url,text
0,1080446429087305984,1080446429087305984,2019-01-02,The captain of car carrier Sincerity Ace rep...,https://on.wsj.com/2CJAA4d,https://www.wsj.com/articles/mitsui-ship-is-ab...
1,1081220232008617984,1081220232008617984,2019-01-04,Today’s newsletter: China Bites Apple; Spill...,https://on.wsj.com/2F8CPja,https://www.wsj.com/articles/todays-logistics-...
2,1082297091622429056,1082297091622429056,2019-01-07,Today's newsletter: Hiring for Logistics; Th...,https://on.wsj.com/2CWrp0v,"News and analysis on the world of logistics, f..."
3,1082590862540984064,1082590862540984064,2019-01-08,Shipping industry in U.K. faces logistics ni...,https://on.wsj.com/2FeT6TX,https://www.wsj.com/articles/u-k-finds-a-fault...
4,1082971472342195968,1082971472342195968,2019-01-09,J.B. Hunt is buying a New Jersey-based home-...,https://on.wsj.com/2CXK0cG,https://www.wsj.com/articles/j-b-hunt-builds-u...


In [96]:
# Spot-check URL extraction on the first article. The bare name `find` is not
# defined or imported anywhere in this notebook; `find_urls_in_text` (imported
# from utils) is presumably the helper that was meant - confirm.
find_urls_in_text(news_df.iloc[0]["text"])

['https://www.wsj.com/articles/mitsui-ship-is-ablaze-in-middle-of-pacific-ocean-11546429972']

In [165]:
# Scratch check of len() on a short string; nothing else in the notebook uses it.
len("22323")

5

In [174]:
def clean_news(news: str,
               remove_stopwords: bool = False,
               process_contractions: bool = False,
               remove_all_special_chars: bool = False,
               remove_single_alpha_char_word: bool = False) -> str:
    """Normalise the raw text of one news article.

    Steps, in order:
      1. blank out fixed newsletter boilerplate sentences and every URL found
         by ``find_urls_in_text``;
      2. lower-case the text;
      3. optionally replace tokens found in ``get_words_contractions()``;
      4. optionally drop tokens found in ``get_stopwords(lang="english")``;
      5. replace every character outside a whitelist with a space;
      6. optionally drop one-character tokens that are not digits;
      7. collapse all runs of whitespace into single spaces.

    Parameters
    ----------
    news : str
        Article text. ``None`` or a float NaN is treated as missing text.
    remove_stopwords : bool
        Drop whitespace-separated tokens that are stop words.
    process_contractions : bool
        Replace whitespace-separated tokens that are keys of the contractions
        mapping with the mapped value.
    remove_all_special_chars : bool
        If True keep only ``a-z``, ``0-9`` and spaces; otherwise also keep
        ``.``, ``,``, ``'`` and ``’``.
    remove_single_alpha_char_word : bool
        Drop every one-character token unless it is a digit (despite the name,
        this also drops a lone ``.`` or ``,``).

    Returns
    -------
    str
        The cleaned text, or ``""`` when ``news`` is missing.
    """
    # Missing values (None / float NaN) carry no text to clean.
    if news is None or (isinstance(news, float) and news != news):
        return ""

    urls_to_remove = find_urls_in_text(news)

    remove_texts = ["Sign up: With one click, get this newsletter "
                    "delivered to your inbox.",
                    "News and analysis on the world of logistics, "
                    "from supply chain to transport and technology. "
                    "Trouble viewing this email? View in web browser",
                    "Write to him at paul.page@wsj.com. "
                    "Follow the WSJ Logistics Report team: @PaulPage, "
                    "@jensmithWSJ, @EEPhillips_WSJ, @CostasParis. "
                    "Follow the WSJ Logistics Report on Twitter at @WSJLogistics."
                    ]

    # Boilerplate is matched case-sensitively, so it must go before lower-casing.
    for text in remove_texts:
        news = news.replace(text, " ")

    # Longest URLs first: removing a URL that is a prefix/substring of a longer
    # one first would leave the tail of the longer URL behind in the text.
    for url in sorted(urls_to_remove, key=len, reverse=True):
        news = news.replace(url, " ")

    news = news.lower()

    if process_contractions:
        contractions = get_words_contractions()
        news = " ".join(contractions[word] if word in contractions else word
                        for word in news.split())

    if remove_stopwords:
        # Set membership is O(1) per token.
        # assumes get_stopwords returns a collection of words - TODO confirm
        stopwords = set(get_stopwords(lang="english"))
        # NOTE(review): tokens still carry punctuation here (it is stripped in
        # the next step), so a token such as "the," is not matched as "the".
        news = " ".join(word for word in news.split() if word not in stopwords)

    if remove_all_special_chars:
        chars_to_keep = "abcdefghijklmnopqrstuvwxyz1234567890 "
    else:
        chars_to_keep = "abcdefghijklmnopqrstuvwxyz1234567890 .,'’"

    # Anything outside the whitelist becomes a space (not deleted) so that
    # words joined by e.g. a hyphen are split rather than fused.
    news = "".join(char_ if char_ in chars_to_keep else " " for char_ in news)

    if remove_single_alpha_char_word:
        news = " ".join(word for word in news.split()
                        if word.isdigit() or len(word) > 1)

    # Collapse whitespace and trim both ends.
    return " ".join(news.split())

In [175]:
# Clean every article with all options switched on. Missing texts are filled
# with "" first: str(NaN) would otherwise produce the literal token "nan".
clean_news_text = news_df["text"].fillna("").astype(str).apply(
    clean_news,
    process_contractions=True,
    remove_stopwords=True,
    remove_all_special_chars=True,
    remove_single_alpha_char_word=True,
)

In [176]:
# Store the cleaned text next to the raw text as a new column (mutates news_df in place).
news_df["clean_news_text"] = clean_news_text 

In [177]:
# Preview the first rows, now including the clean_news_text column.
news_df.head()

Unnamed: 0,data-item-id,data-conversation-id,date,tweet,url,text,clean_news_text
0,1080446429087305984,1080446429087305984,2019-01-02,The captain of car carrier Sincerity Ace rep...,https://on.wsj.com/2CJAA4d,https://www.wsj.com/articles/mitsui-ship-is-ab...,japanese owned car carrier bound hawaii ablaze...
1,1081220232008617984,1081220232008617984,2019-01-04,Today’s newsletter: China Bites Apple; Spill...,https://on.wsj.com/2F8CPja,https://www.wsj.com/articles/todays-logistics-...,china economic stumbles reverberating strongly...
2,1082297091622429056,1082297091622429056,2019-01-07,Today's newsletter: Hiring for Logistics; Th...,https://on.wsj.com/2CWrp0v,"News and analysis on the world of logistics, f...",strong hiring transportation logistics sector ...
3,1082590862540984064,1082590862540984064,2019-01-08,Shipping industry in U.K. faces logistics ni...,https://on.wsj.com/2FeT6TX,https://www.wsj.com/articles/u-k-finds-a-fault...,british policy makers calling shipping industr...
4,1082971472342195968,1082971472342195968,2019-01-09,J.B. Hunt is buying a New Jersey-based home-...,https://on.wsj.com/2CXK0cG,https://www.wsj.com/articles/j-b-hunt-builds-u...,hunt transport services inc snapping another h...


In [178]:
# Sanity check: a lone non-digit character is dropped, leaving an empty string.
clean_news("s", remove_single_alpha_char_word = True)

''

In [181]:
# First 400 characters of the raw text of row 100, for comparison with its cleaned version.
news_df.iloc[100]["text"][:400]

'News and analysis on the world of logistics, from supply chain to transport and technology. Trouble viewing this email? View in web browser › The dry-bulk shipping sector at the heart of the global industrial economy is navigating stormy seas. Carriers stepped up their scrapping of the biggest bulk transport vessels in the first quarter, the WSJ Logistics Report’s Costas Paris writes, as shipping '

In [183]:
# First 207 characters of the cleaned text of the same row.
news_df.iloc[100]["clean_news_text"][:207]

'dry bulk shipping sector heart global industrial economy navigating stormy seas carriers stepped scrapping biggest bulk transport vessels first quarter wsj logistics report costas paris writes shipping rates'

In [157]:
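# Export step, disabled so that re-running the notebook does not overwrite the
# saved workbook. Uncomment to write the cleaned data set to Excel.
# news_df.to_excel(f"{EXCEL_PATH}\\wsj-news-2019-2020.xlsx", index=False)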