In [1]:
# Enable IPython's autoreload extension (mode 2) so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import requests
import pandas as pd
from os import getcwd
import urllib3
from bs4 import BeautifulSoup
from typing import List
from tqdm import tqdm
import numpy as np
from time import sleep

In [3]:
PROJECT_NAME = "wsj_tweet_scrapping"

# Project root: the working directory cut off right after the first occurrence
# of PROJECT_NAME. When the name is not part of the working directory,
# str.find() returns -1 and slicing with it would silently keep an arbitrary
# prefix of the path, so fall back to the working directory itself.
_cwd = getcwd()
PROJECT_PATH = (
    _cwd[: _cwd.find(PROJECT_NAME) + len(PROJECT_NAME)]
    if PROJECT_NAME in _cwd
    else _cwd
)

# NOTE: the sub-directories are joined with backslashes (Windows-style paths),
# matching how the rest of the notebook builds file names.
DATA_PATH = f"{PROJECT_PATH}\\data"
EXCEL_PATH = f"{DATA_PATH}\\excel"

In [4]:
def extract_text_from_url(url, parser="html.parser"):
    """Download a web page and return the text of its paragraphs.

    Args:
        url: Address of the page, fetched with an HTTP GET request.
        parser: Name of the BeautifulSoup parser used on the response body.
            It is given explicitly so the result does not depend on which
            optional parser libraries happen to be installed.

    Returns:
        The text of every ``<p>`` element in document order, each prefixed
        with a single space (a non-empty result therefore starts with a
        space). An empty string when the page contains no ``<p>`` element.

    Note:
        The HTTP status of the response is not checked; whatever body is
        returned gets parsed.
    """
    http = urllib3.PoolManager()
    response = http.request(method="GET", url=url)
    soup = BeautifulSoup(response.data, features=parser)
    # Build the result with a single join instead of growing a string in a loop.
    return "".join(" " + paragraph.text for paragraph in soup.find_all(name="p"))

In [5]:
def clean_text(text):
    """Normalize the whitespace of ``text``.

    Newlines, tabs, carriage returns and non-breaking spaces are turned into
    ordinary spaces; afterwards every run of whitespace is collapsed to one
    space and leading/trailing whitespace is removed.
    """
    unwanted_chars = ("\n", "\t", "\r", "\xa0")
    # Translation table mapping each unwanted character to a plain space.
    to_space = {ord(char): " " for char in unwanted_chars}
    words = text.translate(to_space).split()
    return " ".join(words)

In [152]:
# Period to process; both values are interpolated into the Excel file names.
year, month = 2019, 12

In [153]:
# Read the tweet export for the selected month; the month is left-padded with
# a zero to two characters in the file name.
month_tag = str(month).zfill(2)
file_name = f"twint_wsj_logistics_tweets_{year}_{month_tag}"
tweets_workbook = f"{EXCEL_PATH}\\{file_name}.xlsx"
monthly_tweets = pd.read_excel(tweets_workbook)
print(f"Found {len(monthly_tweets)} entries")

Found 100 entries


In [154]:
def extract_urls_from_tweet_source(text_soup: str) -> List[str]:
    """Collect the WSJ links found in a piece of tweet markup.

    Args:
        text_soup: Markup to parse (the notebook passes each row's
            ``all-data`` value).

    Returns:
        The ``data-url`` value of every ``<a>`` element carrying that
        attribute whose value contains the substring ``"wsj"``, in document
        order. An empty list when there is no such element.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries are installed.
    soup = BeautifulSoup(text_soup, features="html.parser")
    anchors = soup.find_all(name="a", attrs={"data-url": True})
    return [anchor["data-url"] for anchor in anchors if "wsj" in anchor["data-url"]]

In [155]:
# Columns copied unchanged from each tweet row into the output records.
tweet_columns = ["data-item-id", "data-conversation-id", "date", "tweet"]

# One record per (tweet, WSJ link) pair.
tweet_records = []
for i in tqdm(range(len(monthly_tweets))):
    tweet = {column: monthly_tweets[column].iloc[i] for column in tweet_columns}
    for url in extract_urls_from_tweet_source(monthly_tweets["all-data"].iloc[i]):
        # Append a fresh dict per link. Appending one shared dict and
        # overwriting its "url" key would leave every entry of a multi-link
        # tweet holding the last link, and drop_duplicates would then discard
        # the earlier links.
        tweet_records.append({**tweet, "url": url})

# Explicit columns keep "url" present even when no link was found at all.
processed_tweets = pd.DataFrame(tweet_records, columns=[*tweet_columns, "url"])
processed_tweets = processed_tweets.drop_duplicates().reset_index(drop=True)
print(f"Found {len(processed_tweets)} entries")

100%|███████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 367.65it/s]

Found 27 entries





In [156]:
# Preview the first rows of the de-duplicated tweet/URL table.
processed_tweets.head()

Unnamed: 0,data-item-id,data-conversation-id,date,tweet,url
0,1206966719274717184,1206966719274717184,2020-12-17,Today’s newsletter: Amazon’s Logistics Shado...,https://on.wsj.com/2M63Dn3
1,1211659495602044928,1211659495602044928,2020-12-30,Today’s newsletter: The top stories shaping ...,https://on.wsj.com/2F7QzK0
2,1206600682691796992,1206600682691796992,2020-12-16,Today’s Logistics Report: Competing at the C...,https://on.wsj.com/35stqxo
3,1205503378127765504,1205503378127765504,2020-12-13,Trading Over Tariffs; Crude’s Freight Rates;...,https://on.wsj.com/35kWwyq
4,1209473861655433217,1209473861655433217,2020-12-24,Today’s Logistics Report: Delivering the Hol...,https://on.wsj.com/2ELDO7L


In [157]:
# Pause, in seconds, after each page request.
REQUEST_DELAY_SECONDS = 1

# Several tweets can share a link, so fetch every distinct URL only once.
# Duplicate URL rows here would also multiply rows in the later merge on "url".
monthly_texts = []
for url in tqdm(processed_tweets["url"].unique()):
    article_text = clean_text(extract_text_from_url(url))
    monthly_texts.append({"url": url, "text": article_text})
    sleep(REQUEST_DELAY_SECONDS)

100%|██████████████████████████████████████████████████████████████████████████████████| 27/27 [00:53<00:00,  1.97s/it]


In [158]:
# Attach the scraped text to every tweet/URL row. The left join keeps rows
# whose URL has no scraped text; explicit columns guarantee the "url" merge key
# exists even when nothing was scraped.
monthly_texts_df = pd.DataFrame(monthly_texts, columns=["url", "text"])
monthly_texts_df = pd.merge(processed_tweets, monthly_texts_df, on="url", how="left")

# De-duplicate before reporting so the printed count matches the rows written.
monthly_news_df = monthly_texts_df.drop_duplicates()
print(f"Found {len(monthly_news_df)} entries")

file_name = f"twint_wsj_logistics_news_{year}_{str(month).rjust(2,'0')}"
monthly_news_df.to_excel(f"{EXCEL_PATH}\\{file_name}.xlsx", index=False)

Found 27 entries
