In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import requests
import pandas as pd
from os import getcwd
import urllib3
from bs4 import BeautifulSoup
from typing import List
from tqdm import tqdm
from time import sleep
import datetime as dt
from random import randint
from time import strptime, mktime, time

In [4]:
# Name of the project's root directory; used to locate the root from the
# current working directory.
PROJECT_NAME = "wsj_tweet_scrapping"
# Project root = the working directory truncated right after PROJECT_NAME.
# str.find() returns -1 when PROJECT_NAME is not part of the path, which would
# slice the working directory at an arbitrary position, so in that case the
# working directory itself is used as the root.
PROJECT_PATH = (
    getcwd()[: getcwd().find(PROJECT_NAME) + len(PROJECT_NAME)]
    if PROJECT_NAME in getcwd()
    else getcwd()
)
# Data directories below the project root (Windows-style separators).
DATA_PATH = f"{PROJECT_PATH}\\data"
EXCEL_PATH = f"{DATA_PATH}\\excel"

In [5]:
def extract_text_from_url(url, timeout=30.0):
    """Download ``url`` and return the text of all its ``<p>`` elements.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the caller
            indefinitely.

    Returns:
        The text of every ``<p>`` tag in document order, each one prefixed
        with a single space and concatenated into one string. An empty string
        is returned when the page contains no ``<p>`` tag.

    Raises:
        Any exception raised by urllib3 for a failed request (for example
        when the maximum number of retries/redirects is exceeded) is
        propagated to the caller.
    """
    # The context manager releases the pooled connections once the body has
    # been read, instead of leaving one open pool behind per call.
    with urllib3.PoolManager() as http:
        response = http.request(method="GET", url=url, timeout=timeout)
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(response.data, "html.parser")
    return "".join(" " + paragraph.text for paragraph in soup.find_all(name="p"))

In [6]:
def clean_text(text):
    """Normalise the whitespace of ``text``.

    Newlines, tabs, carriage returns and non-breaking spaces are turned into
    plain spaces; runs of whitespace are then collapsed to a single space and
    leading/trailing whitespace is removed.
    """
    to_space = str.maketrans(dict.fromkeys("\n\t\r\xa0", " "))
    words = text.translate(to_space).split()
    return " ".join(words)

In [7]:
def extract_urls_from_tweet_source(text_soup : str) -> List[str]:
    """Return the WSJ links found in a tweet's HTML source.

    Args:
        text_soup: HTML markup of the tweet. Values that are not text (for
            example the NaN pandas produces for an empty spreadsheet cell)
            are treated as "no markup".

    Returns:
        The ``data-url`` attribute of every ``<a>`` tag that has one and whose
        value contains ``"wsj"``, in document order. Empty list when there is
        none.
    """
    # An empty cell cannot be parsed as markup; report "no links" rather
    # than failing the whole batch.
    if not isinstance(text_soup, (str, bytes)):
        return []
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(text_soup, "html.parser")
    return [
        anchor["data-url"]
        for anchor in soup.find_all(name="a", attrs={"data-url": True})
        if "wsj" in anchor["data-url"]
    ]

In [9]:
# Accumulates one [year, month, url] entry for every article download that
# raised an exception, so the failures can be exported after the run.
failed_downloading_urls = []

In [16]:
# First day of every month from January 2009 through November 2011,
# ordered newest first.
dates = pd.date_range(
    start=dt.date(2009, 1, 1),
    end=dt.date(2011, 11, 1),
    freq=pd.offsets.MonthBegin(1),
).sort_values(ascending=False)

In [18]:
# For every month in `dates`: load that month's raw tweets, extract the WSJ
# links from each tweet, download and clean the linked articles, and write the
# tweets together with the article text to one Excel file per month.
t = tqdm(range(len(dates)))
for i in t:
    year = dates[i].year
    month = dates[i].month
    t.set_description(f"Iter : {i}, year : {year}, month : {month}")

    # reading raw tweets
    file_name = f"twint_wsj_logistics_tweets_{year}_{str(month).rjust(2,'0')}"
    monthly_tweets = pd.read_excel(f"{EXCEL_PATH}\\{file_name}.xlsx")

    if len(monthly_tweets) == 0:
        continue

    # processing raw tweets: one record per (tweet, WSJ link) pair
    processed_tweets = []
    for row_idx in range(len(monthly_tweets)):
        tweet = {
            "data-item-id": monthly_tweets["data-item-id"].iloc[row_idx],
            "data-conversation-id": monthly_tweets["data-conversation-id"].iloc[row_idx],
            "date": monthly_tweets["date"].iloc[row_idx],
            "tweet": monthly_tweets["tweet"].iloc[row_idx],
        }
        for url in extract_urls_from_tweet_source(monthly_tweets["all-data"].iloc[row_idx]):
            # Append a fresh dict per link. Appending the shared `tweet` dict
            # would make every record of this tweet point at its last link.
            processed_tweets.append({**tweet, "url": url})
    processed_tweets = pd.DataFrame(processed_tweets)
    processed_tweets = processed_tweets.drop_duplicates().reset_index(drop=True)

    if len(processed_tweets) == 0:
        continue

    # fetching news articles: each distinct link is downloaded once, even
    # when several tweets share it
    monthly_texts = []
    for url in tqdm(processed_tweets["url"].unique()):
        try:
            article_text = clean_text(extract_text_from_url(url))
            monthly_texts.append({"url": url, "text": article_text})
        except Exception as e:
            # Best effort: remember the failure and carry on with the next link.
            failed_downloading_urls.append([year, month, url])
            print(f"failed extracting news, exception : {e}")
        # Pause after every attempt, failed ones included, to stay polite
        # towards the server.
        sleep(2)

    # saving news article
    # Explicit columns keep the merge key present even when every download
    # of the month failed and the list is empty.
    monthly_texts = pd.DataFrame(monthly_texts, columns=["url", "text"])
    monthly_texts = pd.merge(processed_tweets, monthly_texts, on="url", how="left")

    file_name = f"twint_wsj_logistics_news_{year}_{str(month).rjust(2,'0')}"
    monthly_texts.drop_duplicates().to_excel(f"{EXCEL_PATH}\\{file_name}.xlsx", index=False)

Iter : 0, year : 2011, month : 11:   0%|                                                        | 0/35 [00:00<?, ?it/s]
  0%|                                                                                           | 0/32 [00:00<?, ?it/s][A
  3%|██▌                                                                                | 1/32 [00:03<01:35,  3.07s/it][A
  6%|█████▏                                                                             | 2/32 [00:06<01:36,  3.21s/it][A
  9%|███████▊                                                                           | 3/32 [00:11<01:49,  3.77s/it][A
 12%|██████████▍                                                                        | 4/32 [00:18<02:14,  4.80s/it][A
 16%|████████████▉                                                                      | 5/32 [00:22<01:57,  4.35s/it][A
 19%|███████████████▌                                                                   | 6/32 [00:25<01:45,  4.06s/it][A
 22%|██████████████

failed extracting news, exception : HTTPConnectionPool(host='www.wsj.com', port=80): Max retries exceeded with url: http://www.wsj.com/news/world/asia (Caused by ResponseError('too many redirects'))



 28%|███████████████████████▎                                                           | 9/32 [00:32<01:05,  2.85s/it][A
 31%|█████████████████████████▋                                                        | 10/32 [00:34<01:02,  2.83s/it][A
 34%|████████████████████████████▏                                                     | 11/32 [00:37<01:00,  2.88s/it][A
 38%|██████████████████████████████▊                                                   | 12/32 [00:41<01:00,  3.01s/it][A
 41%|█████████████████████████████████▎                                                | 13/32 [00:45<01:03,  3.32s/it][A
 44%|███████████████████████████████████▉                                              | 14/32 [00:48<00:59,  3.28s/it][A
 47%|██████████████████████████████████████▍                                           | 15/32 [00:53<01:05,  3.85s/it][A
 50%|█████████████████████████████████████████                                         | 16/32 [00:57<01:01,  3.84s/it][A
 53%|██████████

failed extracting news, exception : HTTPSConnectionPool(host='www.wsj.com', port=443): Max retries exceeded with url: https://www.wsj.com/asia (Caused by ResponseError('too many redirects'))



 24%|████████████████████                                                               | 8/33 [00:24<01:07,  2.72s/it][A
 27%|██████████████████████▋                                                            | 9/33 [00:27<01:08,  2.87s/it][A
 30%|████████████████████████▊                                                         | 10/33 [00:30<01:08,  2.97s/it][A
 33%|███████████████████████████▎                                                      | 11/33 [00:33<01:06,  3.01s/it][A
 36%|█████████████████████████████▊                                                    | 12/33 [00:36<01:04,  3.05s/it][A
 39%|████████████████████████████████▎                                                 | 13/33 [00:40<01:02,  3.13s/it][A
 42%|██████████████████████████████████▊                                               | 14/33 [00:43<00:59,  3.14s/it][A
 45%|█████████████████████████████████████▎                                            | 15/33 [00:46<00:56,  3.11s/it][A
 48%|██████████

In [12]:
# Export every failed download (year, month, url) to a spreadsheet whose name
# carries the current Unix timestamp.
failed_downloads_report = pd.DataFrame(
    failed_downloading_urls,
    columns=["year", "month", "url"],
)
failed_downloads_report.to_excel(
    f"{EXCEL_PATH}\\failed_download_news_url-{int(time())}.xlsx",
    index=False,
)