In [1]:
from bs4 import BeautifulSoup
import lxml
import requests
import pandas as pd
import re
import json

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

In [2]:
def read_snp_csv(csv_path='/Users/benschlagman/Desktop/UCL Year 3/Final Year Project/SNP/SNP Companies.csv'):
    """Load the company list and return its FT URLs and company names.

    Parameters
    ----------
    csv_path : str or path-like, optional
        Location of the companies CSV. The file must contain an ``FT``
        column and a ``Name`` column. The default is the original
        hard-coded location, kept so that existing zero-argument calls keep
        working; pass a path explicitly to run on another machine.

    Returns
    -------
    tuple of (list, list)
        ``(snp500ft, snp500Stock)``: the values of the ``FT`` column and the
        values of the ``Name`` column, both in file row order.

    Raises
    ------
    KeyError
        If the ``FT`` or ``Name`` column is missing from the file.
    """
    snp500 = pd.read_csv(csv_path)

    # Fail with a message that names the file instead of a bare column KeyError.
    missing = [column for column in ('FT', 'Name') if column not in snp500.columns]
    if missing:
        raise KeyError(f"{csv_path!r} is missing required column(s): {missing}")

    snp500ft = snp500['FT'].to_list()
    snp500Stock = snp500['Name'].to_list()
    return snp500ft, snp500Stock

In [3]:
def create_empty_dataframe():
    """Return the seed frame used to accumulate scraped articles.

    The frame has the columns Date, Link, Title, Subtitle and Stock and
    contains a single row (index 0) in which every field is an empty string.
    """
    column_names = ('Date', 'Link', 'Title', 'Subtitle', 'Stock')
    seed_columns = {column: [''] for column in column_names}
    return pd.DataFrame(seed_columns)

In [4]:
def _parse_ft_teaser(article):
    """Extract Date, Link, Title and Subtitle from one teaser ``<li>`` element.

    Returns a dict with those four keys, or ``None`` when the teaser has no
    heading link (without a link and title there is nothing to record).
    Date and Subtitle are ``None`` when their elements are absent.
    """
    heading = article.find('a', class_='js-teaser-heading-link')
    if heading is None or not heading.get('href'):
        return None

    date_tag = article.find('div', class_='stream-card__date')
    standfirst = article.find('a', class_='js-teaser-standfirst-link')
    return {
        'Date': date_tag.text.strip() if date_tag is not None else None,
        'Link': 'https://www.ft.com' + heading.get('href'),
        'Title': heading.text.strip(),
        'Subtitle': standfirst.text.strip() if standfirst is not None else None,
    }


def scrape_ft_pages(snp500ft, snp500Stock):
    """Scrape article teasers from the FT stream pages of every company.

    For each URL in ``snp500ft`` the starting page is parsed, followed by the
    pages obtained by replacing the URL's final character with 3, 4, ..., 23.
    NOTE(review): this assumes each URL ends in a single-digit page number;
    confirm against the input CSV.

    Parameters
    ----------
    snp500ft : list of str
        Starting FT stream URL for each company.
    snp500Stock : list of str
        Company name for each URL, in the same order as ``snp500ft``.

    Returns
    -------
    pandas.DataFrame
        Columns Date, Link, Title, Subtitle, Stock. Row 0 is the blank seed
        row from ``create_empty_dataframe()`` (kept so downstream row indices
        are unchanged); every following row is one scraped teaser.

    Notes
    -----
    A page that cannot be fetched is skipped with a warning instead of
    aborting the whole scrape. Each teaser is parsed independently, so a
    teaser with a missing element never inherits values from the previous
    teaser.
    """
    import warnings

    columns = ['Date', 'Link', 'Title', 'Subtitle', 'Stock']
    # Collect plain dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and row-by-row growth is quadratic.
    records = create_empty_dataframe().to_dict('records')

    for base_url, stock in zip(snp500ft, snp500Stock):
        # Same set of pages the original loop parsed; the extra final page
        # that it downloaded but never parsed is no longer requested.
        page_urls = [base_url] + [base_url[:-1] + str(page_no) for page_no in range(3, 24)]

        for url in page_urls:
            try:
                page = requests.get(url, timeout=10)
            except requests.exceptions.RequestException as exc:
                warnings.warn(f"Skipping {url}: {exc}")
                continue

            soup = BeautifulSoup(page.text, 'lxml')
            for article in soup.find_all('li', class_='o-teaser-collection__item o-grid-row'):
                record = _parse_ft_teaser(article)
                if record is None:
                    continue
                record['Stock'] = stock
                records.append(record)

    return pd.DataFrame(records, columns=columns)

In [5]:
def clean_data(df):
    """Return ``df`` with duplicate articles removed.

    Three passes are applied in order, each keeping the first occurrence:
    rows identical in every column, then rows sharing a Subtitle, then rows
    sharing a Title. The input frame is not modified and the surviving rows
    keep their original index labels.
    """
    deduped = df.drop_duplicates()
    for column in ('Subtitle', 'Title'):
        deduped = deduped.drop_duplicates(subset=column, keep="first")
    return deduped

In [6]:
def save_to_csv(cleandf):
    """Write ``cleandf`` (index included) to 'snpwebscrape.csv' in the working directory."""
    output_path = 'snpwebscrape.csv'
    cleandf.to_csv(output_path)

In [7]:
def get_authors():
    """Look up the author names of every article listed in 'snpwebscrape.csv'.

    Each URL in the file's ``Link`` column is fetched (3 second timeout) and
    the ``author`` entry of the page's first ``application/ld+json`` script
    block is read.

    Returns
    -------
    list
        One entry per row of the CSV, in row order. An entry is a list of
        author name strings, or the string ``'Error'`` when the page could
        not be fetched or its author metadata could not be read.
    """
    df = pd.read_csv('snpwebscrape.csv')
    authors_list = []
    for link in df['Link'].tolist():
        try:
            page = requests.get(link, timeout=3)
            soup = BeautifulSoup(page.text, 'html.parser')
            page_data = [
                json.loads(tag.string)
                for tag in soup.find_all('script', type='application/ld+json')
            ]
            authors = page_data[0]['author']
            # JSON-LD permits a single author object instead of a list;
            # wrap it so it is handled like the list form.
            if isinstance(authors, dict):
                authors = [authors]
            authors_list.append([author['name'] for author in authors])
        except Exception:
            # Best effort: any per-article failure is recorded as 'Error'.
            # Catching Exception (not a bare except) lets KeyboardInterrupt
            # through, so this long-running loop can still be stopped.
            authors_list.append('Error')
    return authors_list

In [8]:
# Run the pipeline: load the company list, scrape the FT teasers,
# de-duplicate them and save the result to 'snpwebscrape.csv'.
snp500ft, snp500Stock = read_snp_csv()
# NOTE(review): this frame is overwritten by the very next line, so the call
# has no lasting effect here.
df = create_empty_dataframe()
df = scrape_ft_pages(snp500ft, snp500Stock)
cleandf = clean_data(df)
save_to_csv(cleandf)
# get_authors() reads 'snpwebscrape.csv' (written by save_to_csv) and returns
# one entry per row of that file.
authors_list = get_authors()

# Work on a copy so cleandf itself does not gain the Authors column.
nlp_cleandf = cleandf.copy() 
nlp_cleandf['Authors'] = authors_list
# Keep only rows whose lookup did not yield the 'Error' marker and whose
# author list is non-empty.
nlp_cleandf = nlp_cleandf[(nlp_cleandf['Authors'] != 'Error') & (nlp_cleandf['Authors'].str.len() > 0)]
nlp_cleandf.to_csv('snp_ft.csv')

In [9]:
# Display the filtered frame (articles with their authors) as the cell output.
nlp_cleandf

Unnamed: 0,Date,Link,Title,Subtitle,Stock,Authors
1,"Friday, 23 December, 2022",https://www.ft.com/content/4ff64604-a421-422c-...,Meta and Alphabet lose dominance over US digit...,Long-held duopoly that rules the $300bn market...,Apple Inc.,[Patrick McGee]
2,"Friday, 16 December, 2022",https://www.ft.com/content/0c2d56f7-a402-45ea-...,Apple moves to open up App Store as tough EU l...,Digital Markets Act represents biggest threat ...,Apple Inc.,"[Javier Espinoza, Tim Bradshaw, Patrick McGee]"
3,"Friday, 9 December, 2022",https://www.ft.com/content/8cd27d16-c996-4dc7-...,Apple to end employee gagging clauses after ac...,Company responds to pressure over claims staff...,Apple Inc.,[Patrick McGee]
4,"Thursday, 8 December, 2022",https://www.ft.com/content/681d117f-b8c7-4d2f-...,Little America — idiosyncratic stories of immi...,The Apple TV Plus series returns with Somalian...,Apple Inc.,[Dan Einav]
7,"Monday, 5 December, 2022",https://www.ft.com/content/f0284cb2-ee5c-4fe1-...,Covid chaos at Foxconn iPhone plant causes 29%...,World’s biggest contract electronics manufactu...,Apple Inc.,[Kathrin Hille]
8,"Saturday, 3 December, 2022",https://www.ft.com/content/c2a96807-f931-4c14-...,Tim Cook charm resolves Twitter spat yet China...,Apple chief placates Elon Musk but faces bigge...,Apple Inc.,[Patrick McGee]
9,"Friday, 2 December, 2022",https://www.ft.com/content/083e038c-9b10-45d1-...,Inside the Covid revolt at the Zhengzhou ‘iPho...,Foxconn plant workers tell FT of chaos that re...,Apple Inc.,"[Gloria Li, Ryan McMorrow, Nian Liu, Kathrin H..."
10,"Thursday, 1 December, 2022",https://www.ft.com/content/9eae6dfd-012f-4793-...,Sony’s Apple strategy and Jack Ma in Tokyo,The inside story on the Asia tech trends that ...,Apple Inc.,"[Kana Inagaki, Akito Tanaka, Ryan McMorrow, To..."
12,"Thursday, 1 December, 2022",https://www.ft.com/content/d691b35e-04e2-45eb-...,Elon Musk appears to reconcile with Apple afte...,Billionaire says he has resolved ‘misunderstan...,Apple Inc.,"[Hannah Murphy, Patrick McGee]"
15,"Tuesday, 29 November, 2022",https://www.ft.com/content/b0fc29dd-b21b-4e7a-...,Apple’s growth streak under threat as China’s ...,Worker revolt at Foxconn factory poses risk to...,Apple Inc.,[Patrick McGee]
