In [29]:
import numpy as np
import pandas as pd
import requests
import math
from bs4 import BeautifulSoup
from time import sleep
from ipynb.fs.defs.job_postings_scraper import BaseScraper

Scrape details:

- how_long_ago: how long ago the post was published, relative to the time of scraping (3 May 2023)
- text: the body text of the post

In [60]:
class LinkedInPostScraper(BaseScraper):
    """Scrape the relative age and body text of posts from a LinkedIn feed.

    The feed is requested one page at a time by appending a
    ``paginationStart`` offset to ``self.raw_url``.  Every post found is
    appended to ``self.data`` as a dict with the keys ``how_long_ago`` and
    ``text``; a field is ``None`` when its element is absent from the post.

    NOTE(review): ``self.raw_url`` and ``self.data`` are not assigned in this
    class -- presumably ``BaseScraper.__init__`` sets them up from the
    forwarded arguments; confirm against ``job_postings_scraper``.
    """

    # Step between successive ``paginationStart`` offsets.
    _PAGE_SIZE = 10

    def __init__(self, n_posts, *args, **kwargs):
        """Store the post budget and forward everything else to BaseScraper.

        Args:
            n_posts (int): Upper bound on the pagination offsets requested;
                pages start at 0, 10, 20, ... while the offset is below it.
            *args, **kwargs: Passed unchanged to ``BaseScraper.__init__``.
        """
        super().__init__(*args, **kwargs)
        self.n_posts = n_posts

    @staticmethod
    def _stripped_text(post, tag, css_class):
        """Return the stripped text of the first matching child, or None.

        ``find`` yields ``None`` when nothing matches, so that case is checked
        explicitly rather than hidden behind a catch-all ``except`` that would
        also swallow unrelated errors and keyboard interrupts.
        """
        element = post.find(tag, {"class": css_class})
        if element is None:
            return None
        return element.text.strip()

    def scrape(self, sleep_duration=15, verbose=True):
        """Fetch every page of the feed and collect its posts in ``self.data``.

        Args:
            sleep_duration: Seconds to wait between two page requests.
            verbose: When true, print progress for each post and each pause.
        """
        # Starting offsets of the pages to request: 0, 10, 20, ... < n_posts.
        page_starts = range(0, self.n_posts, self._PAGE_SIZE)
        n_pages = len(page_starts)

        # The raw URL already carries a query string, so the offset is
        # appended as an additional parameter.
        url_scrape = "{}&paginationStart={}"

        for page_number, start in enumerate(page_starts, start=1):
            # NOTE(review): the HTTP status is not checked, so a blocked or
            # failed request simply contributes zero posts for this page.
            res = requests.get(url_scrape.format(self.raw_url, start))
            soup = BeautifulSoup(res.text, 'html.parser')

            # Posts are rendered as "li" tags with class "mb-1".
            posts_on_this_page = soup.find_all("li", {"class": "mb-1"})
            len_page = len(posts_on_this_page)

            for post_number, post in enumerate(posts_on_this_page, start=1):
                if verbose:
                    print(f"Scraping post {post_number} of {len_page} on page {page_number}...")

                # A fresh dict per post, so records never share state.
                self.data.append({
                    "how_long_ago": self._stripped_text(post, "time", "flex-none"),
                    "text": self._stripped_text(post, "p", "attributed-text-segment-list__content"),
                })

            # Pause between requests, but not after the final page.
            if page_number != n_pages:
                if verbose:
                    print(f"Sleeping for {sleep_duration} seconds...")
                sleep(sleep_duration)

        if verbose:
            print('Done scraping all data.')

In [43]:
def check_null_rows(df):
    """Return the rows of ``df`` that contain at least one missing value."""
    has_missing = df.isna().any(axis="columns")
    return df.loc[has_missing]

## Scraping Commonwealth Bank posts

In [35]:
# Scraper for the Commonwealth Bank feed: the first argument (100) is n_posts,
# the second is forwarded to BaseScraper and serves as the URL to paginate.
# NOTE(review): the paginationToken in the URL looks session-specific --
# confirm it is still accepted before re-running this cell.
cba_posts = LinkedInPostScraper(
    100,
    'https://au.linkedin.com/organization-guest/api/feedUpdates/2848?paginationToken=0-1683168175429-16b40a8931ec77ea61c48afa9ed1d75c'
)

# Request the feed page by page, using scrape()'s default pause and verbosity.
cba_posts.scrape()

# NOTE(review): save_data is inherited from BaseScraper; presumably it writes
# the CSV and returns a DataFrame (the result is later passed to pd.concat) --
# confirm.
cba_posts_df = cba_posts.save_data('data/posts_linkedin/cba_posts_linkedin.csv')

Scraping post 1 of 10 on page 1...
Scraping post 2 of 10 on page 1...
Scraping post 3 of 10 on page 1...
Scraping post 4 of 10 on page 1...
Scraping post 5 of 10 on page 1...
Scraping post 6 of 10 on page 1...
Scraping post 7 of 10 on page 1...
Scraping post 8 of 10 on page 1...
Scraping post 9 of 10 on page 1...
Scraping post 10 of 10 on page 1...
Sleeping for 15 seconds...
Scraping post 1 of 10 on page 11...
Scraping post 2 of 10 on page 11...
Scraping post 3 of 10 on page 11...
Scraping post 4 of 10 on page 11...
Scraping post 5 of 10 on page 11...
Scraping post 6 of 10 on page 11...
Scraping post 7 of 10 on page 11...
Scraping post 8 of 10 on page 11...
Scraping post 9 of 10 on page 11...
Scraping post 10 of 10 on page 11...
Sleeping for 15 seconds...
Scraping post 1 of 10 on page 21...
Scraping post 2 of 10 on page 21...
Scraping post 3 of 10 on page 21...
Scraping post 4 of 10 on page 21...
Scraping post 5 of 10 on page 21...
Scraping post 6 of 10 on page 21...
Scraping post 7 of

## Scraping Woolworths Group posts

In [39]:
# Scraper for the Woolworths Group feed: the first argument (70) is n_posts,
# the second is forwarded to BaseScraper and serves as the URL to paginate.
# NOTE(review): the paginationToken in the URL looks session-specific --
# confirm it is still accepted before re-running this cell.
wow_posts = LinkedInPostScraper(
    70,
    'https://au.linkedin.com/organization-guest/api/feedUpdates/295257?paginationToken=0-1683170025354-f5fbb4ba0f710d993a13ccb6a38856cc'
)

# Request the feed page by page, using scrape()'s default pause and verbosity.
wow_posts.scrape()

# NOTE(review): save_data is inherited from BaseScraper; presumably it writes
# the CSV and returns a DataFrame (the result is later passed to pd.concat) --
# confirm.
wow_posts_df = wow_posts.save_data('data/posts_linkedin/wow_posts_linkedin.csv')

Scraping post 1 of 10 on page 1...
Scraping post 2 of 10 on page 1...
Scraping post 3 of 10 on page 1...
Scraping post 4 of 10 on page 1...
Scraping post 5 of 10 on page 1...
Scraping post 6 of 10 on page 1...
Scraping post 7 of 10 on page 1...
Scraping post 8 of 10 on page 1...
Scraping post 9 of 10 on page 1...
Scraping post 10 of 10 on page 1...
Sleeping for 15 seconds...
Scraping post 1 of 10 on page 11...
Scraping post 2 of 10 on page 11...
Scraping post 3 of 10 on page 11...
Scraping post 4 of 10 on page 11...
Scraping post 5 of 10 on page 11...
Scraping post 6 of 10 on page 11...
Scraping post 7 of 10 on page 11...
Scraping post 8 of 10 on page 11...
Scraping post 9 of 10 on page 11...
Scraping post 10 of 10 on page 11...
Sleeping for 15 seconds...
Scraping post 1 of 10 on page 21...
Scraping post 2 of 10 on page 21...
Scraping post 3 of 10 on page 21...
Scraping post 4 of 10 on page 21...
Scraping post 5 of 10 on page 21...
Scraping post 6 of 10 on page 21...
Scraping post 7 of

## Scraping Telstra posts

In [54]:
# Scraper for the Telstra feed: the first argument (70) is n_posts, the second
# is forwarded to BaseScraper and serves as the URL to paginate.
# NOTE(review): the paginationToken in the URL looks session-specific --
# confirm it is still accepted before re-running this cell.
tls_posts = LinkedInPostScraper(
    70,
    'https://au.linkedin.com/organization-guest/api/feedUpdates/1636?paginationToken=0-1683170747332-d2cb91dc377a60f24f98652aa1b48db2'
)

# Request the feed page by page, using scrape()'s default pause and verbosity.
tls_posts.scrape()

# NOTE(review): save_data is inherited from BaseScraper; presumably it writes
# the CSV and returns a DataFrame (the result is later passed to pd.concat) --
# confirm.
tls_posts_df = tls_posts.save_data('data/posts_linkedin/tls_posts_linkedin.csv')

Scraping post 1 of 10 on page 1...
Scraping post 2 of 10 on page 1...
Scraping post 3 of 10 on page 1...
Scraping post 4 of 10 on page 1...
Scraping post 5 of 10 on page 1...
Scraping post 6 of 10 on page 1...
Scraping post 7 of 10 on page 1...
Scraping post 8 of 10 on page 1...
Scraping post 9 of 10 on page 1...
Scraping post 10 of 10 on page 1...
Sleeping for 15 seconds...
Scraping post 1 of 10 on page 2...
Scraping post 2 of 10 on page 2...
Scraping post 3 of 10 on page 2...
Scraping post 4 of 10 on page 2...
Scraping post 5 of 10 on page 2...
Scraping post 6 of 10 on page 2...
Scraping post 7 of 10 on page 2...
Scraping post 8 of 10 on page 2...
Scraping post 9 of 10 on page 2...
Scraping post 10 of 10 on page 2...
Sleeping for 15 seconds...
Scraping post 1 of 10 on page 3...
Scraping post 2 of 10 on page 3...
Scraping post 3 of 10 on page 3...
Scraping post 4 of 10 on page 3...
Scraping post 5 of 10 on page 3...
Scraping post 6 of 10 on page 3...
Scraping post 7 of 10 on page 3...

## Scraping BHP posts

In [59]:
# Scraper for the BHP feed: the first argument (70) is n_posts, the second is
# forwarded to BaseScraper and serves as the URL to paginate.
# NOTE(review): the paginationToken in the URL looks session-specific --
# confirm it is still accepted before re-running this cell.
bhp_posts = LinkedInPostScraper(
    70,
    'https://au.linkedin.com/organization-guest/api/feedUpdates/4509?paginationToken=0-1683171487220-230ee5aec2bb676e90406661fbf7f75c'
)

# Request the feed page by page, using scrape()'s default pause and verbosity.
bhp_posts.scrape()

# NOTE(review): save_data is inherited from BaseScraper; presumably it writes
# the CSV and returns a DataFrame (the result is later passed to pd.concat) --
# confirm.
bhp_posts_df = bhp_posts.save_data('data/posts_linkedin/bhp_posts_linkedin.csv')

Scraping post 1 of 10 on page 1...
Scraping post 2 of 10 on page 1...
Scraping post 3 of 10 on page 1...
Scraping post 4 of 10 on page 1...
Scraping post 5 of 10 on page 1...
Scraping post 6 of 10 on page 1...
Scraping post 7 of 10 on page 1...
Scraping post 8 of 10 on page 1...
Scraping post 9 of 10 on page 1...
Scraping post 10 of 10 on page 1...
Sleeping for 15 seconds...
Scraping post 1 of 10 on page 2...
Scraping post 2 of 10 on page 2...
Scraping post 3 of 10 on page 2...
Scraping post 4 of 10 on page 2...
Scraping post 5 of 10 on page 2...
Scraping post 6 of 10 on page 2...
Scraping post 7 of 10 on page 2...
Scraping post 8 of 10 on page 2...
Scraping post 9 of 10 on page 2...
Scraping post 10 of 10 on page 2...
Sleeping for 15 seconds...
Scraping post 1 of 10 on page 3...
Scraping post 2 of 10 on page 3...
Scraping post 3 of 10 on page 3...
Scraping post 4 of 10 on page 3...
Scraping post 5 of 10 on page 3...
Scraping post 6 of 10 on page 3...
Scraping post 7 of 10 on page 3...

## Scraping CSL posts

In [62]:
# Scraper for the CSL feed: the first argument (70) is n_posts, the second is
# forwarded to BaseScraper and serves as the URL to paginate.
# NOTE(review): the paginationToken in the URL looks session-specific --
# confirm it is still accepted before re-running this cell.
csl_posts = LinkedInPostScraper(
    70,
    'https://au.linkedin.com/organization-guest/api/feedUpdates/7384?paginationToken=0-1683171857393-645f77d2a456e4ab39beb71977fe2245'
)

# Request the feed page by page, using scrape()'s default pause and verbosity.
csl_posts.scrape()

# NOTE(review): save_data is inherited from BaseScraper; presumably it writes
# the CSV and returns a DataFrame (the result is later passed to pd.concat) --
# confirm.
csl_posts_df = csl_posts.save_data('data/posts_linkedin/csl_posts_linkedin.csv')

Scraping post 1 of 10 on page 1...
Scraping post 2 of 10 on page 1...
Scraping post 3 of 10 on page 1...
Scraping post 4 of 10 on page 1...
Scraping post 5 of 10 on page 1...
Scraping post 6 of 10 on page 1...
Scraping post 7 of 10 on page 1...
Scraping post 8 of 10 on page 1...
Scraping post 9 of 10 on page 1...
Scraping post 10 of 10 on page 1...
Sleeping for 15 seconds...
Scraping post 1 of 10 on page 2...
Scraping post 2 of 10 on page 2...
Scraping post 3 of 10 on page 2...
Scraping post 4 of 10 on page 2...
Scraping post 5 of 10 on page 2...
Scraping post 6 of 10 on page 2...
Scraping post 7 of 10 on page 2...
Scraping post 8 of 10 on page 2...
Scraping post 9 of 10 on page 2...
Scraping post 10 of 10 on page 2...
Sleeping for 15 seconds...
Scraping post 1 of 10 on page 3...
Scraping post 2 of 10 on page 3...
Scraping post 3 of 10 on page 3...
Scraping post 4 of 10 on page 3...
Scraping post 5 of 10 on page 3...
Scraping post 6 of 10 on page 3...
Scraping post 7 of 10 on page 3...

# All data

In [64]:
# Per-company frames and their display names, listed in matching order.
all_data = [cba_posts_df, wow_posts_df, tls_posts_df, bhp_posts_df, csl_posts_df]

names = ['Commonwealth Bank', 'Woolworths Group', 'Telstra', 'BHP', 'CSL']

assert len(all_data) == len(names), "each frame needs exactly one company name"

# Tag every frame, in place, with its company as the first column.  The guard
# keeps this cell safe to re-run: DataFrame.insert raises ValueError when the
# column is already present.
for frame, company in zip(all_data, names):
    if 'company' not in frame.columns:
        frame.insert(0, 'company', company)
    

In [68]:
# Stack the per-company frames row-wise and renumber the rows from zero.
all_data_df = pd.concat(objs=all_data, axis=0, ignore_index=True)

# Display the rows in which at least one field is missing.
check_null_rows(all_data_df)

Unnamed: 0,company,how_long_ago,text
16,Commonwealth Bank,,
22,Commonwealth Bank,,
27,Commonwealth Bank,,
30,Commonwealth Bank,,
34,Commonwealth Bank,,
39,Commonwealth Bank,,
50,Commonwealth Bank,,
55,Commonwealth Bank,,
145,Telstra,,
147,Telstra,,


In [69]:
# Remove, in place, every row that has any missing field, then show the
# resulting (rows, columns) of the combined table.
all_data_df.dropna(axis=0, how='any', inplace=True)
all_data_df.shape

(333, 3)

In [70]:
# Persist the cleaned, combined posts as UTF-8 CSV without the row index.
all_data_df.to_csv(
    path_or_buf='data/posts_linkedin/all_posts_linkedin.csv',
    index=False,
    encoding='utf-8',
)