# Collection of Webscraping Projects - Olga Czechowicz

## Project 1

Creating a dictionary that holds information about random Wikipedia articles, using the Wikipedia API.

Furthermore, the final dictionary will hold, for each article, the total number of page views over the last 60 days.

In [None]:
## Import module requests
import requests as rq  ## noqa

## Wikipedia page ids of the articles whose page views are requested and summed below
page_ids = [
    19969580,
    39982842,
    25699035,
    52642931,
    53055349,
    24133565,
    1164662,
    40656459,
    12533026,
    47110862,
]

## API URL - the MediaWiki Action API endpoint of the English Wikipedia
BASE_URL = "https://en.wikipedia.org/w/api.php"

In [None]:
# The API takes several page ids as one string in which the ids are separated by "|".
page_ids_string = "|".join(str(p) for p in page_ids)

# Below, I defined the query string parameters to pass to the 'get' request:
payload = {
    "action": "query",  # the action is a query
    "prop": "pageviews",  # the property to extract for every page: its page view statistics
    "pageids": page_ids_string,  # the flattened string with all page ids
    "pvipmetric": "pageviews",  # the metric to report; defined explicitly for caution's and clarity's sake
    "pvipdays": 60,  # the number of days to report; defined explicitly for caution's and clarity's sake
    "format": "json",
}

# Now, we can send the 'get' request to the Wikipedia API:
response = rq.get(BASE_URL, params=payload)

print(response)
# <Response [200]> is printed when the request was successful.

# Below, the JSON body of the response is parsed into a dictionary:
data = response.json()

In [None]:
pages = data['query']['pages']

pages['19969580']['pageviews'].values()
# This returns the page view values of the article, which could in principle be summed directly.
# NOTE HOWEVER, some values are None, which I noticed when I tried to sum the page views with:
# sum(pages['19969580']['pageviews'].values())

# As a fix, I used a quick generator expression that only accounts for non-None values,
# by means of the 'if a is not None' condition.
sum(a for a in pages['19969580']['pageviews'].values() if a is not None)

# In this step I located the page view values for the article, and figured out a way of summing them up.


In [None]:
# Building the 'results' dictionary in one pass: every page id (converted to an integer) is mapped
# to the total of its page views over the requested days, leaving out the days whose value is None.

results = {
    int(page_id): sum(
        views for views in page_info['pageviews'].values() if views is not None
    )
    for page_id, page_info in pages.items()
}

print(results)
results
# Both the printed output and the console display show the finished dictionary.

## Project 2
Gathering 20 random Wikipedia articles that have the names "Olivia" and "Noah" in their title, and then counting the frequency of possessive pronouns ("his", "her" and "their") in these articles.


In [None]:
## Import module requests
import requests as rq  ## noqa

# The MediaWiki Action API endpoint of the English Wikipedia, used by every request in this project.
URL = "https://en.wikipedia.org/w/api.php"

In [None]:
# PART 1: Finding the 20 articles - 10 found with the prefix "Olivia" and 10 with the prefix "Noah".

names = ["Olivia", "Noah"]
data = {}
page_ids = []

for name in names:
    # One prefix search per name, limited to 10 results.
    payload = dict(
        action="query",
        list="prefixsearch",
        pssearch=name,
        pslimit=10,
        format="json",
    )

    response = rq.get(URL, params=payload)
    print(response)  # shows the status of the request, to confirm whether it was successful
    search_result = response.json()
    data[name] = search_result

    # Collecting the page id of every search hit for the download in the next part.
    page_ids.extend([page['pageid'] for page in search_result['query']['prefixsearch']])



In [None]:
# Exploring the structure step by step to find where the page ids are nested
# (each expression below goes one level deeper than the previous one):

data.keys()
# The keys are the names of interest - so let's try 'Olivia'
data['Olivia'].keys()
# Accessing the 'query' key
data['Olivia']['query'].keys()
# Accessing the 'prefixsearch' key
data['Olivia']['query']['prefixsearch']
# Evaluating the keys of the first element
data['Olivia']['query']['prefixsearch'][0].keys()
# Accessing the crucial value stored under the 'pageid' key
data['Olivia']['query']['prefixsearch'][0]['pageid']


In [None]:
# PART 2: Downloading the articles.

# All collected page ids are sent in a single request, separated by "|".
page_ids_string = "|".join(map(str, page_ids))

payload = dict(
    action="query",
    prop="cirrusdoc",
    pageids=page_ids_string,
    format="json",
)

response = rq.get(URL, params=payload)

# From here on 'data' holds the parsed body of this response.
data = response.json()

print(response)  # shows the status of the request

In [None]:
# Locating the text in the dictionary structure for later use:

# a more simplified version of our data, with one entry per page
pages = data["query"]["pages"]

# a list, where each element is the text of one article of interest
articles = []
for page_entry in pages.values():
    articles.append(page_entry["cirrusdoc"][0]["source"]["text"])


In [None]:
# PART 3: "Cleaning" words from text.

def extract_lower_words(string):
    """
    Split a string into a list of lowercase words that consist of letters only.

    The string is converted to lowercase and read character by character.
    Consecutive letters are collected into the current word; any non-letter
    character (space, digit, punctuation, ...) ends the current word.
    A word that is still being collected when the string ends is included as well.

    Args:
        string (string): Any string.

    Returns:
        extracted_words (list): A list, where each element is a word from
        the string in lowercase, without any non-letter characters attached.

    """
    extracted_words = []
    current_word = []  # the letters of the word that is currently being collected

    for char in string.lower():
        if char.isalpha():
            current_word.append(char)
        elif current_word:
            # A non-letter character closes the word collected so far.
            extracted_words.append("".join(current_word))
            current_word = []

    # Without this final step the last word would be lost whenever the string
    # ends with a letter, as no non-letter character follows it to close the word.
    if current_word:
        extracted_words.append("".join(current_word))

    return extracted_words

In [None]:
# PART 4: Counting possessive pronouns for each article in the list.

pronoun_count = []
pronouns = ["his", "her", "their"]

for article in articles:
    # Every article starts with a count of zero for each pronoun.
    article_pronouns = dict.fromkeys(pronouns, 0)

    for clean_word in extract_lower_words(article):
        if clean_word not in pronouns:
            continue
        article_pronouns[clean_word] += 1

    pronoun_count.append(article_pronouns)


In [None]:
print(pronoun_count)
pronoun_count

# As we can see, the result is a list with one dictionary per article; each dictionary holds
# the number of times 'his', 'her' and 'their' appeared in that article.

## Project 3
Gathering information on a list of news articles from the Parliament and the Guardian through webscraping.

In [None]:
# Importing the necessary libraries

import requests
import json
import time
from string import punctuation
from bs4 import BeautifulSoup
from datetime import datetime


In [None]:
def article_len(article_text):
    """
    Takes a string representing the text of an article, splits it on whitespace into
    separate elements, and counts only the actual words: elements that contain at least
    one letter or digit. Elements made up purely of punctuation or other symbols
    (e.g. ",", "...", "--" or a stand-alone dash) are not counted.

    Parameters:
    -----------
        article_text (str): a string representing the text of an article.

    Returns:
    --------
        len_article (int) : a numeric value representing the amount of words in the article.
    """

    # Checking every element for a letter or digit is deliberate: a test such as
    # 'element not in punctuation' is a substring test against the punctuation string,
    # which lets multi-character elements like "..." or "--" through as words.
    len_article = sum(
        1
        for element in article_text.split()
        if any(char.isalnum() for char in element)
    )

    return len_article


In [None]:
def date_convert(date_string):
    """
    Converts the date of publishing of an article from 'the Guardian' or 'the Parliament'
    into the day.month.year format "%d.%m.%Y" (e.g. 01.01.2022).

    The accepted input formats are "%Y-%m-%dT%H:%M:%S.%fZ" ('the Guardian') and
    "%d %b %Y" ('the Parliament'). They are tried in this order and the first one
    that matches the input is used.

    Parameters:
    -----------
        date_string (str): a string representing the date of publishing of an article in
        one of the two accepted formats.

    Returns:
    --------
        (str) : a string representing the date of publishing of the article in the "%d.%m.%Y" format,
        or None when the input matches neither of the accepted formats.
    """

    known_formats = ('%Y-%m-%dT%H:%M:%S.%fZ', '%d %b %Y')

    for known_format in known_formats:
        try:
            parsed_date = datetime.strptime(date_string, known_format)
        except ValueError:
            # The input does not follow this format, so the next one is tried.
            continue
        return parsed_date.strftime("%d.%m.%Y")

    # None of the accepted formats matched the input.
    return None

In [None]:
# Reading the jsonl file into the colab environment.

# Every non-empty line of the file holds one JSON document. Blank lines are skipped, as
# json.loads() raises an error for them, and the encoding is set explicitly so that reading
# does not depend on the default encoding of the machine.
with open("links.jsonl", "r", encoding="utf-8") as file:
    links = [json.loads(line) for line in file if line.strip()]

In [None]:
# Guardian scraper

def guardian_scraper(link):
    """
    Takes a link of an article from 'theguardian.com', and then scrapes the web-page from the link. Then, creates
    a dictionary that holds specific scraped data from the link: title of the article, author name,
    date of publishing, the summary/lead of the article and the entire text of the article itself.

    Parameters:
    -----------
        link (str): a string representing the link to a specific 'the Guardian' article.

    Returns:
    --------
        article_info (dict) : a dictionary containing scraped data about the provided link of an article,
                              stored under the keys 'title', 'author', 'date', 'lead' and 'content'.
                              The value of 'author' is None when no author could be found on the page.
                              If the link is not from theguardian.com, a message is printed and
                              None is returned instead of a dictionary.
    """

    if "theguardian.com" not in link.lower():  # Checks if provided link is from theguardian.com
        # Message that is printed to the user if article is not congruent with the function.
        print("Article is not from theguardian.com. Function cannot run.")
        return None

    # Requesting the html data from a specific link:
    response = requests.get(link)
    html = BeautifulSoup(response.content, 'html.parser')

    # Creating an empty dictionary that will hold the html sourced
    # data about each 'the Guardian' article.
    article_info = dict()

    # Each key-value pair is established separately for clarity.

    article_info['title'] = html.select_one('div[style*="--grid-area:headline;"] h1').text.strip()

    # Accounting for the possibility of no author existing: select_one() gives None when
    # nothing matches the selector, so the element is checked before its text is read.
    author_element = html.select_one("div.dcr-1cfpnlw")
    author_name = author_element.text.strip() if author_element is not None else ""
    article_info['author'] = author_name if author_name else None

    # I decided to source information about the date of publishing through this meta tag
    # as the information sourced from it was exactly the same as the datetime in the string
    # below the header, BUT was more convenient when trying to convert into the date format
    # that was shown in the example dictionary.
    article_info['date'] = date_convert(html.select_one('meta[property="article:published_time"]').get('content'))

    # The summary/lead of article
    article_info['lead'] = html.select_one('div[style*="--grid-area:standfirst;"] p').text.strip()

    # The actual text of article - which is a joined list of each scraped paragraph using the html selector.
    # Furthermore, not only was the text stripped using strip(), but I also used replace() to delete all \n newlines
    # which were unfortunately attached to some words, and ruined the readability of the text.
    article_info['content'] = " ".join([item.text.strip().replace('\n', '') for item in html.select('div[id="maincontent"] p')])

    return article_info


In [None]:
# Parliament scraper

def parliament_scraper(link):
    """
    Takes a link of an article from 'theparliamentmagazine.eu', and then scrapes the web-page from the link. Then, creates
    a dictionary that holds specific scraped data from the link: title of the article, author name,
    date of publishing, the summary/lead of the article and the entire text of the article itself.

    Parameters:
    -----------
        link (str): a string representing the link to a specific 'the Parliament' article.

    Returns:
    --------
        article_info (dict) : a dictionary containing scraped data about the provided link of an article,
                              stored under the keys 'title', 'author', 'date', 'lead' and 'content'.
                              The value of 'author' is None when no author could be found on the page.
                              If the link is not from theparliamentmagazine.eu, a message is printed and
                              None is returned instead of a dictionary.
    """

    if "theparliamentmagazine.eu" not in link.lower():  # Checks if provided link is from theparliamentmagazine.eu
        # Message that is printed to the user if article is not congruent with the function.
        print("Article is not from theparliamentmagazine.eu. Function cannot run.")
        return None

    # Requesting the html data from a specific link:
    response = requests.get(link)
    html = BeautifulSoup(response.content, 'html.parser')

    # Creating an empty dictionary that will hold the html sourced
    # data about each 'the Parliament' article.
    article_info = dict()

    # The title of the article - the div tag is provided for context and readability,
    # especially if a reader/myself would want to cross check this in the future with the html structure.
    article_info['title'] = html.select_one("div.av-title h1").text.strip()

    # The author name + accounting for the possibility of no author: select_one() gives None when
    # nothing matches the selector, so the element is checked before its text is read.
    author_element = html.select_one("div.av-authInfo a")
    author_name = author_element.text.strip() if author_element is not None else ""
    article_info['author'] = author_name if author_name else None

    # The date - this time sourced from the headline string, not meta data, as there were two types of dates here:
    # one for publishing and other for last modification - for clarity I simply went with the field where there was only one date provided.
    date_element = html.select_one("p.av-date").text.strip()
    article_info['date'] = date_convert(date_element)

    # The summary/lead of the article:
    article_info['lead'] = html.select_one("div.av-title div").text.strip()

    # The actual text of the article,
    # again newlines were accounted for.
    # Only paragraphs which are "direct children" of this specific div tag are included
    # (otherwise paragraphs not exclusive to the article like "Read more on X" were included),
    # and paragraphs that contain a "strong" tag are left out - these were also non-article text related paragraphs.
    article_info['content'] = " ".join([paragraph.text.strip().replace('\n', '') for paragraph in html.select("div.av-main > p") if not paragraph.find("strong")])

    return article_info


In [None]:
articles_scraped = []

for link in links:
    if "theguardian.com" in link['url'].lower():  # For 'the Guardian' articles
        link_info = guardian_scraper(link['url'])

    else:  # For all other links, which are expected to be 'the Parliament' articles
        link_info = parliament_scraper(link['url'])

    # The scrapers print a message and return None for a link they do not support.
    # Such a link is skipped, as adding keys to None below would raise a TypeError.
    if link_info is None:
        continue

    # Now adding the data from the original jsonl file following the provided example:
    link_info['source'] = link['source_name']
    link_info['fb'] = {
        'likes': link['fb']['likes'],
        'shares': link['fb']['shares'],
        'comments': link['fb']['comments'],
    }
    link_info['length'] = article_len(link_info['content'])

    articles_scraped.append(link_info)
    time.sleep(1)  # To not overwhelm the websites with requests.

In [None]:
# Writing the list object into a single jsonl file

with open("scraped_article_info.jsonl", "w", encoding="utf-8") as file:
    # One JSON document per line; ensure_ascii=False keeps non-ASCII characters as they are.
    file.writelines(
        json.dumps(article, ensure_ascii=False) + "\n" for article in articles_scraped
    )

## Project 4
Gathering 100 submissions from the todayilearned subreddit, which include the word "science".

Then saving the 10 submissions with the biggest number of comments, along with some information about them, into a JSON Lines (.jsonl) file.

In [None]:
# Pip installing and importing the praw library necessary later.

!pip install praw
import praw

# Assigning the secret user related environmental variables to named objects.

from google.colab import userdata

# Every credential is read from userdata under the key that carries the same name as the variable.
(
    client_id,
    client_secret,
    password,
    user_agent,
    username,
) = (
    userdata.get(secret_name)
    for secret_name in ("client_id", "client_secret", "password", "user_agent", "username")
)

In [None]:
# Requesting specific submissions from r/todayilearned

# The settings for the Reddit client, built from the credentials assigned above.
reddit_settings = dict(
    client_id=client_id,
    client_secret=client_secret,
    password=password,
    user_agent=user_agent,
    username=username,
    check_for_async=False,
)
reddit = praw.Reddit(**reddit_settings)

# Searching r/todayilearned for "science", sorted by comments and limited to 100 results.
search_results = reddit.subreddit("todayilearned").search(query="science", sort="comments", limit=100)
submissions = list(search_results)


In [None]:
# A function to convert UTC into human readable format as provided in class.

from datetime import datetime

def convert_date(date_float: float) -> str:
    """
    Takes a date in epoch time format and converts it into a string in human-readable date format.
    The date is expressed in UTC, so the result is the same on every machine.

    Parameters:
    -----------
        date_float (float): a float representing a date in epoch time format.

    Returns:
    --------
        (str) : a string representing the date in UTC in the "%d-%m-%Y %H:%M:%S" format.
    """
    # Imported here so that the function only relies on 'datetime' being imported above.
    from datetime import timezone

    # Without the tz argument fromtimestamp() converts to the local time of the machine,
    # which would not fit a value that is stored under the name 'created_utc'.
    return datetime.fromtimestamp(date_float, tz=timezone.utc).strftime("%d-%m-%Y %H:%M:%S")

In [None]:
# Creating a list of dictionaries for the 10 items with highest no of comments
# which will be later uploaded into a json.

# Sorting explicitly by the number of comments, so that the selection of the 10 most commented
# submissions does not depend on the order in which the search returned them.
top_submissions = sorted(submissions, key=lambda item: item.num_comments, reverse=True)[:10]

output = []
for item in top_submissions:
    submission_info = {
        # A submission may have no author (item.author is None), e.g. for a deleted account;
        # reading .name from it directly would raise an AttributeError.
        'author_name': item.author.name if item.author is not None else None,
        'created_utc': convert_date(item.created_utc),  # for the date in readable format - convert_date()
        'title': item.title,
        'num_comments': item.num_comments,
        'url': item.url,
    }
    # Only the fields that actually have a value are kept.
    output.append({key: value for key, value in submission_info.items() if value is not None})

In [None]:
# Writing the list object into a single jsonl file

with open("reddit_posts_info.jsonl", "w", encoding="utf-8") as file:
    # One JSON document per line; ensure_ascii=False keeps non-ASCII characters as they are.
    file.writelines(
        json.dumps(post_info, ensure_ascii=False) + "\n" for post_info in output
    )