## Medium.com Article Scraper with API

This scraper uses a private API hosted on rapidapi.com to scrape medium.com articles that match the given category tags. It keeps only non-membership articles and applies a mean reading-time criterion to discard short articles that contain just a few sentences. It also removes personal information such as email addresses, and then saves the article content in JSON format.

### Importing Necessary Libraries

In [None]:
import http.client
import json
import requests
import os
import re

### API Configuration

In [None]:
# RapidAPI credentials are read from the environment so that no secret is
# stored in the notebook; a variable that is not set yields None.
(
    rapidapi_key0,
    rapidapi_key1,
    rapidapi_key2,
    rapidapi_key3,
    rapidapi_key4,
) = (os.getenv(f"rapidapi_key{key_index}") for key_index in range(5))

# Host name of the Medium API on RapidAPI
rapidapi_host = "medium2.p.rapidapi.com"

### Extracting Article IDs Through the API

In [None]:
related_tags = ['artificial-intelligence', 'machine-learning', 'deep-learning']

# Create an empty list to store article ID's (the same ID can appear under
# several tags, so this list may contain duplicates)
medium_article_ids = []

# The headers are the same for every tag, so build them once
headers = {
    'X-RapidAPI-Key': rapidapi_key0,
    'X-RapidAPI-Host': rapidapi_host
}

# Loop through the tags to get related ID's
for tag in related_tags:
    # The timeout keeps the cell from hanging forever on a stalled connection
    conn = http.client.HTTPSConnection(rapidapi_host, timeout=30)
    try:
        # Create the URL for the tag search
        url = f'/search/articles?query={tag}'
        conn.request("GET", url, headers=headers)
        res = conn.getresponse()
        data = res.read()
    finally:
        # Always release the socket, even when the request fails
        conn.close()

    # An error response carries no article list; report it and move on
    if res.status != 200:
        print(f"Failed to search articles for tag {tag}. Status code: {res.status}")
        continue

    data_json = json.loads(data.decode("utf-8"))

    # Attach the acquired ID's to the list
    articles_list = data_json.get('articles', [])
    medium_article_ids.extend(articles_list)

### Removing Duplicates and Counting Unique Article IDs

In [None]:
# Calculate the original count of article IDs
original_count = len(medium_article_ids)

# Create a set to store unique article IDs, effectively removing duplicates
unique_article_ids = set(medium_article_ids)

# Calculate the count of unique article IDs
unique_count = len(unique_article_ids)

# Calculate the count of dropped (duplicate) article IDs
dropped_count = original_count - unique_count

print(f"Original count: {original_count}")
print(f"Unique count: {unique_count}")
print(f"Dropped count: {dropped_count}")

# A set is not JSON serializable and has no stable order, so build a list of
# the unique IDs in first-seen order for saving
deduplicated_article_ids = list(dict.fromkeys(medium_article_ids))

# Save the deduplicated IDs as a JSON file
with open("medium_article_ids.json", "w", encoding="utf-8") as json_file:
    json.dump(deduplicated_article_ids, json_file, ensure_ascii=False, indent=4)


### Extracting Article Information (Reading Time and Member-only Status)

In [None]:
import json
import requests

# List of API keys; the article IDs are split across them in consecutive batches
api_keys = [rapidapi_key1, rapidapi_key2, rapidapi_key3]

# Number of articles to get for each API key
articles_per_key = 150

# Seconds to wait for the API before giving up on a single request
request_timeout = 30

# Drop repeated IDs (keeping first-seen order) so no request is spent on an
# article that has already been fetched
article_ids_to_fetch = list(dict.fromkeys(medium_article_ids))

# Create a starting point for the keys
start_index = 0

# Create a list to store article information for all API keys
all_article_information = []

# Give each API key its own batch of article IDs. Keys are reported by their
# position in api_keys so that the secret itself never reaches the output.
for key_number, api_key in enumerate(api_keys, start=1):
    # Create counters for each API key
    calls_processed = 0
    calls_failed = 0

    headers = {
        "X-RapidAPI-Key": api_key,
        "X-RapidAPI-Host": rapidapi_host
    }

    # Slicing past the end of the list yields an empty batch once the IDs run out
    batch = article_ids_to_fetch[start_index:start_index + articles_per_key]

    # Loop through the article IDs for the current API key
    for article_id in batch:
        url = f"https://medium2.p.rapidapi.com/article/{article_id}"

        try:
            response = requests.get(url, headers=headers, timeout=request_timeout)
        except requests.RequestException as error:
            # A network problem on one article should not abort the whole run
            print(f"Failed to fetch article {article_id} with API key #{key_number}: {error}")
            calls_failed += 1
            continue

        if response.status_code == 200:
            article_data = response.json()
            # "N/A" is stored when the response has no reading_time field
            reading_time = article_data.get("reading_time", "N/A")
            is_lock = article_data.get("is_locked")

            article_info = {
                "article_id": article_id,
                "reading_time": reading_time,
                "member-only": is_lock
            }

            all_article_information.append(article_info)  # Append to the list

            # Count the article first so the progress message includes it
            calls_processed += 1
            print(f"API Key #{key_number}: Processed {calls_processed} articles")
        else:
            print(f"Failed to fetch article {article_id} with API key #{key_number}. Status code: {response.status_code}")
            calls_failed += 1

    print(f"API Key #{key_number}: Processed {calls_processed} articles, Failed to fetch {calls_failed} articles")

    # Update the starting point for the next key
    start_index += articles_per_key

# Save the combined data as a single JSON file
with open("article_information.json", "w", encoding="utf-8") as json_file:
    json.dump(all_article_information, json_file, indent=4)

print(f"Total processed: {len(all_article_information)} articles")




### Filtering the Article Information

In [None]:
# Read the data in
with open("article_information.json", "r", encoding="utf-8") as json_file:
    medium_articles_info = json.load(json_file)


def _reading_time_minutes(article):
    """Return the article's reading time as a float, or None if it is missing or not numeric."""
    try:
        return float(article.get("reading_time"))
    except (TypeError, ValueError):
        # Covers a missing key (None) and non-numeric placeholders such as "N/A"
        return None


# Parse every reading time once; entries are aligned with medium_articles_info
reading_times = [_reading_time_minutes(article) for article in medium_articles_info]

# Articles without a usable reading time are left out of the mean
known_reading_times = [minutes for minutes in reading_times if minutes is not None]
total_reading_time = sum(known_reading_times)

# Calculate the mean reading time (0 when there is nothing to average)
mean_reading_time = total_reading_time / len(known_reading_times) if known_reading_times else 0

print(f"Mean Reading Time: {mean_reading_time} minutes")

# Keep only non-member articles whose reading time is strictly greater than
# the mean; articles with an unknown reading time are dropped
filtered_articles = [
    article
    for article, minutes in zip(medium_articles_info, reading_times)
    if minutes is not None
    and minutes > mean_reading_time
    and not article.get("member-only")
]

# Print filtered articles
print("Filtered Articles:")
for article in filtered_articles:
    print(f"Article ID: {article['article_id']}, Reading Time: {article['reading_time']} minutes, "
          f"Members-only: {article.get('member-only')}")

# Get the amount of articles after filtering
print(len(filtered_articles))

# Save the data as a JSON file (same encoding the next cell uses to read it)
with open("filtered_article_info.json", "w", encoding="utf-8") as json_file:
    json.dump(filtered_articles, json_file)


### Extracting Article Content

In [None]:
# Read the data from a JSON file containing filtered article information
with open("filtered_article_info.json", "r", encoding="utf-8") as json_file:
    filtered_articles = json.load(json_file)

# Initialize an empty list to store article content
article_contents = []

# The headers are the same for every request, so build them once
headers = {
    "X-RapidAPI-Key": rapidapi_key0,
    "X-RapidAPI-Host": rapidapi_host
}

# Matches any whitespace-delimited token containing "@" with text on both
# sides; used to strip email addresses from the article content
email_pattern = re.compile(r'\S+@\S+')

try:
    # Loop through the filtered articles
    for article_info in filtered_articles:
        article_id = article_info['article_id']

        # Construct the URL to fetch the article content
        url = f'https://medium2.p.rapidapi.com/article/{article_id}/content'

        # Make an HTTP GET request to retrieve the article content
        try:
            response = requests.get(url, headers=headers, timeout=30)
        except requests.RequestException as error:
            # A network problem on one article should not abort the whole run
            print(f"Failed to fetch article {article_id}: {error}")
            continue

        # Anything other than 200 (successful request) is reported and skipped
        if response.status_code != 200:
            print(f"Failed to fetch article {article_id}. Status code: {response.status_code}")
            continue

        # Parse the JSON response to extract the article content
        article_data = response.json()
        article_content = article_data.get('content')

        # re.sub needs a string; skip responses that carry no content
        if not isinstance(article_content, str):
            print(f"Article {article_id} returned no content; skipping.")
            continue

        # Remove email addresses from the article content
        article_content = email_pattern.sub('', article_content)

        # Create a dictionary with article ID and its content, and add it to the article_contents list
        article_content_all = {
            "article_id": article_id,
            "article_content": article_content
        }
        article_contents.append(article_content_all)
finally:
    # Runs even if the loop is interrupted, so whatever was fetched so far is
    # still saved as one valid JSON document

    # Create a dictionary to map each article ID to its content
    article_data_dict = {entry['article_id']: entry['article_content'] for entry in article_contents}

    # Save the dictionary as a JSON file
    with open("article_contents.json", "w", encoding="utf-8") as json_file:
        json.dump(article_data_dict, json_file, indent=2)


In [None]:
# Print the dictionary that maps each article ID to its article content
print(article_data_dict)