In [1]:
import requests
import bs4 as bs
import pandas as pd
import black

# URL to scrape
url = "https://www.geeksforgeeks.org/python-exercises-practice-questions-and-solutions/"

In [2]:
def get_soup(url):
    with requests.get(url) as r:
        soup = bs.BeautifulSoup(r.text, features='html.parser')
    return soup

In [3]:
def get_titles_and_links(url):
    soup = get_soup(url)
    tags = soup.find_all("h2")
    tags = tags[1:]
    
    title = []
    link = []
    
    for tag in tags:
        title.append(tag.text)
        link.append(tag.a["href"])
    
    df = pd.DataFrame({"Title": title, "Link": link})
    return df

In [4]:
def get_exercise(link):
    soup = get_soup(link)
    links = soup.find_all("div", class_="text")
    
    des = []
    ref = []
    
    for i in links[0].find_all("a"):
        des.append(i.text)
        ref.append(i["href"])
    
    df = pd.DataFrame({"Description": des, "Reference": ref})
    return df

In [5]:
def input_output_formatter(examples:list[str]):
    """
    format example of the code adn return sample of inputs & outputs\n
    return inputs, outputs
    """
    inputs = []
    outputs = []
    for example in examples:
        if example == '':
            continue
        if example.lower().startswith('input'):
            inputs.append(example)
        elif example.lower().startswith('output'):
            outputs.append(example)
        else:
            outputs.append(example)
    return inputs, outputs

In [6]:
def ultimate_line_formatter(code_soup:bs.BeautifulSoup):
    code_soup_lines = code_soup.find_all('div', 'line')
    code_lines = [x.text.replace('\xa0', ' ') for x in code_soup_lines]
    return '\n'.join(code_lines)   

In [7]:
def get_code(link: str):
    try:
        soup = get_soup(link)
        all_example = [x for x in soup.find_all('pre') if x.text != '']
        example = all_example[0] if all_example else None
        all_code = soup.find_all('div', 'code-container')
    except AttributeError:
        return [], {'inputs': [], 'outputs': []}

    formatted_code_str = []
    for code in all_code:
        try:
            out = ultimate_line_formatter(code)
            formatted_code = black.format_str(out, mode=black.FileMode()).strip()
        except Exception:
            formatted_code = code.text.strip()
        formatted_code_str.append(formatted_code)

    formatted_test_case = {'inputs': [], 'outputs': []}
    if example:
        inputs, outputs = input_output_formatter(example.text.split('\n'))
        for idx, _output in enumerate(outputs):
            formatted_test_case['inputs'].append(inputs[idx] if idx < len(inputs) else "From code")
            formatted_test_case['outputs'].append(_output)

    return formatted_code_str, formatted_test_case

In [8]:
def create_df(exercise_df: str, idx: int):
    link = exercise_df["Reference"][idx]
    code, test_case = get_code(link)
    data = []
    for c in code:
        data.append({"Code": c, "Test_Case": test_case["inputs"][0] + "\n" + test_case["outputs"][0]})

    df = pd.DataFrame(data)
    df["Description"] = exercise_df["Description"][idx]
    df["Link"] = exercise_df["Reference"][idx]
    df = df[["Description", "Link", "Code", "Test_Case"]]
    return df

In [9]:
Titles = get_titles_and_links(url)
exercise_data = []
for idx in range(len(Titles)):
    try:
        exercise_python = get_exercise(Titles["Link"][idx])
        for idx2 in range(len(exercise_python)):
            try:
                df = create_df(exercise_python, idx2)
                exercise_data.append(df)
                print(f"Done: {idx2+1}/{len(exercise_python)}")
            except Exception as e:
                print(f"Error processing exercise {idx2+1}: {str(e)}")
    except Exception as e:
        print(f"Error processing page {idx+1}: {str(e)}")

# Concatenate all exercise dataframes into a single dataframe
combined_df = pd.concat(exercise_data, ignore_index=True)

# Save the combined dataframe to a CSV file
combined_df.to_csv("Python_Exercises.csv", index=False)

Done: 1/75
Done: 2/75
Done: 3/75
Done: 4/75
Done: 5/75
Done: 6/75
Done: 7/75
Done: 8/75
Done: 9/75
Done: 10/75
Done: 11/75
Done: 12/75
Done: 13/75
Done: 14/75
Done: 15/75
Done: 16/75
Done: 17/75
Done: 18/75
Done: 19/75
Done: 20/75
Done: 21/75
Done: 22/75
Done: 23/75
Done: 24/75
Done: 25/75
Done: 26/75
Done: 27/75
Done: 28/75
Done: 29/75
Done: 30/75
Error processing exercise 31: "['Code', 'Test_Case'] not in index"
Done: 32/75
Done: 33/75
Done: 34/75
Done: 35/75
Done: 36/75
Done: 37/75
Done: 38/75
Error processing exercise 39: "['Code', 'Test_Case'] not in index"
Done: 40/75
Done: 41/75
Done: 42/75
Done: 43/75
Done: 44/75
Done: 45/75
Done: 46/75
Done: 47/75
Done: 48/75
Done: 49/75
Done: 50/75
Done: 51/75
Done: 52/75
Error processing exercise 53: list index out of range
Error processing exercise 54: "['Code', 'Test_Case'] not in index"
Done: 55/75
Done: 56/75
Done: 57/75
Done: 58/75
Done: 59/75
Done: 60/75
Done: 61/75
Done: 62/75
Done: 63/75
Done: 64/75
Done: 65/75
Done: 66/75
Done: 67/7