In [3]:
import json
import os
import random
import re
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm

Article URLs

In [None]:
# Configuration: which site to crawl and where the collected article URLs are stored.
base_url = "https://tr.khanacademy.org"
start_url = "https://tr.khanacademy.org/science/biology"
output_filename = "data/khanacademy_bio_articles_urls.json"

# Headless Chrome with a fixed window size.
options = Options()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument("--window-size=1920,1080")
options.add_argument("--log-level=3")

driver = webdriver.Chrome(options=options)

# Result layout: {"class_<n>": [{"unit_number": int, "non_video_urls": list or None}, ...]}
data = {}

# Article pages are recognised by the "/a/" segment in their path.
article_pattern = re.compile(r"/a/")

try:
    for class_num in range(1, 2):
        print(f"Accessing Main Page: {start_url}")
        driver.get(start_url)

        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
        time.sleep(random.randint(2, 4))

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Collect every distinct link below /science/biology/ found on the course page.
        # NOTE(review): any such link is treated as a "unit"; confirm the course page
        # does not also expose lesson/content links that would be crawled as units.
        unit_links = []
        raw_links = soup.find_all('a', href=True)

        for a in raw_links:
            href = a['href']
            if "/science/biology/" in href and href != "/science/biology" and "javascript" not in href:
                full_unit_url = base_url + href if href.startswith("/") else href
                if full_unit_url not in unit_links:
                    unit_links.append(full_unit_url)

        # One entry per unit page, in crawl order.
        class_data = []

        for unit_number, unit_url in enumerate(tqdm(unit_links, desc=f"Scraping Class {class_num}"), start=1):
            driver.get(unit_url)
            time.sleep(random.randint(1, 3))  # Sleep to be polite and let JS load

            unit_soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Distinct article links on this unit page, in document order.
            found_urls = []
            all_anchors = unit_soup.find_all("a", href=True)

            for anchor in all_anchors:
                href = anchor['href']
                if article_pattern.search(href):
                    full_url = base_url + href if href.startswith("/") else href
                    if full_url not in found_urls:
                        found_urls.append(full_url)

            # Units without any article link are recorded with None rather than an empty list.
            class_data.append({
                "unit_number": unit_number,
                "non_video_urls": found_urls or None
            })

        data[f"class_{class_num}"] = class_data

finally:
    driver.quit()

# Save to JSON. output_filename already contains the "data/" directory, so it is used
# as-is; prefixing it again would write to data/data/... where the article-scraping
# cell would not find it.
os.makedirs(os.path.dirname(output_filename) or ".", exist_ok=True)
with open(output_filename, 'w', encoding='utf-8') as json_file:
    json.dump(data, json_file, ensure_ascii=False, indent=4)

print(f"Data has been successfully saved to {output_filename}")

Video URLs

In [None]:
# Configuration: which site to crawl and where the collected video URLs are stored.
base_url = "https://tr.khanacademy.org"
start_url = "https://tr.khanacademy.org/science/biology"
output_filename = "data/khanacademy_bio_vids_urls.json"

# Headless Chrome with a fixed window size.
options = Options()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument("--window-size=1920,1080")
options.add_argument("--log-level=3")

driver = webdriver.Chrome(options=options)

# Result layout:
# {"class_<n>": [{"unit_number": int, "video_number": int, "video_urls": list or None}, ...]}
data = {}

try:
    for class_num in range(1, 2):
        print(f"Accessing Main Page: {start_url}")
        driver.get(start_url)

        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
        time.sleep(random.randint(2, 4))

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Collect every distinct link below /science/biology/ found on the course page.
        # NOTE(review): any such link is treated as a "unit"; confirm the course page
        # does not also expose lesson/content links that would be crawled as units.
        unit_links = []
        raw_links = soup.find_all('a', href=True)

        for a in raw_links:
            href = a['href']
            if "/science/biology/" in href and href != "/science/biology" and "javascript" not in href:
                full_unit_url = base_url + href if href.startswith("/") else href
                if full_unit_url not in unit_links:
                    unit_links.append(full_unit_url)

        # One entry per unit page, in crawl order.
        class_data = []

        print(f"Found {len(unit_links)} units.")

        for unit_number, unit_url in enumerate(tqdm(unit_links, desc=f"Scraping Class {class_num}"), start=1):
            driver.get(unit_url)

            # Only a wait timeout is tolerated here; a bare except would also swallow
            # KeyboardInterrupt and make the crawl impossible to stop.
            try:
                WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
            except TimeoutException:
                print(f"Timeout loading {unit_url}")

            time.sleep(random.randint(1, 3))

            unit_soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Distinct video links (path contains "/v/") on this unit page, in document order.
            unit_video_links = []
            all_anchors = unit_soup.find_all("a", href=True)

            for anchor in all_anchors:
                href = anchor['href']
                if "/v/" in href:
                    full_url = base_url + href if href.startswith("/") else href
                    if full_url not in unit_video_links:
                        unit_video_links.append(full_url)

            # Units without videos are recorded with a count of 0 and None instead of an empty list.
            class_data.append({
                "unit_number": unit_number,
                "video_number": len(unit_video_links),
                "video_urls": unit_video_links or None
            })

        data[f"class_{class_num}"] = class_data

finally:
    driver.quit()

# Save the data to a JSON file, creating the target directory if it does not exist yet
# so a finished crawl is not lost to a missing folder.
os.makedirs(os.path.dirname(output_filename) or ".", exist_ok=True)
with open(output_filename, 'w', encoding='utf-8') as json_file:
    json.dump(data, json_file, ensure_ascii=False, indent=4)

print(f"Data has been successfully saved to {output_filename}")

Article Data

In [None]:
def clean_text(text):
    """Normalise whitespace in scraped text and drop the "Created with Raphaël" artifact.

    Args:
        text: Raw text extracted from a page element.

    Returns:
        The text with non-breaking spaces converted, every whitespace run
        (including newlines) collapsed to one space, the artifact removed and
        the ends stripped.
    """
    text = text.replace('\xa0', ' ')
    # \s+ already covers spaces, tabs and newlines, so one pass collapses every run.
    text = re.sub(r'\s+', ' ', text)
    text = text.replace("Created with Raphaël", "")
    # Removing the artifact from the middle of a sentence leaves the spaces on both
    # of its sides next to each other; close that gap again.
    text = re.sub(r' {2,}', ' ', text)
    return text.strip()

def remove_duplicates_preserve_order(input_list):
    """Drop items that are already contained in an earlier kept item.

    An item is skipped when it is a substring of (or equal to) any item that
    has already been kept; the first occurrence wins and order is preserved.

    Args:
        input_list: Iterable of strings.

    Returns:
        A new list with the surviving items in their original order.
    """
    result = []
    for item in input_list:
        # Equality is a special case of the substring test, so no separate
        # "seen" bookkeeping is needed.
        if any(item in kept for kept in result):
            continue
        result.append(item)
    return result

def add_space_after_punctuation(text):
    """Insert a space after punctuation that is directly followed by text.

    Scraped paragraphs often lose the space between sentences
    ("olur.Sonra"); this restores it. Two cases are deliberately left alone
    because adding a space would corrupt them:

    * digits joined by '.', ',' or ':' (decimals such as "7,4", times such as "10:30");
    * runs of punctuation such as "..." or "?!", where only the last mark
      of the run gets the space.

    Args:
        text: Text to repair.

    Returns:
        The text with the missing spaces inserted.
    """
    pattern = (
        r'([.,!?;:])'            # a punctuation mark ...
        r'(?=[^\s.,!?;:])'       # ... directly followed by a non-space, non-punctuation char
        r'(?!(?<=\d[.,:])\d)'    # ... unless it sits between two digits
    )
    corrected_text = re.sub(pattern, r'\1 ', text)
    return corrected_text

def split_camel_case(text):
    """Insert a space wherever a lowercase letter is directly followed by an uppercase one.

    The check uses ``str.islower`` / ``str.isupper`` so that non-ASCII letters
    (e.g. Turkish ı, ş, ö, İ, Ö) are handled the same way as a-z / A-Z.

    Note that this also separates mixed-case terms such as "mRNA" or "pH".

    Args:
        text: Text in which words may have been glued together.

    Returns:
        The text with a space inserted at every lowercase-to-uppercase boundary.
    """
    pieces = []
    previous = ""
    for char in text:
        if previous.islower() and char.isupper():
            pieces.append(" ")
        pieces.append(char)
        previous = char
    return "".join(pieces)

def _scrape_article(driver, url):
    """Load one article page and return its url, title and cleaned paragraph text."""
    driver.get(url)

    # Wait for both the title and the article container before snapshotting the DOM,
    # otherwise the parsed page may not contain the content yet.
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, '_1ge0o9y3'))
    )
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, '_1h1mqh3'))
    )
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # When several matching headings exist, the last one is used.
    titles = soup.find_all('h1', class_='_1ge0o9y3')
    title_text = titles[-1].get_text(strip=True) if titles else ""

    content_list = []
    parent_element = soup.find('div', class_='_1h1mqh3')
    if parent_element:
        for sub_element in parent_element.find_all(class_='paragraph'):
            text = clean_text(sub_element.get_text())
            if text and text not in content_list:
                content_list.append(text)

    # Nested elements repeat the text of their children; keep only the first,
    # enclosing occurrence.
    content_list = remove_duplicates_preserve_order(content_list)
    full_contents = "\n".join(content_list)
    full_contents = add_space_after_punctuation(full_contents)
    full_contents = split_camel_case(full_contents)

    return {
        "url": url,
        "title": title_text,
        "contents": full_contents
    }


def read_json(file_path):
    """Scrape every article listed in ``file_path`` and save the extracted text.

    The input JSON maps a class name to a list of units; each unit may carry a
    "non_video_urls" list (or null). Every URL is opened in headless Chrome and
    its title and paragraph text are extracted. URLs that fail are reported on
    stdout and skipped.

    Args:
        file_path: Path to the JSON file with the article URLs.

    Returns:
        None. The results, a list of {"url", "title", "contents"} records, are
        written to "data/biology_extracted_articles.json".
    """
    # Read the input before starting the browser so a bad path does not leave
    # a Chrome process behind.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Size the progress bar from the actual input instead of a hard-coded count.
    total_urls = sum(
        len(unit.get("non_video_urls") or [])
        for units in data.values()
        for unit in units
    )

    options = Options()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--disable-gpu')
    options.add_argument("--window-size=1920,1080")

    driver = webdriver.Chrome(options=options)

    results = []

    try:
        with tqdm(total=total_urls, desc="Processing URLs") as pbar:
            for units in data.values():
                for unit in units:
                    for url in unit.get("non_video_urls") or []:
                        try:
                            results.append(_scrape_article(driver, url))
                        except Exception as e:
                            # Best effort: report the failure and move on to the next URL.
                            print(f"Error processing {url}: {e}")
                        finally:
                            # Count failed URLs too so the bar can reach its total.
                            pbar.update(1)
    finally:
        # Always release the browser, even if the loop is interrupted.
        driver.quit()

    fname = "data/biology_extracted_articles.json"

    # Write results to a new JSON file
    with open(fname, 'w', encoding='utf-8') as outfile:
        json.dump(results, outfile, indent=4, ensure_ascii=False)

    print(f"Contents successfully saved to {fname}")

# Scrape the articles listed in the article-URL JSON file.
file_path = 'data/khanacademy_bio_articles_urls.json'
read_json(file_path)

Video Transcripts

In [None]:
def get_total_link_number(file_path):
    """Return the total number of video links recorded in a video-URL JSON file.

    The file maps a class name to a list of units. Each unit's "video_number"
    is summed; when that field is missing or null the length of the unit's
    "video_urls" list is used instead (0 if that is missing or null as well).

    Args:
        file_path: Path to the JSON file with the video URLs.

    Returns:
        The total count as an int.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    total = 0
    for units in data.values():
        for unit in units:
            video_number = unit.get("video_number")
            if video_number is None:
                # Adding None would raise a TypeError; fall back to counting the URLs.
                video_number = len(unit.get("video_urls") or [])
            total += video_number

    return total

def _scrape_video(driver, url):
    """Load one video page and return its url, title and transcript text (or None)."""
    driver.get(url)
    time.sleep(random.randint(2, 3))

    # Wait for the page title; a timeout here is reported by the caller as a failed URL.
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, '_1ge0o9y3')))

    # A missing transcript is tolerated: the record is still stored with a null transcript.
    try:
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, '_1fezbb8')))
    except TimeoutException:
        print("Element with class '_1fezbb8' not found within the given time.")

    # Snapshot the DOM only after the waits so the parsed page contains what was waited for.
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # When several matching headings exist, the last one is used.
    titles = soup.find_all('h1', class_='_1ge0o9y3')
    title_text = titles[-1].get_text(strip=True) if titles else ""

    transcript_parent = soup.find(class_="_1fezbb8")
    transcript_text_content = transcript_parent.get_text() if transcript_parent else None

    return {
        "url": url,
        "title": title_text,
        "transcript_content": transcript_text_content
    }


def read_json(file_path):
    """Scrape the transcript of every video listed in ``file_path`` and save the result.

    The input JSON maps a class name to a list of units; each unit may carry a
    "video_urls" list (or null). Every URL is opened in headless Chrome and its
    title and transcript are extracted. URLs that fail are reported on stdout
    and skipped.

    NOTE: this function has the same name as the article-scraping ``read_json``
    defined in an earlier cell and replaces it once this cell has run.

    Args:
        file_path: Path to the JSON file with the video URLs.

    Returns:
        None. The results, a list of {"class_num", "contents"} records where
        "contents" holds {"url", "title", "transcript_content"} entries, are
        written to "data/biology_video_transcripts.json".
    """
    # Read the input before starting the browser so a bad path does not leave
    # a Chrome process behind.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    total_urls = get_total_link_number(file_path)

    options = Options()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--disable-gpu')
    options.add_argument("--window-size=1920,1080")

    driver = webdriver.Chrome(options=options)

    results = []

    try:
        with tqdm(total=total_urls, desc="Processing URLs") as pbar:
            for class_num, units in data.items():
                class_unit_data = []
                for unit in units:
                    for url in unit.get("video_urls") or []:
                        try:
                            class_unit_data.append(_scrape_video(driver, url))
                        except Exception as e:
                            # Best effort: report the failure and move on to the next URL.
                            print(f"Error processing {url}: {e}")
                        finally:
                            # Count failed URLs too so the bar can reach its total.
                            pbar.update(1)

                results.append({
                    "class_num": class_num,
                    "contents": class_unit_data
                })
    finally:
        # Always release the browser, even if the loop is interrupted.
        driver.quit()

    fname = "data/biology_video_transcripts.json"

    # Write results to a new JSON file
    with open(fname, 'w', encoding='utf-8') as outfile:
        json.dump(results, outfile, indent=4, ensure_ascii=False)

    print(f"Contents successfully saved to {fname}")

# Scrape the transcripts of the videos listed in the video-URL JSON file.
file_path = 'data/khanacademy_bio_vids_urls.json'
read_json(file_path)


Generating Questions

In [None]:
from api import *

In [13]:
def create_video_prompt(text, title):
    """Build the Turkish prompt asking for four questions about a biology video transcript.

    The prompt instructs the model to return the questions separated by "||"
    and nothing else.

    Args:
        text: Transcript text that is embedded after "Metin:".
        title: Title of the video, embedded in the first sentence.

    Returns:
        The complete prompt string.
    """
    return f"""
    Biyoloji dersinde, verilen metnin başlığı '{title}'. Bu metin aslında bir video transkripti; 
    bu yüzden metni dikkatlice okuyup, konuya dair anlamlı, hatasız ve tekrarsız 4 soru üretmeniz bekleniyor.
    
    - Sorular yalnızca metinde açıkça belirtilen bilgilere dayanmalıdır.
    - Videoda gösterilen ancak metinde açıkça belirtilmeyen görsel veya figürlere dayalı sorulardan kaçının.
    - Eğer transkript sadece belirli bir örneği açıklıyorsa, o örneğe bağlı kalmadan genel bilgiler üzerine sorular üretin.
    - Soruların genel ve kavramsal bilgilere dayalı olması gerekmektedir; özel veya aşırı detaylı bilgilere dayanan sorulardan kaçınmalısınız.

    Örneğin, metinde: "Cebirsel ifadelerde ilk olarak eksi yedi ile sekiz çarpı x'in toplamı isteniyor. 
    Toplam dediği için burada toplama işlemi yapacağız."
    İyi soru örneği: "Cebirsel ifadelerde toplama işlemi nasıl yapılır?"
    Kötü soru örneği: "Cebirsel ifadelerde toplama işleminin nasıl yapıldığını anlatırken hangi örneği kullanılmıştır?" 
    Bu tür spesifik sorulardan kaçınmalısınız

    Metin: {text}
    
    - Sorularınızı aşağıdaki formatta oluşturun:
      İlk soru || İkinci soru || Üçüncü soru || Dördüncü soru
    
    Soruların kesinlikle belirtilen formatta olması gerekmektedir ve sorular dışında başka hiçbir şey yazılmamalıdır.
    """


In [None]:
def create_article_prompt(text, title, course="Biyoloji"):
    """Build the Turkish prompt asking for ten questions about an article.

    The prompt instructs the model to return the questions separated by "||"
    and nothing else.

    Args:
        text: Article text that is embedded after "Metin:".
        title: Title of the article, embedded in the first sentence.
        course: Course name that opens the prompt. Defaults to "Biyoloji" so
            the function can be called with (text, title) only, which is how
            get_quest_dict invokes prompt functions.

    Returns:
        The complete prompt string.
    """
    prompt = f"""
    {course} dersinde, verilen metnin başlığı '{title}'.
    Bu metni anlayıp, konuya dair anlamlı, doğru ve hatasız 10 soru üretmen bekleniyor.
    Metin: {text}
    
    - Soruları aşağıdaki formatta oluştur:
    üreteceğin ilk soru || üreteceğin ikinci soru || ...
    
    -Soruların kesinlikle belirtilen formatta olması gerekmektedir. soru hariç başka hiçbir şey yazmaman gerekmektedir,
    """
    return prompt


In [6]:
def get_questions(text):
    """Split a model answer of the form "q1 || q2 || ..." into individual questions.

    Every segment between "||" separators is returned, including the one after
    the last separator. An answer without any "||" is treated as not following
    the requested format and yields an empty list.

    Args:
        text: Raw answer text (may be empty or None).

    Returns:
        List of non-empty, stripped question strings.
    """
    if not text or "||" not in text:
        return []

    return [part.strip() for part in text.split("||") if part.strip()]

In [14]:
def get_quest_dict(data, prompt_function):
    """Generate questions for every record and group them by title.

    Args:
        data: Iterable of records with a 'title' and the source text under
            'transcript_content' (video records) or 'contents' (article records).
        prompt_function: Callable invoked as prompt_function(text, title) that
            returns the prompt to send to the model.

    Returns:
        A tuple (answers, not_formatted): ``answers`` maps each title to its
        list of parsed questions (a later record with the same title replaces
        an earlier one); ``not_formatted`` collects the raw answers from which
        no question could be parsed.
    """
    answers = {}
    not_formatted = []
    client = connect_azure_gpt()
    for example in data:
        # Video records keep their text under 'transcript_content', article
        # records under 'contents'; accept either.
        text = example.get('transcript_content') or example.get('contents')
        if not text:
            # Nothing to ask about, e.g. a video whose transcript was stored as null.
            continue

        my_prompt = prompt_function(text, example['title'])
        answer = generate_answer(client, my_prompt, "")
        questions = get_questions(answer)
        if questions:
            answers[example['title']] = questions
        else:
            not_formatted.append(answer)
    return answers, not_formatted


In [None]:
# Questions from video transcripts. The transcript file is a list with one entry per
# class; only the first entry's records are used.
file_name = "data/biology_video_transcripts.json"
with open(file_name, encoding='utf-8') as file:
    data = json.load(file)
    data = data[0]['contents']

video_questions, _ = get_quest_dict(data, create_video_prompt)

# Questions from articles.
file_name = "data/biology_extracted_articles.json"
with open(file_name, encoding='utf-8') as file:
    data = json.load(file)

# Article records store their text under 'contents', so expose it under the
# 'transcript_content' key as well, and bind the course name because prompt
# functions are called with (text, title) only.
article_records = [
    {
        "title": article["title"],
        "transcript_content": article["contents"],
        "contents": article["contents"],
    }
    for article in data
]
article_questions, _ = get_quest_dict(
    article_records,
    lambda text, title: create_article_prompt(text, title, "Biyoloji"),
)

# Both results are dicts keyed by title; dicts cannot be added with "+", so merge them.
# On a title collision the article questions replace the video questions.
all_questions = {**video_questions, **article_questions}

output_file = 'biology_questions.json'
with open(output_file, 'w', encoding='utf-8') as file:
    json.dump(all_questions, file, ensure_ascii=False, indent=4)