In [5]:
import requests
import pandas as pd
import time
import random
import json
import re
import os
from google.colab import userdata
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock

In [6]:
# Read the secret named 'github' through the Colab `userdata` API imported above.
# NOTE(review): this rebinds the imported `userdata` module name to the returned
# value, so re-running this cell calls `.get` on that value rather than on the
# module. A separate name (e.g. `github_token`) would keep the cell re-runnable,
# but the clone cell below interpolates `{userdata}` and would need the same rename.
userdata = userdata.get('github')

# Expose the same value to the process environment as GITHUB_TOKEN.
os.environ['GITHUB_TOKEN'] = userdata

In [7]:
# Clone the repository over HTTPS; IPython interpolates `{userdata}` into the URL.
# NOTE(review): the token becomes part of the clone URL — presumably it is then
# persisted in the clone's remote configuration; confirm and consider a credential helper.
!git clone https://{userdata}@github.com/miguroi/sistech.git

Cloning into 'sistech'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (3/3), done.


In [8]:
!git status

fatal: not a git repository (or any of the parent directories): .git


In [10]:
!ls

sample_data  sistech


In [12]:
!cd sistech

/bin/bash: line 1: cd: sistech: No such file or directory


In [14]:
!ls

README.md


In [58]:
!git add .

In [59]:
!git status

On branch main
Your branch is up to date with 'origin/main'.

Changes to be committed:
  (use "git restore --staged <file>..." to unstage)
	[32mnew file:   coursera_courses.csv[m



## Configuration & Setup

In [48]:
# (min, max) bounds in seconds for the random pause taken by rate_limit().
# scrape_coursera() overwrites this global with its `delay_range` argument.
DELAY_RANGE = (1, 3)

# Values used for the `productDifficultyLevel` facet filter, in scrape order.
DIFFICULTY_LEVELS = ['Beginner', 'Intermediate', 'Advanced', 'Mixed']

# Single endpoint every search request is POSTed to.
GRAPHQL_ENDPOINT = "https://www.coursera.org/graphql"

# Page size sent as the `limit` of each search request.
COURSES_PER_REQUEST = 12

# Default number of courses to collect per difficulty level when the caller
# does not pass an explicit cap.
SAMPLE_NUMBER = {
    'Beginner': 600,
    'Intermediate': 500,
    'Advanced': 400,
    'Mixed': 250
}

# One shared session so every request reuses the same headers.
session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Content-Type': 'application/json',
    'Accept': 'application/json',
    'Accept-Language': 'en-US,en;q=0.9',
    # Do not advertise 'br': requests only decodes Brotli bodies when an optional
    # Brotli library is installed, otherwise response.json() receives raw
    # compressed bytes. gzip and deflate are always decoded.
    'Accept-Encoding': 'gzip, deflate',
    'Origin': 'https://www.coursera.org',
    'Referer': 'https://www.coursera.org/courses',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin'
})

## Utility Functions

In [49]:
def rate_limit():
    """Pause for a random number of seconds drawn uniformly from DELAY_RANGE."""
    time.sleep(random.uniform(*DELAY_RANGE))

## GraphQL Request Functions

In [50]:
def build_graphql_payload(difficulty_level, cursor="0"):
    """Assemble the JSON body for one page of the `Search` GraphQL operation.

    Args:
        difficulty_level: Value placed in the `productDifficultyLevel` facet filter.
        cursor: Pagination cursor for the requested page (default "0").

    Returns:
        A one-element list holding the operation dict
        (`operationName`, `variables`, `query`).
    """
    # GraphQL document: the search query plus the fragments selecting hit fields.
    search_query = """
        query Search($requests: [Search_Request!]!) {
          SearchResult {
            search(requests: $requests) {
              elements {
                ...SearchHit
                __typename
              }
              pagination {
                cursor
                totalElements
                __typename
              }
              totalPages
              __typename
            }
            __typename
          }
        }

        fragment SearchHit on Search_Hit {
          ...SearchProductHit
          __typename
        }

        fragment SearchProductHit on Search_ProductHit {
          avgProductRating
          duration
          id
          imageUrl
          isCourseFree
          name
          numProductRatings
          partners
          productCard {
            id
            canonicalType
            marketingProductType
            __typename
          }
          productDifficultyLevel
          productDuration
          productType
          skills
          url
          tagline
          __typename
        }
        """

    requested_facets = [
        "topic", "skills", "productDifficultyLevel", "productDuration",
        "productTypeDescription", "partners", "language",
    ]

    # A single search request, filtered to the given difficulty level.
    search_request = {
        "entityType": "PRODUCTS",
        "limit": COURSES_PER_REQUEST,
        "facets": requested_facets,
        "sortBy": "BEST_MATCH",
        "maxValuesPerFacet": 1000,
        "facetFilters": [[f"productDifficultyLevel:{difficulty_level}"]],
        "cursor": cursor,
        "query": "",
    }

    operation = {
        "operationName": "Search",
        "variables": {"requests": [search_request]},
        "query": search_query,
    }
    return [operation]

def make_graphql_request(difficulty_level, cursor="0"):
    """POST one search page to GRAPHQL_ENDPOINT and unpack the result.

    Waits via rate_limit() before sending. Failures (non-200 status, unexpected
    body shape, or any exception raised while sending/parsing) are printed and
    reported as an empty result rather than raised.

    Args:
        difficulty_level: Difficulty facet value passed to build_graphql_payload.
        cursor: Pagination cursor passed to build_graphql_payload.

    Returns:
        dict with keys 'courses' (list of hits), 'pagination'
        (dict with 'cursor' and 'totalElements') and 'total_pages'.
    """
    def empty_result():
        # Fresh dict on every call so callers never share mutable state.
        return {
            'courses': [],
            'pagination': {'cursor': None, 'totalElements': 0},
            'total_pages': 0,
        }

    request_body = build_graphql_payload(difficulty_level, cursor)

    try:
        rate_limit()
        reply = session.post(GRAPHQL_ENDPOINT, json=request_body, timeout=15)

        if reply.status_code != 200:
            print(f"    Request failed with status: {reply.status_code}")
            return empty_result()

        body = reply.json()

        # The body is expected to be a list whose first item carries 'data'.
        if not body or 'data' not in body[0]:
            print(f"    Invalid response structure")
            return empty_result()

        first_search = body[0]['data']['SearchResult']['search'][0]
        return {
            'courses': first_search['elements'],
            'pagination': first_search['pagination'],
            'total_pages': first_search.get('totalPages', 1),
        }

    except Exception as e:
        print(f"    GraphQL request error: {e}")
        return empty_result()

## Data Extraction Functions

In [51]:
def extract_course_from_graphql(course_data):
    """Flatten one search hit into the record layout used by the scraper.

    Args:
        course_data: dict for a single hit from the search response.

    Returns:
        dict with keys 'title', 'organization', 'rating', 'review_count',
        'difficulty', 'course_type', 'duration', 'skills', 'url', 'is_free'
        and 'course_id'; or None if extraction raised (the error is printed).
    """
    try:
        # `.get(key, [])` only applies its default when the key is absent; a key
        # present with a null value would hand None to str.join and drop the whole
        # course. `or []` covers both the missing and the null case.
        partner_names = course_data.get('partners') or []
        skill_names = course_data.get('skills') or []

        # Prefer the structured duration fields; fall back to the tagline below.
        duration = course_data.get('productDuration') or course_data.get('duration')

        if not duration:
            tagline = course_data.get('tagline', '')
            if tagline:
                # Range pattern first so "3-6 months" is not cut down to "6 months".
                duration_patterns = [
                    r'(\d+\s*-\s*\d+\s*(?:week|month|hour|day)s?)',
                    r'(\d+\s*(?:week|month|hour|day)s?)',
                ]

                for pattern in duration_patterns:
                    match = re.search(pattern, tagline, re.IGNORECASE)
                    if match:
                        duration = match.group(1)
                        break

        return {
            'title': course_data.get('name', ''),
            'organization': ', '.join(partner_names),
            'rating': course_data.get('avgProductRating'),
            'review_count': course_data.get('numProductRatings'),
            'difficulty': course_data.get('productDifficultyLevel', ''),
            'course_type': course_data.get('productType', ''),
            'duration': duration,
            'skills': ', '.join(skill_names),
            'url': course_data.get('url', ''),
            'is_free': course_data.get('isCourseFree', False),
            'course_id': course_data.get('id', '')
        }

    except Exception as e:
        # Best effort: one malformed hit must not abort the whole batch.
        print(f"    Error extracting course data: {e}")
        return None

## Data Preprocessing Functions

In [52]:
def preprocess_courses(courses):
    """Normalise a batch of course records.

    Strips whitespace from organization and skills, title-cases difficulty and
    course type (underscores become spaces), turns empty skills strings into
    missing values, and coerces rating and review_count to numbers.

    Args:
        courses: list of course dicts as produced by extract_course_from_graphql.

    Returns:
        A new list of cleaned record dicts; the input itself is returned
        unchanged when it is empty or yields an empty frame.
    """
    if not courses:
        return courses

    frame = pd.DataFrame(courses)
    if frame.empty:
        return courses

    # assign() evaluates the callables in order and keeps the column positions.
    cleaned = frame.assign(
        organization=lambda d: d['organization'].str.strip(),
        difficulty=lambda d: d['difficulty'].str.title(),
        course_type=lambda d: d['course_type'].str.replace('_', ' ').str.title(),
        skills=lambda d: d['skills'].str.strip().replace('', None),
        rating=lambda d: pd.to_numeric(d['rating'], errors='coerce'),
        review_count=lambda d: pd.to_numeric(d['review_count'], errors='coerce'),
    )
    return cleaned.to_dict('records')

## Web Scraping Functions

In [53]:
def scrape_difficulty_level_graphql(difficulty_level, max_courses=None):
    """Page through search results for one difficulty level.

    Keeps requesting pages until `max_courses` records are collected, a page
    comes back empty, or the pagination cursor is missing or stops advancing.

    Args:
        difficulty_level: Difficulty facet value to scrape.
        max_courses: Cap on collected courses; when None, the SAMPLE_NUMBER
            entry for the level is used (500 if the level has no entry).

    Returns:
        list of preprocessed course record dicts.
    """
    print(f"Scraping {difficulty_level} courses...")

    if max_courses is None:
        max_courses = SAMPLE_NUMBER.get(difficulty_level, 500)

    collected = []
    cursor = "0"
    request_count = 0

    while len(collected) < max_courses:
        page = make_graphql_request(difficulty_level, cursor)
        hits = page['courses']

        if not hits:
            print(f"  No more courses found for {difficulty_level}")
            break

        # How many more records this page is allowed to contribute.
        remaining = max_courses - len(collected)

        batch_courses = []
        for hit in hits:
            record = extract_course_from_graphql(hit)
            if not record:
                continue
            batch_courses.append(record)

            if len(batch_courses) >= remaining:
                batch_courses = batch_courses[:remaining]
                break

        collected.extend(preprocess_courses(batch_courses))
        request_count += 1

        pagination = page['pagination']
        total_elements = pagination['totalElements']

        print(f"  {difficulty_level} - Request {request_count}: {len(batch_courses)} courses | Total: {len(collected)}/{max_courses} | Available: {total_elements}")

        # Stop when there is no next cursor or it did not move forward.
        next_cursor = pagination['cursor']
        if not next_cursor or next_cursor == cursor:
            print(f"  Reached end of results for {difficulty_level}")
            break

        cursor = next_cursor

    print(f"  {difficulty_level}: {len(collected)} courses collected")
    return collected

## Main Scraping Function

In [54]:
def scrape_coursera(levels=None, max_requests_per_level=None, delay_range=(1, 3)):
    """
    Scrape Coursera courses per difficulty level, de-duplicate, summarise and save.

    Args:
        levels: List of difficulty levels to scrape (default: DIFFICULTY_LEVELS).
        max_requests_per_level: Forwarded unchanged as the `max_courses` argument
            of scrape_difficulty_level_graphql, so it caps the number of courses
            collected per level; None lets that function pick its own default.
        delay_range: Tuple of (min, max) delay seconds (default: (1, 3)). Stored
            in the module-level DELAY_RANGE that rate_limit() reads.

    Returns:
        DataFrame of unique courses (by 'course_id'); an empty DataFrame when
        nothing was collected. A non-empty result is also written to
        'coursera_courses.csv'.
    """
    global DELAY_RANGE
    DELAY_RANGE = delay_range

    selected_levels = DIFFICULTY_LEVELS if levels is None else levels

    print("Starting Coursera scraping via GraphQL API...")
    start_time = time.time()

    # Flatten the per-level results into a single list, preserving level order.
    all_courses = [
        course
        for level in selected_levels
        for course in scrape_difficulty_level_graphql(level, max_requests_per_level)
    ]

    df = pd.DataFrame(all_courses)
    if df.empty:
        print("No courses found")
        return df

    # Keep the first occurrence of every course_id.
    original_count = len(df)
    df = df.drop_duplicates(subset=['course_id'], keep='first').reset_index(drop=True)
    duplicates_removed = original_count - len(df)

    elapsed = time.time() - start_time

    # Results summary
    print(f"\nResults:")
    print(f"  Total courses: {len(df)}")
    print(f"  Duplicates removed: {duplicates_removed}")
    print(f"  Time: {elapsed/60:.1f} minutes")
    print(f"  With ratings: {df['rating'].notna().sum()}")
    print(f"  With skills: {df['skills'].notna().sum()}")
    print(f"  Organizations: {df['organization'].nunique()}")
    print(f"  Free courses: {df['is_free'].sum()}")

    # Difficulty breakdown
    print(f"\nDifficulty breakdown:")
    for diff, count in df['difficulty'].value_counts().items():
        print(f"  {diff}: {count}")

    # Show the start of the first non-missing skills string as a sample.
    skills_sample = df.loc[df['skills'].notna(), 'skills'].head(3).tolist()
    if skills_sample:
        print(f"\nSkills sample: {skills_sample[0][:60]}...")

    # Save to CSV
    df.to_csv('coursera_courses.csv', index=False)
    print(f"\nSaved: coursera_courses.csv")

    return df

## Execution

In [55]:
# Run the full scrape with default arguments (all levels, default delay range);
# the returned DataFrame is kept in `df`.
if __name__ == "__main__":
    df = scrape_coursera()

Starting Coursera scraping via GraphQL API...
Scraping Beginner courses...
  Beginner - Request 1: 12 courses | Total: 12/600 | Available: 8801
  Beginner - Request 2: 12 courses | Total: 24/600 | Available: 8801
  Beginner - Request 3: 12 courses | Total: 36/600 | Available: 8801
  Beginner - Request 4: 12 courses | Total: 48/600 | Available: 8801
  Beginner - Request 5: 12 courses | Total: 60/600 | Available: 8801
  Beginner - Request 6: 12 courses | Total: 72/600 | Available: 8801
  Beginner - Request 7: 12 courses | Total: 84/600 | Available: 8801
  Beginner - Request 8: 12 courses | Total: 96/600 | Available: 8801
  Beginner - Request 9: 12 courses | Total: 108/600 | Available: 8801
  Beginner - Request 10: 12 courses | Total: 120/600 | Available: 8801
  Beginner - Request 11: 12 courses | Total: 132/600 | Available: 8801
  Beginner - Request 12: 12 courses | Total: 144/600 | Available: 8801
  Beginner - Request 13: 12 courses | Total: 156/600 | Available: 8801
  Beginner - Reques

In [56]:
# Reload the CSV that scrape_coursera() wrote and summarise its columns,
# non-null counts and dtypes.
coursera_courses = pd.read_csv('coursera_courses.csv')
coursera_courses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1750 entries, 0 to 1749
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         1750 non-null   object 
 1   organization  1750 non-null   object 
 2   rating        1709 non-null   float64
 3   review_count  1750 non-null   int64  
 4   difficulty    1750 non-null   object 
 5   course_type   1750 non-null   object 
 6   duration      1750 non-null   object 
 7   skills        1749 non-null   object 
 8   url           1750 non-null   object 
 9   is_free       1750 non-null   bool   
 10  course_id     1750 non-null   object 
dtypes: bool(1), float64(1), int64(1), object(8)
memory usage: 138.6+ KB


In [57]:
# Preview the first rows of the reloaded dataset.
coursera_courses.head()

Unnamed: 0,title,organization,rating,review_count,difficulty,course_type,duration,skills,url,is_free,course_id
0,Google Data Analytics,Google,4.763478,170221,Beginner,Professional Certificate,THREE_TO_SIX_MONTHS,"Data Storytelling, Data Literacy, Data Visuali...",/professional-certificates/google-data-analytics,False,s12n~kr43OcbTEeqeNBKhfgCLyw
1,Google Cybersecurity,Google,4.822994,53223,Beginner,Professional Certificate,THREE_TO_SIX_MONTHS,"Threat Modeling, Network Security, Incident Re...",/professional-certificates/google-cybersecurity,False,s12n~Dy6K-2UKEe2PIRJn6nL9pQ
2,Google Project Management:,Google,4.842595,130640,Beginner,Professional Certificate,THREE_TO_SIX_MONTHS,"Quality Management, Project Management Life Cy...",/professional-certificates/google-project-mana...,False,s12n~fq9UWMbTEeqpthJ2RmWGow
3,Google AI Essentials,Google,4.837584,2666,Beginner,Specialization,ONE_TO_THREE_MONTHS,"Prompt Engineering, Generative AI, Artificial ...",/specializations/ai-essentials-google,False,s12n~3tzIujTqTk-YdcEFZ9r3sQ
4,Google Digital Marketing & E-commerce,Google,4.801475,40598,Beginner,Professional Certificate,THREE_TO_SIX_MONTHS,"Data Storytelling, Search Engine Marketing, Me...",/professional-certificates/google-digital-mark...,False,s12n~aYYrIEl-EeyCjQ5Y8Mzdsw
