In [None]:
import requests
import json

def get_course_details(course_slug, timeout=30):
    """
    Gets detailed information for a specific course using its slug.

    Sends the ``CDPPageQuery`` GraphQL operation to Coursera's GraphQL
    gateway and returns the decoded JSON body.

    Args:
        course_slug (str): The course slug (from the URL)
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout ``requests`` may block indefinitely.

    Returns:
        dict: Detailed course information, or None when the request fails,
        the server answers with a non-200 status, or the body is not JSON.
    """
    url = "https://www.coursera.org/graphql-gateway"
    operation_name = "CDPPageQuery"

    # Headers to mimic browser request
    headers = {
        "Content-Type": "application/json",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept": "application/json",
        "Origin": "https://www.coursera.org",
        "Referer": f"https://www.coursera.org/learn/{course_slug}"
    }

    # GraphQL query based on the provided schema
    query = """
    query CDPPageQuery($slug: String!) {
      XdpV1Resource {
        slug(productType: "COURSE", slug: $slug) {
          elements {
            name
            id
            slug
            xdpMetadata {
              ... on XdpV1_cdpMetadataMember {
                cdpMetadata {
                  id
                  avgLearningHoursAdjusted
                  level
                  certificates
                  courseStatus
                  domains {
                    domainId
                    domainName
                    subdomainName
                    subdomainId
                  }
                  primaryLanguages
                  skills
                  photoUrl
                  name
                  slug
                  description
                  workload
                  partners {
                    id
                    name
                    shortName
                    logo
                  }
                  instructors {
                    id
                    fullName
                    photo
                    title
                  }
                  ratings {
                    averageFiveStarRating
                    ratingCount
                    commentCount
                  }
                }
              }
            }
          }
        }
      }
    }
    """

    # Request body
    payload = {
        "operationName": operation_name,
        "variables": {"slug": course_slug},
        "query": query
    }

    # Make the request; connection problems and timeouts are reported the
    # same way as HTTP errors (print + None) so callers need one check only.
    try:
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    except requests.exceptions.RequestException as exc:
        print(f"Error: request failed: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print(response.text)
        return None

    # A 200 response is not guaranteed to carry JSON (e.g. an HTML block page).
    try:
        return response.json()
    except ValueError as exc:
        print(f"Error: response was not valid JSON: {exc}")
        return None

def extract_basic_info(response_data):
    """Extract and print basic information from the API response.

    Args:
        response_data (dict): Decoded GraphQL response; the course metadata is
            read from ``data.XdpV1Resource.slug.elements[0].xdpMetadata.cdpMetadata``.

    Returns:
        dict: Summary with the keys ``name``, ``description``, ``level``,
        ``workload``, ``skills``, ``partners``, ``instructors``, ``rating``
        and ``ratingCount``; None when no course is present or the response
        does not have the expected structure.
    """
    try:
        # Navigate through the response structure
        elements = response_data["data"]["XdpV1Resource"]["slug"]["elements"]
        if not elements:
            print("No course found")
            return None

        metadata = elements[0]["xdpMetadata"]["cdpMetadata"]

        # Only mark the description as shortened when it really was cut.
        description = metadata["description"]
        if not description:
            description = "N/A"
        elif len(description) > 100:
            description = description[:100] + "..."

        # List fields may be null in the response; treat null as empty.
        skills = metadata["skills"] or []
        skills_text = ", ".join(skills[:5])
        if len(skills) > 5:
            skills_text += "..."

        # "ratings" may be absent or null; fall back to "N/A" in both cases.
        ratings = metadata.get("ratings") or {}

        # Extract basic information
        info = {
            "name": metadata["name"],
            "description": description,
            "level": metadata["level"],
            "workload": metadata["workload"],
            "skills": skills_text,
            "partners": [p["name"] for p in metadata["partners"] or []],
            "instructors": [i["fullName"] for i in metadata["instructors"] or []],
            "rating": ratings.get("averageFiveStarRating", "N/A"),
            "ratingCount": ratings.get("ratingCount", "N/A"),
        }

        # Print formatted information
        print("\n---- COURSE DETAILS ----")
        print(f"Name: {info['name']}")
        print(f"Level: {info['level']}")
        print(f"Workload: {info['workload']}")
        print(f"Partners: {', '.join(info['partners'])}")
        print(f"Instructors: {', '.join(info['instructors'])}")
        print(f"Rating: {info['rating']} ({info['ratingCount']} ratings)")
        print(f"Skills: {info['skills']}")
        print(f"Description: {info['description']}")

        return info
    except (KeyError, IndexError, TypeError, AttributeError) as e:
        # Structural mismatches (missing keys, null containers, wrong types)
        # are reported and signalled with None rather than raised.
        print(f"Error extracting information: {e}")
        return None

def main():
    """Fetch, summarize and save the details of a few sample course slugs.

    For each slug the full response is written to ``<slug>_details.json`` in
    the current working directory; slugs whose lookup fails are reported and
    skipped.
    """
    print("Coursera API Test - Course Details\n")

    # Example course slugs to test
    sample_slugs = (
        "machine-learning",  # Andrew Ng's Machine Learning course
        "python",  # Python for Everybody
        "deep-learning-specialization",  # Deep Learning Specialization
    )

    for slug in sample_slugs:
        print(f"\nFetching details for course: {slug}")
        details = get_course_details(slug)

        # Guard clause: nothing to summarize or save for a failed lookup.
        if not details:
            print(f"Failed to get details for {slug}")
            continue

        extract_basic_info(details)

        # Optionally save the full response to a JSON file
        output_path = f"{slug}_details.json"
        with open(output_path, "w") as f:
            json.dump(details, f, indent=2)
        print(f"Full details saved to {output_path}")


if __name__ == "__main__":
    main()

In [None]:
# Colab-only cell: runs the Google account authentication flow for this
# notebook session. NOTE(review): presumably this is what supplies the
# credentials later used by google.cloud.storage.Client() -- confirm.
from google.colab import auth
auth.authenticate_user()

In [None]:
import requests
import json
import pandas as pd
import os
from datetime import datetime
import time

def _course_names(items):
    """Return names from a list whose items are dicts with a "name" key or plain values."""
    return [
        item.get("name", "") if isinstance(item, dict) else str(item)
        for item in items or []
    ]


def _sleep_before_retry(attempt, max_retries):
    """Back off exponentially if another attempt remains; return True when it does."""
    if attempt >= max_retries - 1:
        return False
    delay = 2 ** attempt
    print(f"Retrying in {delay} seconds...")
    time.sleep(delay)
    return True


def _parse_api_courses(data):
    """Normalize a decoded API response into a list of course dicts (empty if unrecognized)."""
    if not isinstance(data, dict):
        return []

    if "elements" in data:
        # Standard catalog API structure
        return [
            {
                "id": element.get("id", ""),
                "name": element.get("name", ""),
                "slug": element.get("slug", ""),
                "description": element.get("description", ""),
                "partnerNames": _course_names(element.get("partners")),
                "skills": _course_names(element.get("skills")),
                "avgLearningHours": element.get("workload", ""),
                "rating": element.get("rating", ""),
            }
            for element in data.get("elements") or []
        ]

    linked = data.get("linked") or {}
    if "courses.v1" in linked:
        # Alternative API structure
        return [
            {
                "id": course.get("id", ""),
                "name": course.get("name", ""),
                "slug": course.get("slug", ""),
                "description": course.get("description", ""),
                "partnerNames": course.get("partnerNames", []),
                "skills": course.get("topicIds", []),  # May need transformation
                "avgLearningHours": course.get("workload", ""),
                "rating": course.get("rating", ""),
            }
            for course in linked["courses.v1"]
        ]

    return []


def _scrape_courses_from_browse_page(user_agent, timeout):
    """Extract courses from the initialState JSON embedded in the browse page (empty list if none)."""
    import re

    browse_url = "https://www.coursera.org/browse/data-science"

    print(f"Fetching HTML from {browse_url}...")
    response = requests.get(browse_url, headers={"User-Agent": user_agent}, timeout=timeout)

    if response.status_code != 200:
        print(f"Failed to fetch HTML: Status code {response.status_code}")
        return []

    print("Successfully fetched course browse page")

    # Look for a script tag with JSON data; DOTALL lets the payload span lines.
    match = re.search(
        r'<script id="initialState" type="application/json">(.*?)</script>',
        response.text,
        re.DOTALL,
    )
    if not match:
        print("Could not find initialState JSON in the HTML")
        return []

    print("Found initialState JSON data in the page")
    try:
        initial_data = json.loads(match.group(1))
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON from HTML: {e}")
        return []

    # The exact path depends on the page structure; only this one is tried.
    raw_courses = {}
    if "browse" in initial_data and "courses" in initial_data["browse"]:
        raw_courses = initial_data["browse"]["courses"]

    courses = [
        {
            "id": course_id,
            "name": course_data.get("name", ""),
            "slug": course_data.get("slug", ""),
            "description": course_data.get("description", ""),
            "partnerNames": _course_names(course_data.get("partners")),
            "skills": course_data.get("skills", []),
            "avgLearningHours": course_data.get("workload", ""),
            "rating": course_data.get("rating", ""),
        }
        for course_id, course_data in raw_courses.items()
    ]

    if courses:
        print(f"Successfully extracted {len(courses)} courses from HTML")
    else:
        print("Could not find course data in the initialState JSON")
    return courses


def extract_coursera_courses(query_params=None, limit=20, max_retries=3, timeout=30):
    """
    Extract course data from Coursera's API with improved error handling and debugging

    Each endpoint is tried up to ``max_retries`` times with exponential
    backoff (1s, 2s, 4s, ...). If no endpoint yields courses, the browse page
    HTML is scraped as a last resort.

    Args:
        query_params (dict | None): Extra query parameters merged over the defaults.
        limit (int): Value sent as the ``limit`` query parameter.
        max_retries (int): Attempts per endpoint.
        timeout (float): Per-request timeout in seconds, so a stalled
            connection cannot hang the extraction.

    Returns:
        list[dict]: Courses with the keys ``id``, ``name``, ``slug``,
        ``description``, ``partnerNames``, ``skills``, ``avgLearningHours``
        and ``rating``; an empty list when every method fails.
    """
    # Coursera has multiple potential endpoints we can try
    endpoints = [
        "https://www.coursera.org/api/catalogResults.v2",    # Standard catalog results API
        "https://www.coursera.org/api/courses.v1",           # Courses API (backup)
        "https://www.coursera.org/api/browse/courses"        # Browse courses API (backup)
    ]

    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"

    # More complete headers to mimic browser request
    headers = {
        "User-Agent": user_agent,
        "Accept": "application/json, text/plain, */*",
        "Accept-Language": "en-US,en;q=0.9",
        "Connection": "keep-alive",
        "Origin": "https://www.coursera.org",
        "Referer": "https://www.coursera.org/courses",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin"
    }

    # Base query parameters
    params = {
        "start": 0,
        "limit": limit,
        "query": "",
        "sort": "relevance",
        "fields": "name,slug,description,partnerIds,partners.v1(name),skills,workload,rating,certificates"
    }

    if query_params:
        params.update(query_params)

    # Try each endpoint with retries
    for endpoint in endpoints:
        for attempt in range(max_retries):
            print(f"Trying endpoint {endpoint} (attempt {attempt+1}/{max_retries})...")

            # The request and the JSON decoding are guarded separately so a
            # decoding failure is never misreported as a network error.
            try:
                response = requests.get(endpoint, headers=headers, params=params, timeout=timeout)
            except requests.exceptions.RequestException as e:
                print(f"Network error: {e}")
                if not _sleep_before_retry(attempt, max_retries):
                    print(f"Failed after {max_retries} attempts with endpoint {endpoint}")
                continue

            print(f"Response status code: {response.status_code}")

            if response.status_code != 200:
                print(f"Error: Received status code {response.status_code}")
                if not _sleep_before_retry(attempt, max_retries):
                    print(f"Failed after {max_retries} attempts with endpoint {endpoint}")
                continue

            # Debug info: Print the first 100 characters of the response
            print(f"Response preview: {response.text[:100]}...")

            try:
                data = response.json()
            except ValueError as e:
                print(f"JSON parsing error: {e}")
                print(f"Raw response (first 200 chars): {response.text[:200]}")
                if not _sleep_before_retry(attempt, max_retries):
                    print(f"Failed after {max_retries} attempts with endpoint {endpoint}")
                continue

            courses = _parse_api_courses(data)
            if courses:
                print(f"Successfully extracted {len(courses)} courses")
                return courses

            print("No courses found in response data structure")
            response_keys = list(data.keys()) if isinstance(data, dict) else []
            print(f"Response keys: {response_keys}")
            _sleep_before_retry(attempt, max_retries)

    # If we've tried all endpoints and still failed, try a completely different approach
    print("All API endpoints failed. Trying to scrape course data from HTML...")

    try:
        courses = _scrape_courses_from_browse_page(user_agent, timeout)
        if courses:
            return courses
    except Exception as e:
        # Last-resort path: any failure here just means "no data".
        print(f"Error in HTML fallback method: {e}")

    # If all methods fail, return an empty list
    print("All extraction methods failed. Could not retrieve course data.")
    return []

def convert_to_csv(courses):
    """
    Convert course data to a pandas DataFrame and save as CSV

    The file is written to the current working directory as
    ``coursera_courses_<YYYYMMDD>_<HHMMSS>.csv``.

    Args:
        courses (list[dict]): Course records; missing or null fields are
            written as empty strings.

    Returns:
        str: Path of the CSV file written, or None when ``courses`` is empty.
    """
    if not courses:
        print("No courses to convert to CSV")
        return None

    def _joined(values):
        # A bare string must not be joined character by character, and list
        # items are not guaranteed to be strings (e.g. numeric topic ids).
        if isinstance(values, str):
            return values
        return ", ".join(str(value) for value in values or [])

    # Normalize data and handle nested fields
    processed_courses = [
        {
            "id": course.get("id", ""),
            "name": course.get("name", ""),
            "slug": course.get("slug", ""),
            # Remove line breaks; a null description becomes an empty string
            "description": str(course.get("description") or "").replace("\n", " ").replace("\r", " "),
            "learning_hours": course.get("avgLearningHours", ""),
            "partners": _joined(course.get("partnerNames")),
            "skills": _joined(course.get("skills")),
            "rating": course.get("rating", ""),
        }
        for course in courses
    ]

    # Create DataFrame
    df = pd.DataFrame(processed_courses)

    # Generate a timestamp for the filename
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    file_path = f"coursera_courses_{timestamp}.csv"

    # Save as CSV
    df.to_csv(file_path, index=False, encoding='utf-8')
    print(f"Data saved to {file_path}")

    return file_path

# Manual smoke test: pull a handful of courses, show the first one and write
# the batch to a CSV file.
if __name__ == "__main__":
    print("Testing course extraction...")
    courses = extract_coursera_courses(limit=10)

    if not courses:
        print("Failed to extract any courses")
    else:
        print(f"\nSuccessfully extracted {len(courses)} courses")
        print("\nSample of first course:")
        print(json.dumps(courses[0], indent=2))

        # Convert to CSV
        csv_path = convert_to_csv(courses)
        print(f"\nSaved to CSV: {csv_path}")

In [None]:
#find limit
!  curl "https://api.coursera.org/api/courses.v1?start=1&limit=11?includes=instructorIds,partnerIds,specializations,s12nlds,v1Details,v2Details&fields=instructorIds,partnerIds,specializations,s12nlds,description"


In [None]:
# Fetch a large page of the courses.v1 catalog and report how many elements
# came back.
url = "https://api.coursera.org/api/courses.v1?start=0&limit=2150&includes=instructorIds,partnerIds,specializations,s12nlds,v1Details,v2Details&fields=instructorIds,partnerIds,specializations,s12nlds,description"
# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of a confusing KeyError on
# data['elements'] below.
response = requests.get(url, timeout=60)
response.raise_for_status()
data = response.json()
print(len(data['elements']))

In [None]:
import os
import time
from google.cloud import storage
import mimetypes

def upload_to_gcs(local_file_path, bucket_name, destination_blob_name=None,
                  make_public=False, content_type=None, metadata=None):
    """
    Uploads a file to a Google Cloud Storage bucket

    Args:
        local_file_path: Path of the file to upload.
        bucket_name: Name of the target bucket.
        destination_blob_name: Object name in the bucket; defaults to the
            base name of ``local_file_path``.
        make_public: When true, the blob is made public after the upload and
            its public URL is returned.
        content_type: MIME type; guessed from the file name when omitted,
            falling back to ``application/octet-stream``.
        metadata: Optional metadata assigned to the blob before upload.

    Returns:
        dict with the keys ``url`` (None unless made public), ``path``
        (``gs://`` URI), ``bucket`` and ``blob``; None if any storage
        operation raises.
    """
    # Default the object name to the local file's base name
    blob_name = destination_blob_name
    if blob_name is None:
        blob_name = os.path.basename(local_file_path)

    # Resolve the MIME type: explicit value, then a guess, then a generic default
    mime_type = content_type
    if mime_type is None:
        mime_type = mimetypes.guess_type(local_file_path)[0]
    if mime_type is None:
        mime_type = 'application/octet-stream'

    try:
        client = storage.Client()
        blob = client.bucket(bucket_name).blob(blob_name)

        # Content type and metadata are set on the blob before uploading
        blob.content_type = mime_type
        if metadata:
            blob.metadata = metadata

        blob.upload_from_filename(local_file_path)

        # A URL is only reported for blobs that were made public
        public_url = None
        if make_public:
            blob.make_public()
            public_url = blob.public_url

        gs_path = f"gs://{bucket_name}/{blob_name}"
        print(f"File {local_file_path} uploaded to {gs_path}")

        return {
            'url': public_url,
            'path': gs_path,
            'bucket': bucket_name,
            'blob': blob_name
        }

    except Exception as upload_error:
        print(f"Error uploading to Google Cloud Storage: {upload_error}")
        return None

def ensure_bucket_exists(bucket_name, user_email="ranchana.kiriyapong@gmail.com",
                         role="roles/storage.objectViewer"):
    """Create the bucket if it doesn't exist and grant access

    Args:
        bucket_name: Name of the bucket to look up or create (created in
            ``us-central1`` when missing).
        user_email: E-mail of the user that should receive ``role`` on the
            bucket. Defaults to the address that used to be hard-coded here.
        role: IAM role to grant; defaults to object viewer.

    Returns:
        bool: True when the bucket exists and the grant is in place, False if
        any storage operation raised.
    """
    member = f"user:{user_email}"

    try:
        storage_client = storage.Client()
        bucket = storage_client.bucket(bucket_name)

        if not bucket.exists():
            print(f"Creating bucket {bucket_name}...")
            bucket = storage_client.create_bucket(bucket_name, location="us-central1")
            print(f"Bucket {bucket_name} created")
        else:
            print(f"Bucket {bucket_name} already exists")

        # Set IAM policy to grant access to the required user
        policy = bucket.get_iam_policy()

        # Check if the binding already exists to avoid duplicates
        user_has_access = any(
            binding.get("role") == role and member in binding.get("members", [])
            for binding in policy.bindings
        )

        if not user_has_access:
            policy.bindings.append({
                "role": role,
                "members": [member]
            })
            bucket.set_iam_policy(policy)
            print(f"Access granted to {user_email}")
        else:
            print(f"{user_email} already has access")

        return True

    except Exception as e:
        print(f"Error with bucket operations: {e}")
        return False

def main():
    """Extract the course catalog, write it to CSV and upload the CSV to GCS.

    Stops early (with a message) when extraction, CSV creation or the upload
    fails; the completion message is printed only after a successful upload.
    """
    # Set your GCS bucket name - use the one that's already working
    bucket_name = "servicemyassaccount"

    print("Starting course data extraction...")

    # Single extraction request sized to cover the catalog
    print("Extracting a test batch of courses...")
    all_courses = extract_coursera_courses(limit=2150)

    if not all_courses:
        print("Failed to extract test batch of courses. Exiting.")
        return

    print(f"Successfully extracted {len(all_courses)} courses in test batch")

    # Convert to CSV
    print("Converting courses to CSV...")
    csv_path = convert_to_csv(all_courses)

    if not csv_path:
        print("Failed to create CSV file. Exiting.")
        return

    # The CSV is named coursera_courses_<YYYYMMDD>_<HHMMSS>.csv, so the date is
    # the third "_"-separated part of the stem (index 1 is the word "courses").
    name_parts = os.path.splitext(os.path.basename(csv_path))[0].split("_")
    extraction_date = name_parts[2] if len(name_parts) > 2 else ""

    # Upload to Google Cloud Storage
    print(f"Uploading {csv_path} to Google Cloud Storage...")
    result = upload_to_gcs(
        local_file_path=csv_path,
        bucket_name=bucket_name,
        make_public=True,
        metadata={
            "source": "coursera_api",
            "extraction_date": extraction_date,
            "record_count": str(len(all_courses))
        }
    )

    if not result:
        # Do not claim success when the upload did not happen.
        print("Failed to upload file to GCS")
        return

    print(f"File uploaded successfully!")
    print(f"GCS Path: {result['path']}")
    if result.get('url'):
        print(f"Public URL: {result['url']}")

    print("\nAssignment completed successfully!")


if __name__ == "__main__":
    main()