# 🇹🇳 Tunisian University Guide Comprehensive Scraper

This notebook implements a complete scraping solution for the Tunisian University Orientation website (https://guide-orientation.rnu.tn/).

## 🎯 Objectives
- Extract all combinations of:
  - **نوع الباكالوريا** (Baccalaureate types)
  - **الجامعة** (Universities) 
  - **المؤسسة** (Institutions)
  - **الشعبة/الإجازة** (Specializations/Degrees)

- Scrape detailed information from the *ramz* (رمز, the program code) popup pages, including:
  - University and institution details
  - Program specifications
  - Historical admission scores
  - Entry requirements

## 📊 Expected Output
A comprehensive CSV dataset with ~3,161 specialization records containing all program details and statistics.

In [None]:
# Import Required Libraries
import requests
import asyncio
import aiohttp
import pandas as pd
from bs4 import BeautifulSoup
import json
import re
import time
from urllib.parse import urljoin, parse_qs, urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.notebook import tqdm
import os
from datetime import datetime
from dataclasses import dataclass, asdict
from typing import List, Dict, Optional
import warnings
warnings.filterwarnings('ignore')

# Runs only after every import above has succeeded.
print("✅ All libraries imported successfully")

In [None]:
# Configuration and Data Structures
# Root of the orientation guide site; the request URLs in the cells below are built from it.
BASE_URL = "https://guide-orientation.rnu.tn"
MAX_WORKERS = 10  # Parallel processing workers
DELAY = 1.0       # Delay between requests (seconds); awaited before each async page fetch

# Ensure data directory exists (path is relative to the working directory;
# exist_ok=True makes re-running this cell harmless)
os.makedirs('../data', exist_ok=True)

@dataclass
class SpecializationDetail:
    """One scraped program record, identified by its ramz code.

    Only ``ramz_code`` and ``ramz_url`` are required. Every other field
    defaults to an empty string and is filled in by the detail-page parser
    when the matching row is present on the page.

    ``score_history`` maps a year string to a score string. It may be passed
    as ``None`` (the default); it is then replaced by a fresh empty dict so
    callers can always do ``detail.score_history[year] = score``.
    """
    ramz_code: str
    ramz_url: str
    bac_type: str = ""
    university: str = ""
    governorate: str = ""
    institution: str = ""
    address: str = ""
    phone: str = ""
    specialization: str = ""
    training_field: str = ""
    specializations: str = ""
    measure: str = ""
    capacity_2025: str = ""
    requires_test: str = ""
    geographic_distribution: str = ""
    conditions: str = ""
    study_duration: str = ""
    last_oriented_score_2024: str = ""
    score_history: Optional[Dict[str, str]] = None
    extraction_timestamp: str = ""

    def __post_init__(self):
        # A dataclass field cannot default to a shared mutable dict, so the
        # declared default stays None and each instance gets its own dict here.
        if self.score_history is None:
            self.score_history = {}

# Session configuration
# One shared requests.Session is reused by all synchronous scraping functions
# below, so these default headers are sent with every request made through it.
session = requests.Session()
session.headers.update({
    # Browser-like User-Agent (presumably to avoid being served a bot page — TODO confirm)
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    # Prefer Arabic content, with English as a lower-priority alternative
    'Accept-Language': 'ar,en;q=0.5',
    'Connection': 'keep-alive',
    'Referer': BASE_URL
})

# Confirmation message for the notebook user once the cell has run.
print("✅ Configuration and data structures defined")

In [None]:
# Define Scraping Functions

def get_bac_types():
    """Extract baccalaureate types from the main page.

    Fetches ``{BASE_URL}/index.php`` and collects the ``<option>`` entries of
    every ``<select>`` on the page, skipping entries with an empty value or
    text, the value ``'0'``, and placeholder entries whose text contains
    'إختر'.

    Returns:
        A list of ``{'value': ..., 'text': ...}`` dicts, one per distinct
        option value (first occurrence wins, page order preserved). An empty
        list is returned when the request or the parsing fails.
    """
    print("🔍 Extracting baccalaureate types...")

    try:
        # The timeout keeps a stalled server from blocking the notebook forever;
        # a timeout raises and is reported by the handler below.
        response = session.get(f"{BASE_URL}/index.php", timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # NOTE(review): options from *all* selects are collected, not only the
        # baccalaureate one — confirm the page has no other populated selects.
        unique_options = {}
        for select in soup.find_all('select'):
            for option in select.find_all('option'):
                value = option.get('value')
                text = option.get_text().strip()
                if not value or not text:
                    continue
                if value == '0' or 'إختر' in text:
                    continue
                # setdefault keeps the first option seen for each value.
                unique_options.setdefault(value, {'value': value, 'text': text})

        bac_options = list(unique_options.values())
        print(f"✅ Found {len(bac_options)} baccalaureate types")
        return bac_options

    except Exception as e:
        print(f"❌ Error extracting bac types: {e}")
        return []

def get_ramz_links_for_bac(bac_value, bac_text):
    """Get all ramz links for a specific baccalaureate type.

    The main page is fetched to read the first form's ``action``; that URL
    (when present) and a list of guessed search URLs are then tried in order.
    Each URL is queried with GET and, if that yields no links, with POST,
    because a page may answer GET with HTTP 200 while only honouring the
    search fields on POST. The last parsed response is written to
    ``../data/search_debug_bac_<bac_value>.html`` for debugging.

    Args:
        bac_value: Option value of the baccalaureate type (sent as ``nbac``).
        bac_text: Human-readable name, used in messages and copied into links.

    Returns:
        The list of link dicts produced by ``extract_ramz_links_from_soup``
        for the first request that yields any, or ``[]`` if nothing works.
    """
    print(f"🔗 Getting ramz links for {bac_text}...")
    request_timeout = 30  # seconds; prevents a stalled server from hanging the run

    try:
        # First get the main page to understand the form structure
        main_response = session.get(f"{BASE_URL}/index.php", timeout=request_timeout)
        main_response.raise_for_status()
        main_soup = BeautifulSoup(main_response.content, 'html.parser')

        # Look for the actual form action
        form = main_soup.find('form')
        form_action = form.get('action') if form else None

        # Candidate search URLs, most likely first
        possible_urls = [
            f"{BASE_URL}/ar/dynamique/index_ar.php",
            f"{BASE_URL}/dynamique/index_ar.php",
            f"{BASE_URL}/ar/index_ar.php",
            f"{BASE_URL}/index_ar.php"
        ]

        if form_action:
            possible_urls.insert(0, urljoin(BASE_URL, form_action))

        # Same fields for both methods: only the bac type is constrained.
        form_fields = {
            'nbac': bac_value,
            'univ': '',
            'inst': '',
            'spec': ''
        }

        # Try each URL until we find one that works
        for search_url in possible_urls:
            try:
                print(f"   Trying URL: {search_url}")

                for method in ('GET', 'POST'):
                    if method == 'GET':
                        response = session.get(
                            search_url, params=form_fields, timeout=request_timeout)
                    else:
                        response = session.post(
                            search_url, data=form_fields, timeout=request_timeout)

                    if response.status_code != 200:
                        print(f"   HTTP {response.status_code} from {search_url}")
                        continue

                    soup = BeautifulSoup(response.content, 'html.parser')

                    # Save search results for debugging
                    with open(f'../data/search_debug_bac_{bac_value}.html', 'w', encoding='utf-8') as f:
                        f.write(soup.prettify())

                    # Extract ramz links
                    ramz_links = extract_ramz_links_from_soup(soup, bac_value, bac_text)

                    if ramz_links:
                        print(f"✅ Found {len(ramz_links)} ramz links for {bac_text}")
                        return ramz_links

                    print(f"   No ramz links found in response from {search_url}")

            except Exception as url_error:
                # Best effort: a failure on one candidate must not stop the others.
                print(f"   Error with {search_url}: {url_error}")
                continue

        print(f"❌ All URLs failed for {bac_text}")
        return []

    except Exception as e:
        print(f"❌ Error getting ramz links for {bac_text}: {e}")
        return []

# Patterns used by extract_ramz_links_from_soup, compiled once.
_POPUP_HREF_RE = re.compile(r'javascript:PopupCentrer')
_POPUP_CALL_RE = re.compile(r'PopupCentrer')
# Quoted argument containing filiere.php; accepts either quote style.
_FILIERE_URL_RE = re.compile(r'''["']([^"']*filiere\.php[^"']*)["']''')
_CELL_CODE_RE = re.compile(r'\b\d{5,6}\b')


def _ramz_code_from_id(ramz_id, bac_value):
    """Return the ramz code carried by a ``filiere.php?id=`` value.

    Ids of up to five characters are already the code. Longer ids are assumed
    to be ``<bac value><ramz code>`` (TODO confirm against the live site), so
    only a leading ``bac_value`` is removed. Anything else is returned
    unchanged rather than reduced to an empty string.
    """
    if len(ramz_id) <= 5:
        return ramz_id
    prefix = str(bac_value)
    if prefix and ramz_id.startswith(prefix) and len(ramz_id) > len(prefix):
        return ramz_id[len(prefix):]
    return ramz_id


def extract_ramz_links_from_soup(soup, bac_value, bac_text):
    """Extract ramz links from search results HTML.

    Three strategies are applied and their results merged:

    1. ``<a href="javascript:PopupCentrer(...)">`` links whose argument names
       ``filiere.php``.
    2. Table cells whose text contains a 5-6 digit number and that hold a
       ``PopupCentrer`` link.
    3. Plain (non-JavaScript) links pointing at ``filiere.php?id=...``.

    Args:
        soup: Parsed search-results page.
        bac_value: Baccalaureate option value, copied into every link.
        bac_text: Baccalaureate name, copied into every link.

    Returns:
        A list of dicts with keys ``ramz_code``, ``ramz_id``, ``url``,
        ``bac_value`` and ``bac_text``; one per distinct ``ramz_code``
        (first occurrence wins).
    """
    ramz_links = []

    def add_link(ramz_code, ramz_id, url):
        ramz_links.append({
            'ramz_code': ramz_code,
            'ramz_id': ramz_id,
            'url': url,
            'bac_value': bac_value,
            'bac_text': bac_text
        })

    # NOTE(review): relative URLs are resolved against BASE_URL, not against
    # the URL of the page that produced `soup` — confirm this yields the right path.

    # Method 1: Find all links with PopupCentrer
    for link in soup.find_all('a', href=_POPUP_HREF_RE):
        url_match = _FILIERE_URL_RE.search(link.get('href'))
        if not url_match:
            continue
        relative_url = url_match.group(1)
        ramz_id = parse_qs(urlparse(relative_url).query).get('id', [None])[0]
        if ramz_id:
            add_link(_ramz_code_from_id(ramz_id, bac_value), ramz_id,
                     urljoin(BASE_URL, relative_url))

    # Method 2: Look in table cells for ramz codes
    for table in soup.find_all('table'):
        for row in table.find_all('tr'):
            for cell in row.find_all('td'):
                ramz_match = _CELL_CODE_RE.search(cell.get_text().strip())
                if not ramz_match:
                    continue
                ramz_code = ramz_match.group()

                # Look for associated link inside the same cell
                for cell_link in cell.find_all('a', href=_POPUP_CALL_RE):
                    url_match = _FILIERE_URL_RE.search(cell_link.get('href'))
                    if url_match:
                        add_link(ramz_code, ramz_code,
                                 urljoin(BASE_URL, url_match.group(1)))

    # Method 3: Direct link extraction from href attributes
    for link in soup.find_all('a', href=True):
        href = link.get('href')
        # javascript: hrefs belong to Method 1; parsing them as URLs here
        # would produce ids polluted with the trailing call arguments.
        if href.strip().lower().startswith('javascript:'):
            continue
        if 'filiere.php' not in href or 'id=' not in href:
            continue

        full_url = href if href.startswith('http') else urljoin(BASE_URL, href)
        ramz_id = parse_qs(urlparse(href).query).get('id', [None])[0]
        if ramz_id:
            add_link(_ramz_code_from_id(ramz_id, bac_value), ramz_id, full_url)

    # Remove duplicates based on ramz_code (first occurrence wins)
    unique_links = {}
    for link in ramz_links:
        unique_links.setdefault(link['ramz_code'], link)

    return list(unique_links.values())

# Confirmation message shown once the function definitions above have executed.
print("✅ Improved scraping functions defined")

In [None]:
# HTML Parsing Functions

def _split_institution_text(institution_text):
    """Split the institution cell text into ``(name, address, phone)``.

    The first non-empty line is the name. Address and phone are taken from
    the 'العنوان :' and 'الهاتف :' markers, each up to the end of its line.
    Any part that cannot be found is returned as ``None``.
    """
    lines = [line.strip() for line in institution_text.split('\n') if line.strip()]
    if not lines:
        return None, None, None

    # Re-join with newlines so each capture stops at the end of its own line
    # instead of swallowing the fields that follow it.
    full_text = '\n'.join(lines)
    address_match = re.search(r'العنوان\s*:\s*([^\n\r]+)', full_text)
    phone_match = re.search(r'الهاتف\s*:\s*([^\n\r]+)', full_text)

    address = address_match.group(1).strip() if address_match else None
    phone = phone_match.group(1).strip() if phone_match else None
    return lines[0], address, phone


def _extract_score_history(script_content):
    """Return ``{year: score}`` pairs found in inline script text.

    A pair is a 20xx year followed (after any non-digits) by a number; numbers
    of two characters or fewer are ignored as unlikely to be scores.
    NOTE(review): this heuristic is loose and may pick up unrelated numbers.
    """
    history = {}
    for year, score in re.findall(r'(20\d{2})[^\d]*(\d+(?:\.\d+)?)', script_content):
        if len(score) > 2:  # Likely a score
            history[year] = score
    return history


def parse_ramz_detail_page(ramz_info, html_content):
    """Parse a ramz detail page into a ``SpecializationDetail``.

    Rows of the first ``<table class="table">`` are read as label/value pairs
    and mapped to fields by their Arabic label; score history is then scraped
    from the text of inline ``<script>`` tags.

    Args:
        ramz_info: Dict with at least ``ramz_code``, ``url`` and ``bac_text``.
        html_content: HTML of the detail page.

    Returns:
        The populated ``SpecializationDetail``, or ``None`` if parsing raised.
    """
    ramz_code = ramz_info['ramz_code']
    url = ramz_info['url']
    bac_text = ramz_info['bac_text']

    try:
        soup = BeautifulSoup(html_content, 'html.parser')

        # Initialize detail object
        detail = SpecializationDetail(
            ramz_code=ramz_code,
            ramz_url=url,
            bac_type=bac_text,
            score_history={},
            extraction_timestamp=datetime.now().isoformat()
        )

        # Parse main data table
        table = soup.find('table', class_='table')
        rows = table.find_all('tr') if table else []
        for row in rows:
            cells = row.find_all('td')
            if len(cells) < 2:
                continue

            label = cells[0].get_text().strip()
            value_cell = cells[1]
            value = value_cell.get_text().strip()

            # Extract data based on Arabic labels
            if 'الجامعة' in label and value:
                detail.university = value
            elif 'الولاية' in label and value:
                detail.governorate = value
            elif 'المؤسسة' in label:
                # Institution info might span multiple lines
                name, address, phone = _split_institution_text(value_cell.get_text())
                # Only overwrite a field when something was actually found.
                if name is not None:
                    detail.institution = name
                if address is not None:
                    detail.address = address
                if phone is not None:
                    detail.phone = phone
            elif 'مجال التكوين' in label and value:
                detail.training_field = value
            elif 'الشعبة / الإجازة' in label and value:
                detail.specialization = value
            elif 'التخصصات' in label and value:
                detail.specializations = value
            elif 'المقياس' in label and value:
                detail.measure = value
            elif 'طاقة الإستعاب' in label and value:
                detail.capacity_2025 = value
            elif 'شعبة تتطلب إختبار' in label and value:
                detail.requires_test = value
            elif 'التنفيل الجغرافي' in label and value:
                detail.geographic_distribution = value
            elif 'الشروط' in label and value:
                detail.conditions = value
            elif 'مدة الدراسة' in label and value:
                detail.study_duration = value
            elif 'مجموع آخر موجه 2024' in label and value:
                detail.last_oriented_score_2024 = value

        # Extract score history from JavaScript data; later scripts override
        # earlier ones for the same year.
        for script in soup.find_all('script'):
            if script.string:
                detail.score_history.update(_extract_score_history(script.string))

        return detail

    except Exception as e:
        print(f"❌ Error parsing ramz {ramz_code}: {e}")
        return None

async def fetch_html_content(session, ramz_info):
    """Download the detail page for one ramz link.

    Args:
        session: Async HTTP session whose ``get(url)`` is an async context
            manager yielding a response with ``status`` and ``text()``.
        ramz_info: Link dict; ``url`` is fetched, ``ramz_code`` is used in
            the error message.

    Returns:
        ``(ramz_info, html)`` on HTTP 200, otherwise ``(ramz_info, None)`` —
        including when the request raises.
    """
    try:
        async with session.get(ramz_info['url']) as reply:
            # Only a 200 response body is read; anything else counts as a miss.
            page = await reply.text() if reply.status == 200 else None
            return ramz_info, page
    except Exception as e:
        print(f"Error fetching {ramz_info['ramz_code']}: {e}")
        return ramz_info, None

# Confirmation message shown once the parsing helpers above have been defined.
print("✅ HTML parsing functions defined")

In [None]:
# Debug: Analyze Website Structure

def analyze_website_structure():
    """Fetch the main page and print a summary of its structure.

    Saves the prettified page to ``../data/main_page_debug.html`` and reports
    its forms (with their inputs and the first options of each select), the
    script tags that mention functions or submit logic, and the first links
    whose href contains ``ar/``.

    Returns:
        The parsed main page, or ``None`` if the request or analysis failed.
    """
    print("🔍 Analyzing website structure...")

    try:
        # Get main page (the timeout keeps a stalled server from hanging the cell)
        response = session.get(f"{BASE_URL}/index.php", timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Save main page HTML
        with open('../data/main_page_debug.html', 'w', encoding='utf-8') as f:
            f.write(soup.prettify())

        print("✅ Main page saved to ../data/main_page_debug.html")

        # Analyze forms
        forms = soup.find_all('form')
        print(f"\n📋 Found {len(forms)} form(s):")

        for form_number, form in enumerate(forms, 1):
            action = form.get('action', 'No action')
            method = form.get('method', 'GET')
            print(f"   Form {form_number}: {method} -> {action}")

            # Analyze form inputs
            for inp in form.find_all(['input', 'select', 'button']):
                name = inp.get('name', 'No name')
                # Elements without a type attribute are reported by tag name.
                inp_type = inp.get('type', inp.name)
                print(f"      - {inp_type}: {name}")

                if inp.name == 'select':
                    options = inp.find_all('option')
                    print(f"        Options: {len(options)}")
                    for opt in options[:3]:  # Show first 3 options
                        value = opt.get('value', 'No value')
                        text = opt.get_text().strip()
                        print(f"          {value}: {text}")
                    if len(options) > 3:
                        print(f"          ... and {len(options) - 3} more")

        # Look for JavaScript that might handle the form
        scripts = soup.find_all('script')
        print(f"\n🔗 Found {len(scripts)} script tags")

        for script_number, script in enumerate(scripts, 1):
            if script.string and ('function' in script.string or 'submit' in script.string):
                print(f"   Script {script_number} contains functions/submit logic")

        # Look for any direct links to Arabic pages
        links = soup.find_all('a', href=True)
        arabic_links = [link for link in links if 'ar/' in link.get('href', '')]

        print(f"\n🌐 Found {len(arabic_links)} links to Arabic pages:")
        for link in arabic_links[:5]:
            href = link.get('href')
            text = link.get_text().strip()[:50]
            print(f"   {href} - {text}")

        return soup

    except Exception as e:
        print(f"❌ Error analyzing website: {e}")
        return None

def test_direct_arabic_page():
    """Probe a few candidate Arabic entry pages and report what they contain.

    For each candidate URL the HTTP status is printed. Pages answering 200
    are summarised (number of forms, tables and selects) and saved,
    prettified, under ``../data/test_<sanitised url>.html``. Errors are
    printed per URL and do not stop the remaining probes. Returns ``None``.
    """
    print("\n🧪 Testing direct access to Arabic pages...")

    arabic_urls = [
        f"{BASE_URL}/ar/",
        f"{BASE_URL}/ar/index.php",
        f"{BASE_URL}/ar/dynamique/",
        f"{BASE_URL}/ar/dynamique/index.php"
    ]

    for url in arabic_urls:
        try:
            print(f"   Testing: {url}")
            # The timeout keeps one unresponsive URL from blocking the rest.
            response = session.get(url, timeout=30)
            print(f"      Status: {response.status_code}")

            if response.status_code != 200:
                continue

            soup = BeautifulSoup(response.content, 'html.parser')
            forms = soup.find_all('form')
            tables = soup.find_all('table')
            selects = soup.find_all('select')

            print(f"      Content: {len(forms)} forms, {len(tables)} tables, {len(selects)} selects")

            # Save successful page; the URL is flattened into a file-system-safe name
            filename = url.replace('/', '_').replace(':', '').replace('.', '_') + '.html'
            with open(f'../data/test_{filename}', 'w', encoding='utf-8') as f:
                f.write(soup.prettify())
            print(f"      Saved to: ../data/test_{filename}")

        except Exception as e:
            print(f"      Error: {e}")

# Run the analysis: summarise the main page, then probe the Arabic entry pages.
print("🔍 WEBSITE STRUCTURE ANALYSIS")
print("=" * 40)

# main_soup is the parsed main page, or None when the analysis failed.
main_soup = analyze_website_structure()
test_direct_arabic_page()

print("\n✅ Analysis complete. Check the ../data/ folder for saved HTML files.")

In [None]:
# Alternative: Manual Ramz Link Discovery

def discover_ramz_links_by_pattern():
    """Discover ramz links by probing known id patterns with HEAD requests.

    Detail pages live at ``{BASE_URL}/ar/dynamique/filiere.php?id=<id>``,
    where the id appears to be ``<bac type><ramz code>`` (TODO confirm). For
    every hard-coded bac type and every seed ramz code, the candidate ids are
    probed in order and the first one answering HTTP 200 is recorded.

    Returns:
        A list of link dicts with keys ``ramz_code``, ``ramz_id``, ``url``,
        ``bac_value`` and ``bac_text``.
    """
    print("🔍 Discovering ramz links using pattern matching...")

    discovered_links = []

    # Bac types to probe and the id prefix assumed for each
    bac_patterns = [
        {'bac_value': '1', 'bac_text': 'آداب', 'prefix': '1'},
        {'bac_value': '2', 'bac_text': 'رياضيات', 'prefix': '2'},
        {'bac_value': '3', 'bac_text': 'علوم تجريبية', 'prefix': '3'},
        {'bac_value': '4', 'bac_text': 'إقتصاد وتصرف', 'prefix': '4'},
        {'bac_value': '5', 'bac_text': 'العلوم التقنية', 'prefix': '5'},
        {'bac_value': '6', 'bac_text': 'علوم الإعلامية', 'prefix': '6'},
        {'bac_value': '7', 'bac_text': 'Other1', 'prefix': '7'},
        {'bac_value': '8', 'bac_text': 'Other2', 'prefix': '8'},
        {'bac_value': '9', 'bac_text': 'Other3', 'prefix': '9'}
    ]

    # Seed ramz codes to probe
    test_ramz_codes = [
        '10101', '10102', '10103', '10104', '10105', '10106', '10107', '10108',
        '10118', '10119', '10120', '10121', '10122', '10123', '10124', '10138',
        '10139', '10160', '10162', '10167', '10190', '10191', '10192', '10193',
        '10194', '10195', '10200', '10201', '10202', '10203'
    ]

    print(f"Testing {len(test_ramz_codes)} known ramz codes...")

    for bac in bac_patterns:
        bac_links = []

        for ramz_code in test_ramz_codes:
            # Candidate ids in priority order. dict.fromkeys drops repeats
            # (prefix and bac_value are currently equal) so the same URL is
            # never probed twice, while keeping the order.
            id_patterns = dict.fromkeys((
                f"{bac['prefix']}{ramz_code}",     # e.g. 110101
                ramz_code,                         # e.g. 10101
                f"{bac['bac_value']}{ramz_code}",  # e.g. 110101
            ))

            for ramz_id in id_patterns:
                url = f"{BASE_URL}/ar/dynamique/filiere.php?id={ramz_id}"

                try:
                    # Quick HEAD request to check if URL exists
                    response = session.head(url, timeout=5)
                except Exception:
                    # Best-effort probing: a failed request just means
                    # "try the next pattern".
                    continue

                if response.status_code == 200:
                    bac_links.append({
                        'ramz_code': ramz_code,
                        'ramz_id': ramz_id,
                        'url': url,
                        'bac_value': bac['bac_value'],
                        'bac_text': bac['bac_text']
                    })
                    print(f"✅ Found: {ramz_code} -> {url}")
                    break  # Found working pattern for this ramz_code

                time.sleep(0.1)  # Small delay

        if bac_links:
            print(f"Found {len(bac_links)} links for {bac['bac_text']}")
            discovered_links.extend(bac_links)

        time.sleep(1)  # Rate limiting between bac types

    print(f"\n✅ Discovered {len(discovered_links)} ramz links using patterns")
    return discovered_links

def expand_ramz_range(base_links, max_range=1000):
    """Expand discovered ramz links by probing nearby codes.

    Links are grouped by ``bac_value``. For each group, the id prefix is
    derived from its first link, and ``max_range`` consecutive codes are
    probed with HEAD requests, starting at the first three characters of the
    sample code followed by ``01``. Every URL answering HTTP 200 is added.

    Args:
        base_links: Link dicts with keys ``ramz_code``, ``ramz_id``,
            ``bac_value`` and ``bac_text``. The list is not modified.
        max_range: Number of consecutive codes to probe per bac type.

    Returns:
        A new list containing ``base_links`` followed by the links found.
    """
    print(f"🔍 Expanding ramz code range (testing up to {max_range} codes per bac type)...")

    expanded_links = list(base_links)  # Start with what we have

    # Group by bac type
    bac_groups = {}
    for link in base_links:
        bac_groups.setdefault(link['bac_value'], []).append(link)

    # Known (bac type, code) pairs. Keying on the bac type as well means a
    # code already known for one bac type is still probed for the others,
    # and set membership replaces a scan of the whole list per probe.
    known = {(link['bac_value'], link['ramz_code']) for link in base_links}

    for bac_value, links in bac_groups.items():
        sample_link = links[0]
        bac_text = sample_link['bac_text']
        print(f"Expanding {bac_text}...")

        # Get the ID pattern that worked
        ramz_id = sample_link['ramz_id']
        ramz_code = sample_link['ramz_code']

        # Determine the prefix pattern: whatever precedes the code in the id
        if ramz_code and ramz_id.endswith(ramz_code):
            prefix = ramz_id[:-len(ramz_code)]
        else:
            prefix = ramz_id.replace(ramz_code, '')

        # Test a range of ramz codes
        try:
            start_code = int(ramz_code[:3] + '01')  # e.g., 10105 -> start from 10101
        except ValueError:
            # A non-numeric code cannot seed a numeric range; skip this bac
            # type instead of aborting the whole expansion.
            print(f"   Skipping {bac_text}: ramz code '{ramz_code}' is not numeric")
            continue
        end_code = start_code + max_range

        print(f"   Testing range {start_code} to {end_code} with prefix '{prefix}'")

        found_count = 0
        for test_code in range(start_code, end_code):
            test_ramz_code = str(test_code)

            # Skip if we already have this one
            if (bac_value, test_ramz_code) in known:
                continue

            test_ramz_id = f"{prefix}{test_ramz_code}"
            url = f"{BASE_URL}/ar/dynamique/filiere.php?id={test_ramz_id}"

            try:
                response = session.head(url, timeout=3)
            except Exception:
                # Best-effort probing: treat a failed request as "not found".
                continue

            if response.status_code == 200:
                expanded_links.append({
                    'ramz_code': test_ramz_code,
                    'ramz_id': test_ramz_id,
                    'url': url,
                    'bac_value': bac_value,
                    'bac_text': bac_text
                })
                known.add((bac_value, test_ramz_code))
                found_count += 1

                if found_count % 10 == 0:
                    print(f"      Found {found_count} new links...")

            time.sleep(0.05)  # Small delay

        print(f"   Added {found_count} new links for {bac_text}")
        time.sleep(2)  # Rate limiting between bac types

    new_total = len(expanded_links)
    added = new_total - len(base_links)
    print(f"\n✅ Expansion complete: {added} new links added (total: {new_total})")

    return expanded_links

# Run the alternative discovery method
print("🔍 ALTERNATIVE RAMZ LINK DISCOVERY")
print("=" * 40)

# Discover using patterns
pattern_links = discover_ramz_links_by_pattern()

if pattern_links:
    print("\n📊 Pattern discovery results:")
    for bac_value in set(link['bac_value'] for link in pattern_links):
        bac_links = [link for link in pattern_links if link['bac_value'] == bac_value]
        bac_text = bac_links[0]['bac_text'] if bac_links else 'Unknown'
        print(f"   {bac_text}: {len(bac_links)} links")

    # Tell the user what can be done next (informational only; no input is read)
    print(f"\n🎯 Found {len(pattern_links)} ramz links using patterns.")
    print("You can now:")
    print("1. Use these links as-is for testing")
    print("2. Expand the range to find more links (slower)")
    print("3. Use the manual discovery as backup for the form-based method")

    # Save the discovered links under a timestamped name so runs never overwrite each other
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    pattern_file = f"../data/pattern_discovered_ramz_{timestamp}.json"

    with open(pattern_file, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps the Arabic bac names readable in the file
        json.dump(pattern_links, f, ensure_ascii=False, indent=2)

    print(f"📁 Pattern-discovered links saved to: {pattern_file}")

    # Store in variable for later use (the collection cell checks for it)
    manual_ramz_links = pattern_links

else:
    print("❌ Pattern discovery failed")
    manual_ramz_links = []

In [None]:
# Step 1: Collect All Ramz Links (with Fallback Methods)
#
# Three sources are tried in order, each only when the previous ones did not
# produce enough: (1) form submission per bac type, (2) pattern-based
# discovery, (3) the most recent saved ramz JSON file under ../data.

print("🚀 Starting comprehensive data collection...")
print("This cell will try multiple methods to collect ramz links")

# Method 1: Try form-based extraction
print("\n📋 Method 1: Form-based extraction")
bac_types = get_bac_types()

if not bac_types:
    print("❌ Failed to get baccalaureate types via forms")
    bac_types = []
else:
    print(f"✅ Found {len(bac_types)} baccalaureate types:")
    for i, bac in enumerate(bac_types, 1):
        print(f"   {i}. {bac['text']} (value: {bac['value']})")

# Collect ramz links for all bac types
all_ramz_links = []

if bac_types:
    print("\n🔗 Collecting ramz links via form submission...")

    for bac in tqdm(bac_types, desc="Processing bac types"):
        ramz_links = get_ramz_links_for_bac(bac['value'], bac['text'])
        if ramz_links:
            all_ramz_links.extend(ramz_links)
            print(f"   ✅ {bac['text']}: {len(ramz_links)} links")
        else:
            print(f"   ❌ {bac['text']}: No links found")
        time.sleep(2)  # Rate limiting

print(f"\n📊 Form-based method results: {len(all_ramz_links)} ramz links")

# Method 2: Use manual discovery if form method failed or found few links
if len(all_ramz_links) < 100:  # Threshold for "successful" extraction
    print("\n🔧 Method 2: Pattern-based discovery (fallback)")
    print("Form-based method didn't find enough links. Using alternative method...")

    if 'manual_ramz_links' in locals() and manual_ramz_links:
        # Reuse the result of the pattern-discovery cell if it was run earlier
        manual_links = manual_ramz_links
        adopted_message = f"✅ Using previously discovered {len(manual_links)} links from pattern method"
    else:
        print("Running pattern discovery now...")
        manual_links = discover_ramz_links_by_pattern() or []
        adopted_message = f"✅ Pattern method found {len(manual_links)} links"

    if not manual_links:
        print("❌ Pattern method also failed")
    elif len(manual_links) > len(all_ramz_links):
        # Only switch to the fallback when it actually improves on what the
        # form-based method found; otherwise good links would be thrown away.
        all_ramz_links = manual_links
        print(adopted_message)
    else:
        print(f"ℹ️ Keeping {len(all_ramz_links)} form-based links "
              f"(pattern method found only {len(manual_links)})")

# Method 3: Load from existing file if available
if len(all_ramz_links) == 0:
    print("\n📁 Method 3: Load from existing file")

    # Look for existing ramz link files
    import glob
    existing_files = glob.glob('../data/*ramz*.json')

    if existing_files:
        # Most recent file according to os.path.getctime
        latest_file = max(existing_files, key=os.path.getctime)
        print(f"Found existing file: {latest_file}")

        try:
            with open(latest_file, 'r', encoding='utf-8') as f:
                file_links = json.load(f)

            if isinstance(file_links, list) and len(file_links) > 0:
                all_ramz_links = file_links
                print(f"✅ Loaded {len(file_links)} links from {latest_file}")
            else:
                print("❌ File doesn't contain valid ramz links")

        except Exception as e:
            print(f"❌ Error loading file: {e}")
    else:
        print("❌ No existing ramz link files found")

# Final results
print(f"\n🎯 FINAL RESULTS")
print("=" * 30)

if all_ramz_links:
    print(f"✅ Total ramz links collected: {len(all_ramz_links)}")

    # Analyze the collected links: count links per bac type
    bac_distribution = {}
    for link in all_ramz_links:
        bac_text = link.get('bac_text', 'Unknown')
        bac_distribution[bac_text] = bac_distribution.get(bac_text, 0) + 1

    print(f"\n📊 Distribution by baccalaureate type:")
    for bac_text, count in sorted(bac_distribution.items(), key=lambda x: x[1], reverse=True):
        print(f"   {bac_text}: {count} specializations")

    # Save ramz links under a timestamped name
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    ramz_links_file = f"../data/collected_ramz_links_{timestamp}.json"

    with open(ramz_links_file, 'w', encoding='utf-8') as f:
        json.dump(all_ramz_links, f, ensure_ascii=False, indent=2)

    print(f"\n📁 Ramz links saved to: {ramz_links_file}")

    # Show sample ramz links
    print(f"\n📋 Sample ramz links:")
    for i, link in enumerate(all_ramz_links[:5], 1):
        print(f"   {i}. {link['ramz_code']} ({link.get('bac_text', 'Unknown')}) -> {link['url']}")

    if len(all_ramz_links) > 5:
        print(f"   ... and {len(all_ramz_links) - 5} more")

    print(f"\n✅ Ready to proceed with scraping {len(all_ramz_links)} specializations!")

else:
    print("❌ Failed to collect any ramz links using all methods")
    print("\nTroubleshooting suggestions:")
    print("1. Check your internet connection")
    print("2. Verify the website is accessible")
    print("3. Run the website structure analysis cell")
    print("4. Check if the website structure has changed")

In [None]:
# Step 2: Scrape All Ramz Details in Parallel

async def scrape_all_ramz_parallel(ramz_links, max_concurrent=20):
    """Fetch the HTML of every ramz page concurrently.

    Links are processed in batches of 100. Inside a batch at most
    ``max_concurrent`` requests run at the same time, and each request
    sleeps ``DELAY`` seconds before it is sent (rate limiting).

    Args:
        ramz_links: List of ramz link dicts. Each dict is handed unchanged to
            ``fetch_html_content``; its ``'ramz_code'`` entry is used to
            label failures.
        max_concurrent: Maximum number of requests in flight at once.

    Returns:
        A ``(results, failed)`` tuple. ``results`` holds every value returned
        by ``fetch_html_content`` whose second element is not ``None``
        (presumably ``(ramz_info, html)`` pairs - confirm against that
        helper). ``failed`` is a list of strings: the bare ramz code when no
        HTML came back, or ``"<ramz_code>: <exception>"`` when the request
        raised.
    """
    total = len(ramz_links)
    print(f"⚡ Starting parallel scraping of {total} ramz pages...")

    # Create semaphore to limit concurrent requests
    semaphore = asyncio.Semaphore(max_concurrent)

    async def fetch_with_semaphore(session, ramz_info):
        async with semaphore:
            await asyncio.sleep(DELAY)  # Rate limiting
            return await fetch_html_content(session, ramz_info)

    results = []
    failed = []
    batch_size = 100  # Process in batches to avoid overwhelming the server

    timeout = aiohttp.ClientTimeout(total=30)
    async with aiohttp.ClientSession(timeout=timeout) as async_session:
        for start in range(0, total, batch_size):
            batch_links = ramz_links[start:start + batch_size]

            # Coroutines are created per batch so that an interruption does
            # not leave coroutines of later batches un-awaited.
            batch_results = await asyncio.gather(
                *(fetch_with_semaphore(async_session, info) for info in batch_links),
                return_exceptions=True,
            )

            # gather() keeps input order, so each result can be paired with
            # the link that produced it.
            for ramz_info, result in zip(batch_links, batch_results):
                if isinstance(result, BaseException):
                    # BaseException also covers asyncio.CancelledError, which
                    # gather() returns as a result when return_exceptions=True.
                    failed.append(f"{ramz_info.get('ramz_code', 'unknown')}: {result!r}")
                elif result[1] is not None:  # HTML content received
                    results.append(result)
                else:
                    failed.append(result[0]['ramz_code'])

            print(f"Progress: {min(start + batch_size, total)}/{total} "
                  f"({len(results)} successful, {len(failed)} failed)")

    print(f"\n✅ Scraping completed: {len(results)} successful, {len(failed)} failed")
    return results, failed

# Only proceed if we have ramz links
if 'all_ramz_links' in locals() and all_ramz_links:
    # For testing, limit to first 50 ramz links
    test_links = all_ramz_links[:50]  # Remove this line for full scraping
    
    print(f"🧪 Testing with {len(test_links)} ramz links (modify to scrape all {len(all_ramz_links)})...")
    
    # Run the async scraping
    scraped_results, failed_ramz = await scrape_all_ramz_parallel(test_links, max_concurrent=15)
    
    print(f"\\n📊 Results summary:")
    print(f"   - Successfully scraped: {len(scraped_results)}")
    print(f"   - Failed: {len(failed_ramz)}")
    
    # Save raw HTML results
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    html_results_file = f"../data/raw_html_results_{timestamp}.json"
    
    # Prepare data for saving (ramz_info + HTML)
    html_data = []
    for ramz_info, html_content in scraped_results:
        html_data.append({
            'ramz_info': ramz_info,
            'html_content': html_content,
            'scraped_at': datetime.now().isoformat()
        })
    
    with open(html_results_file, 'w', encoding='utf-8') as f:
        json.dump(html_data, f, ensure_ascii=False, indent=2)
    
    print(f"📁 Raw HTML results saved to: {html_results_file}")
    
else:
    print("❌ No ramz links available. Run the previous cell first.")

In [None]:
# Step 3: Parse All Scraped HTML Data

def parse_all_scraped_data(scraped_results):
    """Parse all scraped HTML pages into structured records.

    Args:
        scraped_results: Iterable of ``(ramz_info, html_content)`` pairs.
            ``ramz_info`` must provide a ``'ramz_code'`` entry, which is used
            to label errors.

    Returns:
        A ``(parsed_data, parsing_errors)`` tuple. ``parsed_data`` collects
        every truthy value returned by ``parse_ramz_detail_page``.
        ``parsing_errors`` is a list of strings: the bare ramz code when the
        parser returned nothing, ``"<code>: <message>"`` when it raised, and
        ``"<code>: No HTML content"`` when there was no HTML to parse.
    """
    print(f"🔍 Parsing {len(scraped_results)} scraped HTML pages...")

    parsed_data = []
    parsing_errors = []

    for ramz_info, html_content in tqdm(scraped_results, desc="Parsing HTML"):
        if not html_content:
            parsing_errors.append(f"{ramz_info['ramz_code']}: No HTML content")
            continue

        # One malformed page must not abort the whole run, so any parser
        # failure is recorded and the loop moves on.
        try:
            detail = parse_ramz_detail_page(ramz_info, html_content)
        except Exception as e:
            parsing_errors.append(f"{ramz_info['ramz_code']}: {str(e)}")
            continue

        if detail:
            parsed_data.append(detail)
        else:
            parsing_errors.append(ramz_info['ramz_code'])

    print(f"\n✅ Parsing completed:")
    print(f"   - Successfully parsed: {len(parsed_data)}")
    print(f"   - Parsing errors: {len(parsing_errors)}")

    return parsed_data, parsing_errors

# Parse the scraped data (the name may be missing when the scraping cell
# has not been executed in this kernel)
if 'scraped_results' in locals() and scraped_results:
    parsed_specializations, parse_errors = parse_all_scraped_data(scraped_results)

    if parsed_specializations:
        print(f"\n📊 Sample parsed data for ramz {parsed_specializations[0].ramz_code}:")
        sample = parsed_specializations[0]

        sample_fields = [
            ('University', sample.university),
            ('Governorate', sample.governorate),
            ('Institution', sample.institution),
            ('Address', sample.address),
            ('Phone', sample.phone),
            ('Specialization', sample.specialization),
            ('Training Field', sample.training_field),
            ('Measure', sample.measure),
            ('Bac Type', sample.bac_type),
            ('Capacity 2025', sample.capacity_2025),
            ('Requires Test', sample.requires_test),
            ('Geographic Distribution', sample.geographic_distribution),
            ('Conditions', sample.conditions),
            ('Study Duration', sample.study_duration),
            ('Last Score 2024', sample.last_oriented_score_2024),
            ('Score History', str(sample.score_history))
        ]

        for field, value in sample_fields:
            # Convert before slicing so a non-string value cannot raise
            text = str(value)
            display_value = text[:50] + "..." if len(text) > 50 else text
            print(f"   {field}: {display_value}")

        # Calculate field completeness: count records with a non-blank value
        print(f"\n📈 Data completeness analysis:")
        field_counts = {}
        total_records = len(parsed_specializations)

        for spec in parsed_specializations:
            for field, value in asdict(spec).items():
                if value and str(value).strip():
                    field_counts[field] = field_counts.get(field, 0) + 1

        # Sort by completeness, most complete first
        sorted_fields = sorted(field_counts.items(), key=lambda x: x[1], reverse=True)

        for field, count in sorted_fields:
            percentage = (count / total_records) * 100
            print(f"   {field}: {count}/{total_records} ({percentage:.1f}%)")

    else:
        print("❌ No data was successfully parsed")

else:
    print("❌ No scraped results available. Run the scraping cell first.")

In [None]:
# Step 4: Save Parsed Data to CSV and JSON

def save_results_to_files(parsed_data, filename_prefix="tunisia_university_data"):
    """Save parsed records to timestamped JSON and CSV files in ``../data``.

    Args:
        parsed_data: Iterable of dataclass instances (converted with
            ``dataclasses.asdict``).
        filename_prefix: Leading part of both output file names.

    Returns:
        ``(df, json_filename, csv_filename)``. The JSON file is always
        written. The CSV file is only written when there is at least one
        record; otherwise ``df`` is ``None`` and ``csv_filename`` names a
        file that was not created.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Do not rely on another cell having created the output directory
    output_dir = "../data"
    os.makedirs(output_dir, exist_ok=True)

    # Convert to list of dictionaries
    data_dicts = [asdict(spec) for spec in parsed_data]

    # Save as JSON
    json_filename = f"{output_dir}/{filename_prefix}_{timestamp}.json"
    with open(json_filename, 'w', encoding='utf-8') as f:
        json.dump(data_dicts, f, ensure_ascii=False, indent=2)

    csv_filename = f"{output_dir}/{filename_prefix}_{timestamp}.csv"
    if not data_dicts:
        return None, json_filename, csv_filename

    df = pd.DataFrame(data_dicts)

    # A dict cell has no sensible CSV form, so store it as a JSON string.
    # Guarded so records without this field do not raise KeyError.
    if 'score_history' in df.columns:
        df['score_history'] = df['score_history'].apply(
            lambda x: json.dumps(x, ensure_ascii=False)
        )

    # Save to CSV
    df.to_csv(csv_filename, index=False, encoding='utf-8')

    # Display DataFrame info
    print(f"📊 DataFrame shape: {df.shape}")
    print(f"📁 Files saved:")
    print(f"   - JSON: {json_filename}")
    print(f"   - CSV: {csv_filename}")

    return df, json_filename, csv_filename

# Save the results (the name may be missing when the parsing cell has not
# been executed in this kernel)
if 'parsed_specializations' in locals() and parsed_specializations:
    print(f"💾 Saving {len(parsed_specializations)} parsed specializations...")

    df_results, json_file, csv_file = save_results_to_files(parsed_specializations)

    if df_results is not None:
        print(f"\n✅ Data successfully saved!")

        # Display basic statistics
        print(f"\n📊 Dataset Overview:")
        print(f"   - Total records: {len(df_results)}")
        print(f"   - Total columns: {len(df_results.columns)}")
        print(f"   - Memory usage: {df_results.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

        # Show column data types
        print(f"\n📋 Column Information:")
        for col in df_results.columns:
            non_null_count = df_results[col].notna().sum()
            data_type = df_results[col].dtype
            print(f"   {col}: {non_null_count}/{len(df_results)} non-null, {data_type}")

        # Show universities distribution (blank names are skipped)
        if 'university' in df_results.columns:
            university_counts = df_results['university'].value_counts().head(10)
            print(f"\n🏛️ Top 10 Universities by number of specializations:")
            for univ, count in university_counts.items():
                if univ and univ.strip():
                    print(f"   {univ}: {count} specializations")

        # Show bac types distribution (blank names are skipped)
        if 'bac_type' in df_results.columns:
            bac_counts = df_results['bac_type'].value_counts()
            print(f"\n🎓 Baccalaureate Types Distribution:")
            for bac, count in bac_counts.items():
                if bac and bac.strip():
                    print(f"   {bac}: {count} specializations")

    else:
        print("❌ Failed to save data")

else:
    print("❌ No parsed data available. Run the parsing cell first.")

In [None]:
# Step 5: Data Analysis and Visualization

import matplotlib.pyplot as plt
import seaborn as sns

# Set up plotting style: select matplotlib's 'default' style sheet and make
# "husl" the seaborn colour palette for the plots drawn below
plt.style.use('default')
sns.set_palette("husl")

def analyze_scraped_data(df):
    """Print summary statistics and draw a 2x2 overview figure for the data.

    The figure contains: per-field completeness (horizontal bars), the ten
    most frequent universities (pie), specializations per baccalaureate type
    (bars) and a histogram of the 2024 last-oriented scores.

    Args:
        df: DataFrame of scraped records. Must contain the ``university``,
            ``institution`` and ``bac_type`` columns; the score histogram is
            only drawn when ``last_oriented_score_2024`` is present.

    Returns:
        None. Output goes to stdout and to the matplotlib figure. When ``df``
        is ``None`` or empty a message is printed and nothing is plotted.
    """

    if df is None or df.empty:
        print("❌ No data available for analysis")
        return

    print("📊 COMPREHENSIVE DATA ANALYSIS")
    print("=" * 50)

    # 1. Basic Statistics
    print(f"\n📈 Basic Statistics:")
    print(f"   Total Specializations: {len(df)}")
    print(f"   Unique Universities: {df['university'].nunique()}")
    print(f"   Unique Institutions: {df['institution'].nunique()}")
    print(f"   Unique Bac Types: {df['bac_type'].nunique()}")

    # 2. Data completeness bar chart
    plt.figure(figsize=(12, 8))

    # A value counts as present when it is not null, not '' and not '-'
    completeness = {}
    for col in df.columns:
        if col not in ['score_history', 'extraction_timestamp']:
            non_empty = df[col].notna() & (df[col] != '') & (df[col] != '-')
            completeness[col] = non_empty.sum() / len(df) * 100

    # Create completeness visualization
    plt.subplot(2, 2, 1)
    fields = list(completeness.keys())
    values = list(completeness.values())

    plt.barh(fields, values, color=sns.color_palette("viridis", len(fields)))
    plt.xlabel('Completeness (%)')
    plt.title('Data Completeness by Field')
    plt.grid(axis='x', alpha=0.3)

    # 3. University Distribution
    plt.subplot(2, 2, 2)
    top_universities = df['university'].value_counts().head(10)
    if not top_universities.empty:
        plt.pie(top_universities.values, labels=top_universities.index, autopct='%1.1f%%')
        plt.title('Top 10 Universities Distribution')

    # 4. Bac Type Distribution
    plt.subplot(2, 2, 3)
    bac_counts = df['bac_type'].value_counts()
    if not bac_counts.empty:
        plt.bar(range(len(bac_counts)), bac_counts.values)
        plt.xticks(range(len(bac_counts)), bac_counts.index, rotation=45, ha='right')
        plt.title('Specializations by Baccalaureate Type')
        plt.ylabel('Count')

    # 5. Score Analysis (if available)
    plt.subplot(2, 2, 4)
    if 'last_oriented_score_2024' in df.columns:
        # Non-numeric entries become NaN and are dropped
        scores = pd.to_numeric(df['last_oriented_score_2024'], errors='coerce')
        scores = scores.dropna()

        if not scores.empty:
            plt.hist(scores, bins=20, alpha=0.7, color='skyblue', edgecolor='black')
            plt.xlabel('Last Oriented Score 2024')
            plt.ylabel('Frequency')
            plt.title('Distribution of Admission Scores 2024')
            plt.grid(alpha=0.3)
        else:
            plt.text(0.5, 0.5, 'No score data available', ha='center', va='center', transform=plt.gca().transAxes)

    plt.tight_layout()
    plt.show()

    # 6. Detailed Statistics, most complete field first
    print(f"\n📋 Detailed Field Analysis:")
    for field, completeness_pct in sorted(completeness.items(), key=lambda x: x[1], reverse=True):
        print(f"   {field}: {completeness_pct:.1f}% complete")

    # 7. Sample Records
    print(f"\n📄 Sample Records (first 3):")
    for i, (idx, row) in enumerate(df.head(3).iterrows()):
        print(f"\n   Record {i+1} (Ramz: {row.get('ramz_code', 'N/A')}):")
        for field in ['university', 'institution', 'specialization', 'bac_type', 'last_oriented_score_2024']:
            # Convert before slicing so a non-string cell cannot raise
            text = str(row.get(field, 'N/A'))
            display_value = text[:60] + "..." if len(text) > 60 else text
            print(f"      {field}: {display_value}")

# Run the analysis (the name may be missing when the saving cell has not
# been executed in this kernel)
if 'df_results' in locals() and df_results is not None:
    analyze_scraped_data(df_results)

    print(f"\n🎉 SCRAPING PROJECT COMPLETED!")
    print(f"=" * 50)
    print(f"✅ Successfully scraped and analyzed Tunisian university data")
    print(f"📊 Total records: {len(df_results)}")
    print(f"📁 Data saved to CSV and JSON formats")
    print(f"🔍 Analysis and visualizations generated")

    print(f"\n📝 Next Steps:")
    print(f"   1. Review the generated CSV file for data quality")
    print(f"   2. Perform additional analysis as needed")
    print(f"   3. Use the data for your research or application")

else:
    print("❌ No data available for analysis. Please run all previous cells first.")