In [31]:
# Step 1: Download CVs from CRM and update database
# NOTE: this banner prints as soon as the cell is executed. The cell only
# defines the helper functions below; it does not call any of them.
print("[CRM] Starting CRM fetch and download process...")
def get_session():
    """Create the HTTP session used for all CRM requests.

    The session authenticates through ``HttpNtlmAuth`` with the module-level
    ``USERNAME`` / ``PASSWORD`` and sends JSON / OData 4.0 headers.

    Returns:
        A configured ``requests.Session``.
    """
    odata_headers = {
        "Accept": "application/json",
        "OData-MaxVersion": "4.0",
        "OData-Version": "4.0",
    }
    crm_session = requests.Session()
    crm_session.headers.update(odata_headers)
    crm_session.auth = HttpNtlmAuth(USERNAME, PASSWORD)
    return crm_session

def get_last_extracted_date():
    """Return the lower bound to use for the next CRM fetch.

    Reads ``MAX(createdon)`` from ``pdf_extracted_data``. When that yields no
    value, or when anything fails (including opening the connection), the
    function falls back to three days before the current local time.

    Returns:
        The value read from the database, or a naive ``datetime`` three days
        in the past.
    """
    # NOTE(review): the CREATE TABLE statement in setup_database() does not
    # define a `createdon` column - confirm it exists, otherwise this query
    # always takes the fallback path.
    fallback = datetime.now() - timedelta(days=3)
    conn = None
    cursor = None
    try:
        # Connect inside the try so that an unreachable database also ends in
        # the fallback instead of an unhandled exception.
        conn = psycopg2.connect(**DB_CONFIG)
        cursor = conn.cursor()
        cursor.execute("SELECT MAX(createdon) FROM pdf_extracted_data")
        last_date = cursor.fetchone()[0]
        if last_date:
            print(f"[CRM] Last extracted date from DB: {last_date}")
            return last_date
        print("[CRM] No previous extraction found, defaulting to 3 days ago.")
        return fallback
    except Exception as e:
        print(f"[CRM][ERROR] Error fetching last extracted date: {e}")
        return fallback
    finally:
        # Either handle may still be None if connecting failed.
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()

def get_job_applications(session):
    """Fetch job applications created since the last extraction.

    The time window runs from ``get_last_extracted_date()`` (truncated to
    midnight) up to the end of the current day. For every application found,
    its annotations are fetched with a second request and stored under the
    ``"annotations"`` key; a failed annotation request leaves an empty list.

    Args:
        session: Object offering ``get(url)`` like a ``requests.Session``.

    Returns:
        List of application dicts; empty when the listing request fails or
        the response has no ``"value"`` entry.
    """
    collected = []
    window_start = get_last_extracted_date()
    window_end = datetime.now()
    print(f"[CRM] Fetching applications from {window_start} to {window_end.date()}")

    lower_bound = window_start.strftime('%Y-%m-%dT00:00:00Z')
    upper_bound = window_end.strftime('%Y-%m-%dT23:59:59Z')
    odata_filter = f"createdon ge {lower_bound} and createdon le {upper_bound}"
    list_url = (
        f"{CRM_URL}/new_jobapplications"
        "?$select=new_jobapplicationid,new_name,new_email,new_jauid,createdon"
        f"&$filter={quote(odata_filter)}"
        "&$top=5000"
    )

    try:
        listing = session.get(list_url)
        listing.raise_for_status()
        payload = listing.json()
    except requests.exceptions.RequestException as e:
        print(f"[CRM][ERROR] Error fetching applications: {str(e)}")
        return collected

    if "value" not in payload:
        return collected

    found = payload["value"]
    print(f"[CRM] Found {len(found)} applications since last extraction")
    for application in found:
        notes_url = (
            f"{CRM_URL}/annotations"
            f"?$filter=_objectid_value eq {application['new_jobapplicationid']}"
            "&$select=filename,mimetype,documentbody"
        )
        try:
            notes_response = session.get(notes_url)
            notes_response.raise_for_status()
            notes_payload = notes_response.json()
            notes = notes_payload["value"] if "value" in notes_payload else []
        except requests.exceptions.RequestException as e:
            print(f"[CRM][ERROR] Error fetching annotations for application {application['new_jobapplicationid']}: {str(e)}")
            notes = []
        application["annotations"] = notes

    collected.extend(found)
    print(f"[CRM] Total applications fetched: {len(collected)}")
    return collected

def download_attachments(applications):
    """Write the CV attachments of each application into ``DOWNLOAD_DIR``.

    An annotation is saved when it has a non-empty ``filename`` ending in
    ``.pdf``, ``.doc`` or ``.docx`` (case-insensitive) and a non-empty base64
    ``documentbody``. The file is stored as ``<created date>_<file name>``;
    a file that already exists is counted as skipped and left untouched.

    Args:
        applications: Iterable of application dicts, each optionally carrying
            ``createdon`` (ISO-like string) and ``annotations`` (list of dicts).

    Returns:
        Tuple ``(downloaded, skipped)`` with the number of files written and
        the number of files that were already present.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)
    downloaded = 0
    skipped = 0
    for app in applications:
        try:
            created_date = app.get("createdon", "").split("T")[0]
            for annotation in app.get("annotations", []):
                raw_name = annotation.get("filename")
                body = annotation.get("documentbody")
                # A null/empty name or body (e.g. a note without attachment)
                # is skipped; previously it raised and the remaining
                # annotations of this application were lost.
                if not raw_name or not body:
                    continue
                # Attachment names come from outside: keep only the final
                # path component so a name cannot leave DOWNLOAD_DIR.
                filename = os.path.basename(str(raw_name).replace("\\", "/"))
                if not filename.lower().endswith((".pdf", ".doc", ".docx")):
                    continue
                try:
                    unique_filename = f"{created_date}_{filename}"
                    file_path = os.path.join(DOWNLOAD_DIR, unique_filename)
                    if os.path.exists(file_path):
                        print(f"[CRM] Skipping existing file: {unique_filename}")
                        skipped += 1
                        continue
                    file_content = base64.b64decode(body)
                    with open(file_path, "wb") as f:
                        f.write(file_content)
                    print(f"[CRM] Downloaded: {unique_filename}")
                    downloaded += 1
                except Exception as e:
                    # Best effort: one bad attachment must not stop the rest.
                    print(f"[CRM][ERROR] Error saving file {filename}: {e}")
        except Exception as e:
            print(f"[CRM][ERROR] Error processing application {app.get('new_jobapplicationid')}: {e}")
    print(f"[CRM] Downloaded {downloaded} new files, skipped {skipped} existing files.")
    return downloaded, skipped

def update_db_with_extra_columns(applications):
    """Copy ``new_name``, ``new_email`` and ``new_jauid`` onto matching DB rows.

    For each application that has a ``new_jobapplicationid``, the rows of
    ``pdf_extracted_data`` whose ``crm_applicationid`` equals that id get
    their ``crm_name2`` / ``crm_email2`` / ``crm_jauid2`` columns set. All
    updates are committed together; any error rolls the whole batch back.

    Args:
        applications: Iterable of application dicts.
    """
    update_sql = """
                UPDATE pdf_extracted_data SET
                    crm_name2 = %s,
                    crm_email2 = %s,
                    crm_jauid2 = %s
                WHERE crm_applicationid = %s
            """
    connection = psycopg2.connect(**DB_CONFIG)
    db_cursor = connection.cursor()
    rows_touched = 0
    try:
        for record in applications:
            record_id = record.get("new_jobapplicationid")
            if not record_id:
                continue
            params = (
                record.get("new_name"),
                record.get("new_email"),
                record.get("new_jauid"),
                record_id,
            )
            db_cursor.execute(update_sql, params)
            # A non-positive rowcount contributes nothing to the total.
            rows_touched += max(db_cursor.rowcount, 0)
        connection.commit()
        print(f"[CRM] Updated {rows_touched} rows with extra CRM columns.")
    except Exception as e:
        print(f"[CRM][ERROR] Error updating database: {e}")
        connection.rollback()
    finally:
        db_cursor.close()
        connection.close()

# NOTE: printed when the cell is executed; none of the CRM functions defined
# above has been called at this point.
print("[CRM] Fetching and downloading from CRM complete.\n")

# Step 2: Extract OCR text from PDFs and store in database
# NOTE: likewise only a banner - the OCR helpers below are defined, not run.
print("[OCR] Starting OCR extraction for new PDFs...")
def setup_database():
    """Open a database connection and ensure ``pdf_extracted_data`` exists.

    Returns:
        ``(conn, cursor)`` on success; the caller is responsible for closing
        both. ``(None, None)`` when connecting or creating the table fails.
    """
    conn = None
    try:
        conn = psycopg2.connect(**DB_CONFIG)
        cursor = conn.cursor()
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS pdf_extracted_data (
                id SERIAL PRIMARY KEY,
                pdf_filename VARCHAR(255),
                ocr_result TEXT,
                name VARCHAR(255),
                email VARCHAR(255),
                phone VARCHAR(255),
                linkedin VARCHAR(255),
                graduation_year VARCHAR(255),
                university VARCHAR(255),
                skills TEXT,
                department VARCHAR(255),
                job_title VARCHAR(255),
                years_of_experience VARCHAR(50),
                current_company VARCHAR(255),
                location VARCHAR(255),
                languages TEXT,
                certifications TEXT,
                project_types TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """)
        conn.commit()
        return conn, cursor
    except Exception as e:
        print(f"[OCR][ERROR] Database connection error: {e}")
        # The failure may have happened after connecting (e.g. in CREATE
        # TABLE); close the connection so it is not leaked to the caller,
        # who only ever receives (None, None) here.
        if conn is not None:
            try:
                conn.close()
            except Exception as close_error:
                print(f"[OCR][ERROR] Could not close database connection: {close_error}")
        return None, None

def extract_text_from_pdf(pdf_path):
    """Convert a PDF to images and OCR each one.

    Pages whose OCR call fails are logged and left out; the text of the
    remaining pages is concatenated, one newline after each page.

    Args:
        pdf_path: Path of the PDF handed to ``convert_from_path``.

    Returns:
        The combined text with surrounding whitespace stripped, or ``None``
        when the PDF could not be processed at all.
    """
    try:
        pages = convert_from_path(pdf_path, poppler_path=POPPLER_PATH)
        page_texts = []
        for page in pages:
            try:
                page_texts.append(pytesseract.image_to_string(page) + "\n")
            except Exception as e:
                print(f"[OCR][ERROR] OCR error on page: {e}")
        return "".join(page_texts).strip()
    except Exception as e:
        print(f"[OCR][ERROR] Error processing PDF {pdf_path}: {e}")
        return None

def process_pdfs_in_directory(directory_path):
    """OCR every not-yet-stored PDF in ``directory_path`` and save the text.

    File names already present in ``pdf_extracted_data`` with a non-NULL
    ``ocr_result`` are skipped. Each new file is passed to
    ``extract_text_from_pdf`` and inserted in its own transaction, so one
    failing file does not undo the others. A summary is printed at the end.

    Args:
        directory_path: Directory to scan; created when it does not exist.
    """
    # Create the directory before opening the connection so that a failure
    # here cannot leave an open connection behind.
    os.makedirs(directory_path, exist_ok=True)
    conn, cursor = setup_database()
    if not conn or not cursor:
        print("[OCR][ERROR] Failed to setup database connection. Exiting.")
        return
    try:
        conn.autocommit = False
        cursor.execute("SELECT pdf_filename FROM pdf_extracted_data WHERE ocr_result IS NOT NULL")
        processed_files = {row[0] for row in cursor.fetchall()}
        print(f"[OCR] Found {len(processed_files)} already processed files in database")
        new_files = 0
        failed_files = 0
        for filename in os.listdir(directory_path):
            # Case-insensitive: download_attachments() accepts names such as
            # "cv.PDF", which a plain endswith(".pdf") would never pick up.
            if not filename.lower().endswith(".pdf"):
                continue
            if filename in processed_files:
                print(f"[OCR] Skipping already processed file: {filename}")
                continue
            file_path = os.path.join(directory_path, filename)
            print(f"[OCR] Processing new file: {filename}")
            new_files += 1
            ocr_text = extract_text_from_pdf(file_path)
            if not ocr_text:
                print(f"[OCR][ERROR] Failed to extract text from {filename}")
                failed_files += 1
                continue
            try:
                # Re-check right before inserting: OCR is slow and another
                # run may have stored this file in the meantime.
                cursor.execute("SELECT id FROM pdf_extracted_data WHERE pdf_filename = %s", (filename,))
                if cursor.fetchone():
                    print(f"[OCR] File was processed by another process while we were working: {filename}")
                    continue
                cursor.execute(
                    """
                    INSERT INTO pdf_extracted_data
                    (pdf_filename, ocr_result)
                    VALUES (%s, %s)
                    """,
                    (filename, ocr_text)
                )
                conn.commit()
                print(f"[OCR] ✓ Successfully saved OCR text for: {filename}")
            except Exception as e:
                print(f"[OCR][ERROR] Database insertion error for {filename}: {e}")
                conn.rollback()
                failed_files += 1
                continue
        print(f"[OCR] OCR Processing completed: {new_files - failed_files} files processed, {failed_files} failed, {new_files} total attempted.")
    except Exception as e:
        print(f"[OCR][ERROR] Error during processing: {e}")
        conn.rollback()
    finally:
        # autocommit is deliberately not switched back on: psycopg2 refuses
        # to change it while a transaction is open (any SELECT above opens
        # one), and that error used to skip both close() calls. The
        # connection is discarded right here anyway.
        cursor.close()
        conn.close()
# NOTE: printed when the cell is executed, not after
# process_pdfs_in_directory() has run - this cell never calls it.
print("[OCR] OCR extraction complete.\n") 

[CRM] Starting CRM fetch and download process...
[CRM] Fetching and downloading from CRM complete.

[OCR] Starting OCR extraction for new PDFs...
[OCR] OCR extraction complete.



In [32]:
# Step 1: Download CVs from CRM and update database (last 3 days, with detailed logging)
# NOTE: this banner prints as soon as the cell is executed. The cell only
# (re)defines the helper functions below; it does not call any of them.
print("[CRM] Starting CRM fetch and download process...")
from datetime import datetime, timedelta

def get_session():
    """Create the HTTP session used for all CRM requests.

    The session authenticates through ``HttpNtlmAuth`` with the module-level
    ``USERNAME`` / ``PASSWORD`` and sends JSON / OData 4.0 headers.

    Returns:
        A configured ``requests.Session``.
    """
    odata_headers = {
        "Accept": "application/json",
        "OData-MaxVersion": "4.0",
        "OData-Version": "4.0",
    }
    crm_session = requests.Session()
    crm_session.headers.update(odata_headers)
    crm_session.auth = HttpNtlmAuth(USERNAME, PASSWORD)
    return crm_session

# Always fetch the last 3 days (not using MAX(createdon))
def get_job_applications(session):
    """Fetch the job applications created during the last three days.

    The window runs from midnight three days ago to the end of the current
    day. Each application and each annotation file name is logged. The
    annotations of an application are stored under its ``"annotations"``
    key; a failed annotation request leaves an empty list.

    Args:
        session: Object offering ``get(url)`` like a ``requests.Session``.

    Returns:
        List of application dicts; empty when the listing request fails or
        the response has no ``"value"`` entry.
    """
    collected = []
    window_end = datetime.now()
    window_start = window_end - timedelta(days=3)
    print(f"[CRM] Fetching applications from {window_start} to {window_end.date()}")

    lower_bound = window_start.strftime('%Y-%m-%dT00:00:00Z')
    upper_bound = window_end.strftime('%Y-%m-%dT23:59:59Z')
    odata_filter = f"createdon ge {lower_bound} and createdon le {upper_bound}"
    list_url = (
        f"{CRM_URL}/new_jobapplications"
        "?$select=new_jobapplicationid,new_name,new_email,new_jauid,createdon"
        f"&$filter={quote(odata_filter)}"
        "&$top=5000"
    )

    try:
        listing = session.get(list_url)
        listing.raise_for_status()
        payload = listing.json()
    except requests.exceptions.RequestException as e:
        print(f"[CRM][ERROR] Error fetching applications: {str(e)}")
        return collected

    if "value" not in payload:
        return collected

    found = payload["value"]
    print(f"[CRM] Found {len(found)} applications in the last 3 days")
    for application in found:
        print(f"  [CRM] AppID: {application.get('new_jobapplicationid')} | Name: {application.get('new_name')} | Created: {application.get('createdon')}")
        notes_url = (
            f"{CRM_URL}/annotations"
            f"?$filter=_objectid_value eq {application['new_jobapplicationid']}"
            "&$select=filename,mimetype,documentbody"
        )
        try:
            notes_response = session.get(notes_url)
            notes_response.raise_for_status()
            notes_payload = notes_response.json()
            if "value" not in notes_payload:
                application["annotations"] = []
            else:
                application["annotations"] = notes_payload["value"]
                for note in notes_payload["value"]:
                    print(f"    [CRM] Annotation filename: {note.get('filename')}")
        except requests.exceptions.RequestException as e:
            print(f"[CRM][ERROR] Error fetching annotations for application {application['new_jobapplicationid']}: {str(e)}")
            application["annotations"] = []

    collected.extend(found)
    print(f"[CRM] Total applications fetched: {len(collected)}")
    return collected

def get_last_extracted_date():
    """Return the most recent ``createdon`` stored in the database.

    Reads ``MAX(createdon)`` from ``pdf_extracted_data``. When that yields no
    value, or when anything fails (including opening the connection), the
    function falls back to three days before the current local time.

    Returns:
        The value read from the database, or a naive ``datetime`` three days
        in the past.
    """
    # NOTE(review): the CREATE TABLE statement in setup_database() does not
    # define a `createdon` column - confirm it exists, otherwise this query
    # always takes the fallback path.
    fallback = datetime.now() - timedelta(days=3)
    conn = None
    cursor = None
    try:
        # Connect inside the try so that an unreachable database also ends in
        # the fallback instead of an unhandled exception.
        conn = psycopg2.connect(**DB_CONFIG)
        cursor = conn.cursor()
        cursor.execute("SELECT MAX(createdon) FROM pdf_extracted_data")
        last_date = cursor.fetchone()[0]
        if last_date:
            print(f"[CRM] Last extracted date from DB: {last_date}")
            return last_date
        print("[CRM] No previous extraction found, defaulting to 3 days ago.")
        return fallback
    except Exception as e:
        print(f"[CRM][ERROR] Error fetching last extracted date: {e}")
        return fallback
    finally:
        # Either handle may still be None if connecting failed.
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()

def download_attachments(applications):
    """Write the CV attachments of each application into ``DOWNLOAD_DIR``.

    An annotation is saved when it has a non-empty ``filename`` ending in
    ``.pdf``, ``.doc`` or ``.docx`` (case-insensitive) and a non-empty base64
    ``documentbody``. The file is stored as ``<created date>_<file name>``;
    a file that already exists is counted as skipped and left untouched.

    Args:
        applications: Iterable of application dicts, each optionally carrying
            ``createdon`` (ISO-like string) and ``annotations`` (list of dicts).

    Returns:
        Tuple ``(downloaded, skipped)`` with the number of files written and
        the number of files that were already present.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)
    downloaded = 0
    skipped = 0
    for app in applications:
        try:
            created_date = app.get("createdon", "").split("T")[0]
            for annotation in app.get("annotations", []):
                raw_name = annotation.get("filename")
                body = annotation.get("documentbody")
                # A null/empty name or body (e.g. a note without attachment)
                # is skipped; previously it raised and the remaining
                # annotations of this application were lost.
                if not raw_name or not body:
                    continue
                # Attachment names come from outside: keep only the final
                # path component so a name cannot leave DOWNLOAD_DIR.
                filename = os.path.basename(str(raw_name).replace("\\", "/"))
                if not filename.lower().endswith((".pdf", ".doc", ".docx")):
                    continue
                try:
                    unique_filename = f"{created_date}_{filename}"
                    file_path = os.path.join(DOWNLOAD_DIR, unique_filename)
                    if os.path.exists(file_path):
                        print(f"[CRM] Skipping existing file: {unique_filename}")
                        skipped += 1
                        continue
                    file_content = base64.b64decode(body)
                    with open(file_path, "wb") as f:
                        f.write(file_content)
                    print(f"[CRM] Downloaded: {unique_filename}")
                    downloaded += 1
                except Exception as e:
                    # Best effort: one bad attachment must not stop the rest.
                    print(f"[CRM][ERROR] Error saving file {filename}: {e}")
        except Exception as e:
            print(f"[CRM][ERROR] Error processing application {app.get('new_jobapplicationid')}: {e}")
    print(f"[CRM] Downloaded {downloaded} new files, skipped {skipped} existing files.")
    return downloaded, skipped

def update_db_with_extra_columns(applications):
    """Copy ``new_name``, ``new_email`` and ``new_jauid`` onto matching DB rows.

    For each application that has a ``new_jobapplicationid``, the rows of
    ``pdf_extracted_data`` whose ``crm_applicationid`` equals that id get
    their ``crm_name2`` / ``crm_email2`` / ``crm_jauid2`` columns set. All
    updates are committed together; any error rolls the whole batch back.

    Args:
        applications: Iterable of application dicts.
    """
    update_sql = """
                UPDATE pdf_extracted_data SET
                    crm_name2 = %s,
                    crm_email2 = %s,
                    crm_jauid2 = %s
                WHERE crm_applicationid = %s
            """
    connection = psycopg2.connect(**DB_CONFIG)
    db_cursor = connection.cursor()
    rows_touched = 0
    try:
        for record in applications:
            record_id = record.get("new_jobapplicationid")
            if not record_id:
                continue
            params = (
                record.get("new_name"),
                record.get("new_email"),
                record.get("new_jauid"),
                record_id,
            )
            db_cursor.execute(update_sql, params)
            # A non-positive rowcount contributes nothing to the total.
            rows_touched += max(db_cursor.rowcount, 0)
        connection.commit()
        print(f"[CRM] Updated {rows_touched} rows with extra CRM columns.")
    except Exception as e:
        print(f"[CRM][ERROR] Error updating database: {e}")
        connection.rollback()
    finally:
        db_cursor.close()
        connection.close()

# NOTE: printed when the cell is executed; none of the CRM functions defined
# above has been called at this point.
print("[CRM] Fetching and downloading from CRM complete.\n")

# Step 2: Extract OCR text from PDFs and store in database
# NOTE: likewise only a banner - the OCR helpers below are defined, not run.
print("[OCR] Starting OCR extraction for new PDFs...")
def setup_database():
    """Open a database connection and ensure ``pdf_extracted_data`` exists.

    Returns:
        ``(conn, cursor)`` on success; the caller is responsible for closing
        both. ``(None, None)`` when connecting or creating the table fails.
    """
    conn = None
    try:
        conn = psycopg2.connect(**DB_CONFIG)
        cursor = conn.cursor()
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS pdf_extracted_data (
                id SERIAL PRIMARY KEY,
                pdf_filename VARCHAR(255),
                ocr_result TEXT,
                name VARCHAR(255),
                email VARCHAR(255),
                phone VARCHAR(255),
                linkedin VARCHAR(255),
                graduation_year VARCHAR(255),
                university VARCHAR(255),
                skills TEXT,
                department VARCHAR(255),
                job_title VARCHAR(255),
                years_of_experience VARCHAR(50),
                current_company VARCHAR(255),
                location VARCHAR(255),
                languages TEXT,
                certifications TEXT,
                project_types TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """)
        conn.commit()
        return conn, cursor
    except Exception as e:
        print(f"[OCR][ERROR] Database connection error: {e}")
        # The failure may have happened after connecting (e.g. in CREATE
        # TABLE); close the connection so it is not leaked to the caller,
        # who only ever receives (None, None) here.
        if conn is not None:
            try:
                conn.close()
            except Exception as close_error:
                print(f"[OCR][ERROR] Could not close database connection: {close_error}")
        return None, None

def extract_text_from_pdf(pdf_path):
    """Convert a PDF to images and OCR each one.

    Pages whose OCR call fails are logged and left out; the text of the
    remaining pages is concatenated, one newline after each page.

    Args:
        pdf_path: Path of the PDF handed to ``convert_from_path``.

    Returns:
        The combined text with surrounding whitespace stripped, or ``None``
        when the PDF could not be processed at all.
    """
    try:
        pages = convert_from_path(pdf_path, poppler_path=POPPLER_PATH)
        page_texts = []
        for page in pages:
            try:
                page_texts.append(pytesseract.image_to_string(page) + "\n")
            except Exception as e:
                print(f"[OCR][ERROR] OCR error on page: {e}")
        return "".join(page_texts).strip()
    except Exception as e:
        print(f"[OCR][ERROR] Error processing PDF {pdf_path}: {e}")
        return None

def process_pdfs_in_directory(directory_path):
    """OCR every not-yet-stored PDF in ``directory_path`` and save the text.

    File names already present in ``pdf_extracted_data`` with a non-NULL
    ``ocr_result`` are skipped. Each new file is passed to
    ``extract_text_from_pdf`` and inserted in its own transaction, so one
    failing file does not undo the others. A summary is printed at the end.

    Args:
        directory_path: Directory to scan; created when it does not exist.
    """
    # Create the directory before opening the connection so that a failure
    # here cannot leave an open connection behind.
    os.makedirs(directory_path, exist_ok=True)
    conn, cursor = setup_database()
    if not conn or not cursor:
        print("[OCR][ERROR] Failed to setup database connection. Exiting.")
        return
    try:
        conn.autocommit = False
        cursor.execute("SELECT pdf_filename FROM pdf_extracted_data WHERE ocr_result IS NOT NULL")
        processed_files = {row[0] for row in cursor.fetchall()}
        print(f"[OCR] Found {len(processed_files)} already processed files in database")
        new_files = 0
        failed_files = 0
        for filename in os.listdir(directory_path):
            # Case-insensitive: download_attachments() accepts names such as
            # "cv.PDF", which a plain endswith(".pdf") would never pick up.
            if not filename.lower().endswith(".pdf"):
                continue
            if filename in processed_files:
                print(f"[OCR] Skipping already processed file: {filename}")
                continue
            file_path = os.path.join(directory_path, filename)
            print(f"[OCR] Processing new file: {filename}")
            new_files += 1
            ocr_text = extract_text_from_pdf(file_path)
            if not ocr_text:
                print(f"[OCR][ERROR] Failed to extract text from {filename}")
                failed_files += 1
                continue
            try:
                # Re-check right before inserting: OCR is slow and another
                # run may have stored this file in the meantime.
                cursor.execute("SELECT id FROM pdf_extracted_data WHERE pdf_filename = %s", (filename,))
                if cursor.fetchone():
                    print(f"[OCR] File was processed by another process while we were working: {filename}")
                    continue
                cursor.execute(
                    """
                    INSERT INTO pdf_extracted_data
                    (pdf_filename, ocr_result)
                    VALUES (%s, %s)
                    """,
                    (filename, ocr_text)
                )
                conn.commit()
                print(f"[OCR] ✓ Successfully saved OCR text for: {filename}")
            except Exception as e:
                print(f"[OCR][ERROR] Database insertion error for {filename}: {e}")
                conn.rollback()
                failed_files += 1
                continue
        print(f"[OCR] OCR Processing completed: {new_files - failed_files} files processed, {failed_files} failed, {new_files} total attempted.")
    except Exception as e:
        print(f"[OCR][ERROR] Error during processing: {e}")
        conn.rollback()
    finally:
        # autocommit is deliberately not switched back on: psycopg2 refuses
        # to change it while a transaction is open (any SELECT above opens
        # one), and that error used to skip both close() calls. The
        # connection is discarded right here anyway.
        cursor.close()
        conn.close()
# NOTE: printed when the cell is executed, not after
# process_pdfs_in_directory() has run - this cell never calls it.
print("[OCR] OCR extraction complete.\n") 

[CRM] Starting CRM fetch and download process...
[CRM] Fetching and downloading from CRM complete.

[OCR] Starting OCR extraction for new PDFs...
[OCR] OCR extraction complete.



In [33]:
# Step 3: Use GPT to extract structured info from OCR text
# NOTE: this banner prints as soon as the cell is executed. The cell only
# defines the two functions below; it does not call them.
print("[GPT] Starting GPT extraction for unprocessed records...")
def extract_info_with_gpt(ocr_text):
    """Ask the chat model to turn CV text into a dict of structured fields.

    Only the first 4000 characters of ``ocr_text`` are sent. The reply is
    parsed as JSON; a reply wrapped in a Markdown code fence is unwrapped
    first so it no longer counts as a failure.

    Args:
        ocr_text: OCR text of one CV.

    Returns:
        The parsed JSON object as a ``dict``, or ``None`` when the request
        fails, the reply is not valid JSON, or the JSON is not an object.
    """
    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{
                "role": "system",
                "content": """Extract the following information from the CV text in JSON format:\n                    - name: Full name of the candidate\n                    - email: Email address\n                    - phone: Phone number\n                    - linkedin: LinkedIn profile URL if present\n                    - graduation_year: Year of graduation\n                    - university: University name\n                    - skills: List of technical and soft skills\n                    - department: Normalize to one of: [Engineering, IT/Software Development, Sales, Marketing, HR, Finance/Accounting, Operations, Legal, Administrative, Other]\n                    - job_title: Normalize to closest match of: [Software Engineer, Project Manager, Business Analyst, Sales Representative, Marketing Specialist, HR Manager, Financial Analyst, Operations Manager, Legal Counsel, Administrative Assistant]\n                    - years_of_experience: Total years of experience\n                    - current_company: Current or most recent company\n                    - location: City/Country\n                    - languages: List of languages known\n                    - certifications: List of certifications\n                    - project_types: Types of projects worked on"""
            }, {
                "role": "user",
                "content": ocr_text[:4000]
            }],
            temperature=0.1,
            max_tokens=1000
        )
        content = (response.choices[0].message.content or "").strip()
        if content.startswith("```"):
            # Drop the opening fence line (``` or ```json) and, when present,
            # everything from the closing fence onwards.
            content = content.partition("\n")[2]
            if "```" in content:
                content = content.rpartition("```")[0]
        result = json.loads(content)
        if not isinstance(result, dict):
            # The caller reads fields with .get(); anything but an object
            # is unusable.
            print(f"[GPT][ERROR] GPT extraction error: expected a JSON object, got {type(result).__name__}")
            return None
        return result
    except Exception as e:
        print(f"[GPT][ERROR] GPT extraction error: {e}")
        return None

def process_unprocessed_records():
    """Fill the structured columns of rows that so far only have OCR text.

    Selects the rows of ``pdf_extracted_data`` with a non-NULL ``ocr_result``
    whose ``name``, ``department`` or ``job_title`` is NULL, passes each OCR
    text to ``extract_info_with_gpt`` and writes the returned fields back,
    committing after every row. ``skills``, ``languages``, ``certifications``
    and ``project_types`` are stored as JSON text. Ends with a printed
    summary that includes a rough cost estimate.
    """
    pending_query = """
            SELECT id, pdf_filename, ocr_result 
            FROM pdf_extracted_data 
            WHERE ocr_result IS NOT NULL 
            AND (name IS NULL OR department IS NULL OR job_title IS NULL)
        """
    update_statement = """
                    UPDATE pdf_extracted_data 
                    SET name = %s,
                        email = %s,
                        phone = %s,
                        linkedin = %s,
                        graduation_year = %s,
                        university = %s,
                        skills = %s,
                        department = %s,
                        job_title = %s,
                        years_of_experience = %s,
                        current_company = %s,
                        location = %s,
                        languages = %s,
                        certifications = %s,
                        project_types = %s
                    WHERE id = %s
                """
    # (key in the GPT result, stored as JSON?) in placeholder order.
    field_layout = (
        ('name', False),
        ('email', False),
        ('phone', False),
        ('linkedin', False),
        ('graduation_year', False),
        ('university', False),
        ('skills', True),
        ('department', False),
        ('job_title', False),
        ('years_of_experience', False),
        ('current_company', False),
        ('location', False),
        ('languages', True),
        ('certifications', True),
        ('project_types', True),
    )

    conn, cursor = setup_database()
    if not (conn and cursor):
        print("[GPT][ERROR] Could not connect to database.")
        return
    try:
        cursor.execute(pending_query)
        pending = cursor.fetchall()
        print(f"[GPT] Found {len(pending)} records to process with GPT.")
        succeeded = 0
        errored = 0
        for row_id, pdf_name, text in pending:
            print(f"[GPT] Processing: {pdf_name}")
            info = extract_info_with_gpt(text)
            if not info:
                print(f"[GPT][ERROR] Failed to extract information for {pdf_name}")
                errored += 1
                continue
            try:
                params = tuple(
                    json.dumps(info.get(key, [])) if as_json else info.get(key, '')
                    for key, as_json in field_layout
                ) + (row_id,)
                cursor.execute(update_statement, params)
                conn.commit()
                succeeded += 1
                print(f"[GPT] ✓ Successfully processed {pdf_name}")
                print(f"      Department: {info.get('department', 'N/A')}")
                print(f"      Job Title: {info.get('job_title', 'N/A')}")
            except Exception as e:
                print(f"[GPT][ERROR] Database update error for {pdf_name}: {e}")
                conn.rollback()
                errored += 1
        print("[GPT] Processing completed.")
        print(f"[GPT] ✓ Successfully processed: {succeeded} records")
        print(f"[GPT] ❌ Failed to process: {errored} records")
        print(f"[GPT] Total cost estimate: ${(succeeded + errored) * 0.003:.2f}")
    except Exception as e:
        print(f"[GPT][ERROR] Error during processing: {e}")
        conn.rollback()
    finally:
        cursor.close()
        conn.close()
# NOTE: printed when the cell is executed, not after
# process_unprocessed_records() has run - this cell never calls it.
print("[GPT] GPT extraction complete.\n") 

[GPT] Starting GPT extraction for unprocessed records...
[GPT] GPT extraction complete.



In [34]:
# Step 4: Add extra CRM fields to the database (last 3 days only, by normalized filename, only fill NULL fields)
import os
import requests
from requests_ntlm import HttpNtlmAuth
from dotenv import load_dotenv
import psycopg2
from datetime import datetime, timedelta

# Load environment variables
load_dotenv()

# Connection settings. Every value can now be supplied through the
# environment (for example the .env file loaded above), which previously was
# loaded but never read.
# NOTE(review): the literal fallbacks only keep existing runs working. They
# leave credentials in the source; move them into .env, rotate the passwords
# and then delete the defaults.
CRM_URL = os.getenv("CRM_URL", "https://rmecrm.rowad-rme.com/RMECRM/api/data/v8.2")
USERNAME = os.getenv("CRM_USERNAME", "Rowad\\Omar Essam")
PASSWORD = os.getenv("CRM_PASSWORD", "PMO@1234")

DB_CONFIG = {
    "host": os.getenv("DB_HOST", "localhost"),
    "database": os.getenv("DB_NAME", "postgres"),
    "user": os.getenv("DB_USER", "postgres"),
    "password": os.getenv("DB_PASSWORD", "PMO@1234"),
}

# CRM attributes requested from the job-application entity, in the order in
# which they are selected and written to the database.
CRM_FIELDS = """
    new_jobapplicationid
    new_fullname
    new_contactphone
    new_telephonenumber
    new_jauid
    new_jobofferstatus
    new_gender
    new_position
    new_employmenttype
    new_expectedsalary
    new_dateavailableforemployment
    new_currentsalary
    new_company
    new_graduationyear
    new_qualitiesattributes
    new_careergoals
    new_additionalinformation
    new_appstatus
    new_hrinterviewstatus
    new_technicalrating
    new_technicalinterviewcomments
    new_hrcomment
    createdon
    modifiedon
    new_howdidyouhearaboutrowad
    new_listouttheextrasocialactivities
    new_pleasesepcify
""".split()

# CRM attribute -> database column: a leading "new_" is replaced by "crm_",
# any other name simply gets "crm_" put in front of it.
CRM_TO_DB = {
    name: "crm_" + (name[4:] if name.startswith("new_") else name)
    for name in CRM_FIELDS
}

# Add columns to DB if missing
def add_crm_columns():
    """Add every column named in ``CRM_TO_DB`` to ``pdf_extracted_data``.

    Columns that already exist are left alone; missing ones are created as
    ``TEXT``. All changes are committed together.
    """
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        # `with conn` only ends the transaction (commit on success, rollback
        # on error); it does not close the connection, hence the finally.
        with conn:
            with conn.cursor() as cur:
                # Column names come from the constant CRM_FIELDS list, not
                # from external input, so interpolating them is safe here.
                for db_col in CRM_TO_DB.values():
                    cur.execute(f"""
                        DO $$
                        BEGIN
                            IF NOT EXISTS (
                                SELECT 1 FROM information_schema.columns 
                                WHERE table_name = 'pdf_extracted_data' 
                                AND column_name = '{db_col}'
                            ) THEN
                                ALTER TABLE pdf_extracted_data ADD COLUMN {db_col} TEXT;
                            END IF;
                        END $$;
                    """)
    finally:
        conn.close()
    print("[CRM] Ensured all extra CRM columns exist in the database.")

# Fetch CRM data and annotation filenames (last 3 days only)
def fetch_crm_data():
    """Fetch the last three days of job applications with their file names.

    Requests every attribute in ``CRM_FIELDS`` for applications created
    between three days ago and today (UTC dates), then asks for the
    annotation file names of each application. One record is produced per
    annotation that has a file name.

    Returns:
        List of dicts. Each holds the application's attributes plus
        ``normalized_filename``: ``<created date>_<file name>`` with spaces
        removed and lower-cased.

    Raises:
        requests.HTTPError: when the listing or any annotation request
            returns an error status.
    """
    # Local import: the module header only imports datetime and timedelta.
    from datetime import timezone

    # The context manager releases the pooled connections when done.
    with requests.Session() as session:
        session.auth = HttpNtlmAuth(USERNAME, PASSWORD)
        session.headers.update({
            "Accept": "application/json",
            "OData-MaxVersion": "4.0",
            "OData-Version": "4.0"
        })
        # datetime.utcnow() is deprecated; this yields the same UTC date.
        today = datetime.now(timezone.utc).date()
        days_ago = today - timedelta(days=3)
        filter_condition = f"createdon ge {days_ago}T00:00:00Z and createdon le {today}T23:59:59Z"
        url = f"{CRM_URL}/new_jobapplications?$select={','.join(CRM_FIELDS)}&$filter={filter_condition}&$top=5000"
        print(f"[CRM] Fetching job applications from {days_ago} to {today} with fields: {', '.join(CRM_FIELDS)}")
        response = session.get(url)
        response.raise_for_status()
        applications = response.json().get("value", [])
        crm_data = []
        for app in applications:
            app_id = app.get("new_jobapplicationid")
            createdon = app.get("createdon")
            if not (app_id and createdon):
                continue
            created_date = str(createdon).split('T')[0]
            # Fetch annotations for this application
            annotations_url = f"{CRM_URL}/annotations?$filter=_objectid_value eq {app_id}&$select=filename"
            ann_response = session.get(annotations_url)
            ann_response.raise_for_status()
            annotations = ann_response.json().get('value', [])
            for annotation in annotations:
                filename = annotation.get('filename')
                if filename:
                    # Same "<date>_<name>" scheme as the downloaded files,
                    # normalised the way the UPDATE statement compares them.
                    constructed_filename = f"{created_date}_{filename}"
                    normalized = constructed_filename.replace(' ', '').lower()
                    crm_data.append({
                        'normalized_filename': normalized,
                        **app
                    })
    print(f"[CRM] Fetched {len(crm_data)} CRM records with annotation filenames.")
    return crm_data

# Update DB with CRM fields by normalized filename, only if currently NULL
def update_db_with_crm_fields(crm_data):
    """Fill the still-empty ``crm_*`` columns of rows matched by file name.

    A row matches when its ``pdf_filename``, lower-cased and with spaces
    removed, equals the record's ``normalized_filename``. Only columns that
    are currently NULL receive the CRM value; existing values are kept.
    Previously every column was overwritten (also with NULL) as soon as a
    single one was NULL.

    Args:
        crm_data: Records as produced by ``fetch_crm_data``; each needs a
            ``normalized_filename`` key.
    """
    columns = list(CRM_TO_DB.items())
    # The statement is identical for every record, so build it once.
    # COALESCE keeps a non-NULL column value. The CAST gives the parameter a
    # type COALESCE can unify with; the columns are created as TEXT by
    # add_crm_columns().
    set_clause = ", ".join(
        f"{db_col} = COALESCE({db_col}, CAST(%s AS TEXT))" for _, db_col in columns
    )
    null_condition = " OR ".join(f"{db_col} IS NULL" for _, db_col in columns)
    update_sql = f"""
        UPDATE pdf_extracted_data SET {set_clause}
        WHERE ( {null_condition} )
        AND LOWER(REPLACE(pdf_filename, ' ', '')) = %s
    """
    updated = 0
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        # `with conn` commits or rolls back but leaves the connection open;
        # the finally below closes it.
        with conn:
            with conn.cursor() as cur:
                for item in crm_data:
                    values = [item.get(crm_field) for crm_field, _ in columns]
                    values.append(item['normalized_filename'])
                    cur.execute(update_sql, values)
                    if cur.rowcount > 0:
                        updated += cur.rowcount
    finally:
        conn.close()
    print(f"[CRM] Updated {updated} rows in pdf_extracted_data with extra CRM fields (only where NULL).")

# Run the process
# Unlike the earlier cells, this one calls its functions when executed:
# 1) make sure the crm_* columns exist,
print("[CRM] Adding extra CRM columns if missing...")
add_crm_columns()
# 2) pull the applications and their annotation file names from the CRM,
print("[CRM] Fetching CRM data and annotation filenames...")
crm_data = fetch_crm_data()
# 3) write the CRM values onto the rows whose file name matches.
print("[CRM] Updating database with CRM fields (only where NULL)...")
update_db_with_crm_fields(crm_data)
print("[CRM] Done.") 

[CRM] Adding extra CRM columns if missing...
[CRM] Ensured all extra CRM columns exist in the database.
[CRM] Fetching CRM data and annotation filenames...
[CRM] Fetching job applications from 2025-05-30 to 2025-06-02 with fields: new_jobapplicationid, new_fullname, new_contactphone, new_telephonenumber, new_jauid, new_jobofferstatus, new_gender, new_position, new_employmenttype, new_expectedsalary, new_dateavailableforemployment, new_currentsalary, new_company, new_graduationyear, new_qualitiesattributes, new_careergoals, new_additionalinformation, new_appstatus, new_hrinterviewstatus, new_technicalrating, new_technicalinterviewcomments, new_hrcomment, createdon, modifiedon, new_howdidyouhearaboutrowad, new_listouttheextrasocialactivities, new_pleasesepcify


  today = datetime.utcnow().date()


[CRM] Fetched 37 CRM records with annotation filenames.
[CRM] Updating database with CRM fields (only where NULL)...
[CRM] Updated 0 rows in pdf_extracted_data with extra CRM fields (only where NULL).
[CRM] Done.
