# Extraction Strategy: Unstructured CV/Resume Text -> Structured Data

In [1]:
# import Libraries
import fitz
import pdfplumber
from docx import Document
import os
import unicodedata
import re
import json
import requests

#### Extract text from PDF

In [2]:
def extract_text_from_pdf(pdf_path):
    """
    Hybrid extractor for CVs.

    Reads embedded (clickable) hyperlinks with PyMuPDF (``fitz``) and page
    text with pdfplumber, and also detects URLs written out as plain text
    (including Streamlit, HuggingFace, etc.). Links from *every* page whose
    text contains one of the keywords below (GitHub, LinkedIn,
    portfolio/demo hosts, ...) are appended once, to the first page's text,
    as a ``Links: ...`` line.

    Args:
        pdf_path: Path of the PDF, passed to ``fitz.open`` and
            ``pdfplumber.open``.

    Returns:
        str: One section per page, each introduced by a ``--- Page N ---``
        header line, separated by blank lines. Within a page all whitespace
        runs are collapsed to single spaces and ``(cid:N)`` glyph
        placeholders are removed.
    """
    all_links = set()

    # Extract embedded hyperlinks
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for link in page.get_links():
                uri = link.get("uri")
                if uri and uri.startswith("http"):
                    all_links.add(uri.strip())

    # Detect plain-text URLs
    url_pattern = re.compile(
        r'(https?://[^\s]+|www\.[^\s]+|\b[\w-]+\.(?:vercel|netlify|github|streamlit|huggingface|render|heroku|io|app|ai|com|org)\b[^\s]*)',
        re.IGNORECASE
    )

    # First pass: gather the cleaned text of every page and every URL found
    # in it. The link footer is built only after this loop so that URLs
    # written on later pages are not missed.
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text() or ""

            # Extract any URLs written in text (before whitespace is collapsed)
            for found in url_pattern.findall(text):
                clean_url = found.strip(").,;:!?")
                if clean_url:
                    all_links.add(clean_url)

            # Clean formatting junk
            text = re.sub(r"\(cid:\d+\)", "", text)
            text = re.sub(r"\s+", " ", text)
            page_texts.append(text.strip())

    # Keep only links that look like profile / portfolio / demo links.
    link_keywords = (
        "linkedin",
        "github",
        "portfolio",
        "vercel",
        "netlify",
        "streamlit",
        "huggingface",
        "render",
        "demo",
        "live",
        "project",
    )
    top_links = sorted(
        link
        for link in all_links
        if any(keyword in link.lower() for keyword in link_keywords)
    )

    # Insert top-level links (only once, on the first page)
    if page_texts and top_links:
        page_texts[0] = page_texts[0] + "\n\nLinks: " + ", ".join(top_links)

    sections = [
        f"--- Page {number} ---\n{page_text.strip()}"
        for number, page_text in enumerate(page_texts, start=1)
    ]
    return "\n\n".join(sections).strip()


# Example: extract a sample CV; the bare name on the last line makes the
# notebook display the extracted string as the cell output.
pdf_text = extract_text_from_pdf("og_cv/afzal.pdf")
pdf_text

'--- Page 1 ---\nAFZAL A Data Scientist +91 7356922047 # afzalkottukkal23@gmail.com LinkedIn § GitHub Professional Summary Data Scientist and AI/ML Engineer with hands-on experience in statistical analysis, machine learning, deep learning, and big data engineering. Skilled in developing end-to-end data-driven solutions using tools and frameworks such as Python, TensorFlow, scikit-learn, FastAPI, and Apache Spark. Proficient in NLP, Computer Vision, and Transformer-based Large Language Models (LLMs) for advanced AI applications, including Generative AI (VAE, GAN). Experienced with SQL, BI tools, and cloud computing platforms for scalable data solutions. Strong understanding of microservices architecture, Hadoop, and Kafka for distributed processing and real-time analytics. Holds a Bachelor’s degree in Commerce (Accounts and Data Science, 2021–2024) and specialized in Data Science through Brototype, combining technical depth with domain knowledge to deliver innovative, business-driven ou

#### Extract text from DOCX

In [3]:
def extract_text_from_docx(file_path):
    """Return the text of a .docx file, one line per paragraph.

    Opens the file with ``Document`` and joins the ``text`` of each entry in
    its ``paragraphs`` collection with newline characters.
    """
    paragraphs = Document(file_path).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)

# Example: extract a sample .docx CV and print its paragraph text
docx_text = extract_text_from_docx("cv/demo_cv.docx")
print(docx_text)

Curriculum Vitae
Afzal A
📍 Bangalore, India | 📧 afzal@email.com | 📱 +91-9876543210
🌐 www.afzala.com | 💼 LinkedIn: linkedin.com/in/afzala | 🐙 GitHub: github.com/afzala
Profile Summary
Data Scientist & Machine Learning Engineer with 3+ years of experience building end-to-end AI solutions. Specialized in retail analytics, demand forecasting, dynamic pricing, and inventory optimization. Strong background in Python, SQL, and cloud platforms (AWS, GCP). Passionate about applying AI to solve real-world business problems.
Skills
Programming & Tools: Python, R, SQL, Spark, TensorFlow, PyTorch, scikit-learn, Tableau, Power BI
Specializations: Retail Analytics, Machine Learning, NLP, Time Series Forecasting, Recommender Systems
Cloud & Deployment: AWS (SageMaker, Lambda), GCP (Vertex AI, BigQuery), Docker, Kubernetes, FastAPI
Soft Skills: Problem Solving, Team Collaboration, Communication, Analytical Thinking
Work Experience
Data Scientist – Walmart Labs, Bangalore
Jan 2022 – Present
- Built dema

In [4]:
# Extract function 
def extract_text(file_path):
    """Return the raw text of a CV file, picking the reader from its extension.

    The extension is compared case-insensitively. ``.pdf`` and ``.docx``
    files are delegated to ``extract_text_from_pdf`` and
    ``extract_text_from_docx``; ``.txt`` files are read as UTF-8 with
    undecodable bytes ignored.

    Raises:
        ValueError: If the extension is not one of the three above.
    """
    _, suffix = os.path.splitext(file_path)
    ext = suffix.lower()

    if ext == ".pdf":
        return extract_text_from_pdf(file_path)
    if ext == ".docx":
        return extract_text_from_docx(file_path)
    if ext != ".txt":
        raise ValueError("Unsupported file format: " + ext)

    with open(file_path, "r", encoding="utf-8", errors="ignore") as handle:
        return handle.read()
# Example usage: extract the sample PDF and preview its first 500 characters
text = extract_text("og_cv/afzal.pdf")
print(text[:500])

--- Page 1 ---
AFZAL A Data Scientist +91 7356922047 # afzalkottukkal23@gmail.com LinkedIn § GitHub Professional Summary Data Scientist and AI/ML Engineer with hands-on experience in statistical analysis, machine learning, deep learning, and big data engineering. Skilled in developing end-to-end data-driven solutions using tools and frameworks such as Python, TensorFlow, scikit-learn, FastAPI, and Apache Spark. Proficient in NLP, Computer Vision, and Transformer-based Large Language Models (LLMs


In [5]:
# Clean
def _is_text_char(ch: str) -> bool:
    """Return True for letters, combining marks and digits of any script."""
    # Variation selectors are classified as marks but only modify emoji /
    # symbol rendering, so they are treated as junk.
    if "\ufe00" <= ch <= "\ufe0f":
        return False
    return unicodedata.category(ch)[0] in "LMN"


def _keep_text_chars(match) -> str:
    """Blank out symbols in a non-ASCII run, one space per run of symbols."""
    kept = "".join(ch if _is_text_char(ch) else " " for ch in match.group())
    return re.sub(r" {2,}", " ", kept)


def clean_text(text: str) -> str:
    """Clean and normalize extracted CV text for LLM extraction.

    Steps, in order: Unicode NFKC normalization; removal of ``(cid:N)``
    placeholders and ``--- Page N ---`` markers; typographic dashes and
    quotes mapped to ASCII; known contact icons replaced by text labels
    (e.g. an envelope becomes ``Email:``); bullets mapped to ``-``; runs of
    three or more ``-``/``_`` removed; remaining emoji and non-ASCII
    symbols replaced by spaces; whitespace collapsed.

    Non-ASCII letters, combining marks and digits (accented names,
    non-Latin scripts in the Basic Multilingual Plane) are preserved; only
    non-ASCII symbols and punctuation are dropped.

    Args:
        text: Raw text as produced by the extraction step.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """

    # Normalize Unicode and typographic symbols
    text = unicodedata.normalize("NFKC", text)
    text = re.sub(r"\(cid:\d+\)", "", text)
    # The first two alternatives are the mojibake forms of en/em dashes.
    text = re.sub(r"â€“|â€”|–|—", "-", text)
    text = re.sub(r"[“”]", '"', text)
    text = re.sub(r"[‘’]", "'", text)
    text = re.sub(r"•+", "•", text)
    text = re.sub(r"---\s*Page\s*\d+\s*---", " ", text)

    # Replace known icons/emojis with labels (must run before the emoji
    # catch-all below, which would otherwise erase them)
    replacements = {
        "📍": "Location:",
        "📧": "Email:",
        "📱": "Phone:",
        "🌐": "Website:",
        "💼": "LinkedIn:",
        "🐙": "GitHub:",
        "🏠": "Address:",
        "☎️": "Phone:",
        "✉️": "Email:",
    }
    for k, v in replacements.items():
        text = text.replace(k, v)

    # Normalize bullets, newlines, and separators
    text = re.sub(r'[\u2022\u25CF\u25A0•▪]', '-', text)  # normalize bullets
    text = re.sub(r'[-_]{3,}', ' ', text)
    text = re.sub(r'\n{2,}', '\n', text)
    text = re.sub(r'[ \t]+', ' ', text)

    # Strip emojis or pictographs (catch-all for characters above the BMP)
    text = re.sub(r'[\U00010000-\U0010ffff]', ' ', text)  # remove all emojis

    # Remove stray non-ASCII junk but keep letters: each non-ASCII run is
    # filtered character by character instead of being deleted wholesale,
    # so accented or non-Latin names and words survive.
    text = re.sub(r'[^\x00-\x7F]+', _keep_text_chars, text)

    # Normalize punctuation & spacing
    text = re.sub(r'\s([,.!?;:])', r'\1', text)
    text = re.sub(r'\s{2,}', ' ', text)
    text = text.strip()

    return text
# Example: clean the previously extracted text, then print it and its length
cleaned_text = clean_text(text)
print(cleaned_text)
print(len(cleaned_text))

AFZAL A Data Scientist +91 7356922047 # afzalkottukkal23@gmail.com LinkedIn GitHub Professional Summary Data Scientist and AI/ML Engineer with hands-on experience in statistical analysis, machine learning, deep learning, and big data engineering. Skilled in developing end-to-end data-driven solutions using tools and frameworks such as Python, TensorFlow, scikit-learn, FastAPI, and Apache Spark. Proficient in NLP, Computer Vision, and Transformer-based Large Language Models (LLMs) for advanced AI applications, including Generative AI (VAE, GAN). Experienced with SQL, BI tools, and cloud computing platforms for scalable data solutions. Strong understanding of microservices architecture, Hadoop, and Kafka for distributed processing and real-time analytics. Holds a Bachelor's degree in Commerce (Accounts and Data Science, 2021-2024) and specialized in Data Science through Brototype, combining technical depth with domain knowledge to deliver innovative, business-driven outcomes. Technical S

In [6]:
# Normalize empty lists, blank strings, and "null" placeholders to None
def clean_empty_lists_as_none(data):
    """Replace "empty" top-level values of ``data`` with None, in place.

    A value counts as empty when it is a blank string, the string "null"
    (any case, surrounding whitespace ignored), an empty list, or a list
    made up only of None / "null" entries. Other values are left untouched.

    Returns:
        The same ``data`` dict that was passed in.
    """

    def _is_null_token(item):
        # None itself, or a string spelling "null"
        return item is None or (
            isinstance(item, str) and item.strip().lower() == "null"
        )

    for field in data:
        current = data[field]
        if isinstance(current, str):
            if current.strip().lower() in ("null", ""):
                data[field] = None
        elif isinstance(current, list) and all(_is_null_token(v) for v in current):
            # all() is True for an empty list, which covers the [] case too
            data[field] = None
    return data

## Testing the RAG chunk approach (with chunk embeddings)

In [10]:
import re, json, requests, faiss, numpy as np
from sentence_transformers import SentenceTransformer
from my_secrets import API_KEY1

def extract_rag_info(text, api_key):
    """
    RAG-based CV info extractor using Gemini + FAISS retrieval.

    The text is cut into fixed-size character chunks, the chunks are
    embedded with a SentenceTransformer model and indexed in FAISS, the
    closest chunks are retrieved for each section query, and a single prompt
    containing those chunks is sent to the Gemini API.

    Args:
        text: Cleaned CV text.
        api_key: Gemini API key, sent in the ``X-goog-api-key`` header.

    Returns:
        dict: The parsed JSON reply after ``clean_empty_lists_as_none``. If
        ``text`` is empty/blank, or the request or parsing fails, a dict
        mapping every section key to None is returned instead (failures are
        printed, not raised).
    """
    # Define extraction targets
    queries = {
        "skills": "List all technical and professional skills mentioned in the resume.",
        "education": "List all education qualifications or academic programs.",
        "experience": "List all work roles or internships with organization names.",
        "projects": "List all projects or major works mentioned.",
        "certifications": "List all certifications or courses explicitly mentioned.",
        "achievements": "List all major achievements or accomplishments."
    }

    # With no text there is nothing to embed; building the index would fail,
    # so return the same "nothing extracted" result as the error path.
    if not text or not text.strip():
        return {key: None for key in queries}

    # Split text into chunks
    chunk_size = 600
    chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

    # Embed chunks
    model = SentenceTransformer("all-MiniLM-L6-v2")
    chunk_embeddings = model.encode(chunks, convert_to_numpy=True)
    print("Chunks embedded")
    # Build FAISS index
    dim = chunk_embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(chunk_embeddings)
    print("Got index")
    # Retrieve relevant chunks per query
    retrieved = {}
    for key, query in queries.items():
        q_emb = model.encode([query], convert_to_numpy=True)
        # Never ask for more neighbours than there are chunks: FAISS pads
        # missing results with -1, which would silently index chunks[-1].
        top_k = min(8 if key == "projects" else 2, len(chunks))
        _, indices = index.search(q_emb, top_k)
        top_chunks = [chunks[i] for i in indices[0] if i >= 0]
        retrieved[key] = "\n".join(top_chunks)

    # Prepare the prompt
    combined_prompt = """
You are a fast, precise resume information extraction system. Your **ONLY** output must be a single JSON object.

**Required JSON Keys:** [skills, education, experience, projects, certifications, achievements]

**Extraction Rules (Minimal Factual Lists - STRICT JSON NULL/ARRAY):**
1. **skills:** List of skill names only (no sentences). Example: ["Python", "SQL", "FastAPI"].
2. **education, experience, certifications, achievements:** Each as a **list of short single-line items.**
   * education: Format "Degree/Program – Institution (Year Range)".
   * experience: Format "Role – Organization".
   * certifications: Format "Certification Name or Issuer".
   * achievements: Short factual result, under 10 words.
3. **projects:** List of objects `{"name": "...", "links": [...]}`.
   * Include only GitHub, demo, or portfolio links relevant to the project.
   * If no links, set "links": null.

**Output Format Rules:**
- STRICT JSON only.
- Missing list fields → `[]`
- Missing single-value fields → `null`
- Keep all values concise and factual.
"""

    # Attach retrieved context
    for key, context in retrieved.items():
        combined_prompt += f"\n\n### {key.upper()} SECTION CONTEXT ###\n{context}"
    print('Prompt completed')
    print(len(combined_prompt))
    # Call Gemini API
    api_url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent"
    headers = {"Content-Type": "application/json", "X-goog-api-key": api_key}
    payload = {"contents": [{"parts": [{"text": combined_prompt}]}]}
    request_timeout = 60  # seconds; without a timeout a stalled call hangs forever

    try:
        http_response = requests.post(
            api_url, headers=headers, json=payload, timeout=request_timeout
        )
        # Surface HTTP errors directly instead of a later KeyError on "candidates"
        http_response.raise_for_status()
        response = http_response.json()
        raw_text = response["candidates"][0]["content"]["parts"][0]["text"]
        # Strip surrounding whitespace first so the ^/$ anchors can match
        # the Markdown code fence the model may wrap the JSON in.
        clean_json_text = re.sub(r"^```[a-zA-Z]*|```$", "", raw_text.strip()).strip()
        data = json.loads(clean_json_text)

        # Clean nulls and empty lists
        data = clean_empty_lists_as_none(data)
        return data

    except Exception as e:
        # Best-effort extraction: report the problem and return an all-None result
        print(f"Error extracting sections: {e}")
        return {key: None for key in queries.keys()}

# ---------- Example Usage ----------
# Run the chunk-retrieval extractor on the cleaned CV text; the bare name on
# the last line makes the notebook display the resulting dict.
API_KEY = API_KEY1
info_dict = extract_rag_info(cleaned_text, API_KEY)
info_dict

  from .autonotebook import tqdm as notebook_tqdm


Chunks embedded
Got index
Prompt completed
12008


{'skills': ['Python',
  'TensorFlow',
  'PyTorch',
  'scikit-learn',
  'Pandas',
  'NumPy',
  'OpenCV',
  'Transformers (LLMs)',
  'NLP',
  'Computer Vision',
  'Generative AI (VAE, GAN)',
  'Apache Spark',
  'Hadoop',
  'Kafka',
  'AWS',
  'GCP',
  'Azure',
  'SQL',
  'PostgreSQL',
  'MySQL',
  'FastAPI',
  'Microservices',
  'Docker',
  'Kubernetes',
  'REST APIs',
  'JWT Authentication',
  'Matplotlib',
  'Seaborn',
  'Power BI',
  'Tableau',
  'System Design',
  'Data Structures and Algorithms',
  'Data Analytics'],
 'education': ['Brototype - Data Science and AI/ML Program (Aug. 2024 - Present)',
  "St. John's College, Anchal (University of Kerala) (Aug. 2021 - Apr. 2024)",
  'CPHSS Kadakkal (Jun. 2019 - May. 2021)'],
 'experience': None,
 'projects': [{'name': 'CVAlyze - AI-Powered ETL + CV Analysis Platform',
   'links': None},
  {'name': 'True Buddy Chatbot - Emotional Support AI',
   'links': ['https://article-researcher-app-cbngx5mmmxxdtjzfewngtn.streamlit.app/',
    'https:/

### With the RAG top-k chunk method, it takes about 10 seconds to extract skills, education, experience, projects (not completely), certifications, and achievements. For this, I tried the embedding models all-mpnet-base-v2, multi-qa-MiniLM-L6-cos-v1, and all-MiniLM-L6-v2. Performance is better than using locally installed LLMs.

# Testing - RAG without making chunks

In [10]:
# Extract Resume Sections
from my_secrets import API_KEY2

def extract_info(text, api_key):
    """Extract structured CV fields from the full text with one Gemini call.

    The whole cleaned CV text is appended to a fixed instruction prompt (no
    chunking or retrieval) and sent to the Gemini API.

    Args:
        text: Cleaned CV text.
        api_key: Gemini API key, sent in the ``X-goog-api-key`` header.

    Returns:
        dict: The parsed JSON reply after ``clean_empty_lists_as_none``.
        On failure the error is printed and the decoded API response is
        returned if one was received (useful for inspecting API errors),
        otherwise None.
    """

    # NOTE: the "\\_" sequences keep the literal backslash-underscore of the
    # prompt text without using an invalid "\_" escape sequence.
    combined_prompt = """
You are a fast, precise resume information extraction system. Your **ONLY** output must be a single JSON object.

**Required JSON Keys:** [name, profession, phone_number, email, location, github_link, linkedin_link, skills, education, experience, projects, certifications, achievements]

**Extraction Rules (Minimal Factual Lists - STRICT JSON NULL/ARRAY):**

1.  **Personal Info:** Return concise string value or **null** if missing. **location** must be the personal contact address **ONLY**. **IGNORE** all job/education/project locations.
    * ***github\\_link and linkedin\\_link:*** Search the entire text for URLs containing 'github.com' or 'linkedin.com/in/' and extract them. If multiple are found, use the first one. Use **null** if not found.
2.  **skills:** Extract all technical/professional skills as a **list of strings**. Use **[]** (empty list) if none found.
3.  **education, experience, certifications, achievements:** Return each as a **list of single-line strings**. **NO SUMMARIES/EXPLANATIONS.**
    * If a section is missing, return **[]** (empty list).
    * **education:** Format: [Degree/Program – Institution (Year Range)]. Include all programs/levels.
    * **experience:** Format: [Role – Organization].
    * **certifications:** Format: [Certification Title or Issuer]. ***Only include certifications EXPLICITLY listed as such.***
    * **achievements:** Format: **[Key Result, max 10 words]**. Summarize core action/outcome.
4.  **projects:** Return a **list of objects**. Use **[]** (empty list) if no projects.
    * Each object **must** adhere to: `{"name": "...", "links": [...]}`.
    * **The `links` list MUST only contain project-specific URLs (GitHub/demo).** **Exclude all other links.** If no project-specific link is found, the `links` key must be **null**.

**Final Mandate (For Maximum Speed and Precision):**
* Output is **STRICTLY JSON** (no preamble/postscript).
* **Missing list fields MUST use `[]`. Missing single-value fields MUST use `null`.**
* Keep all values **EXTREMELY CONCISE AND MINIMAL.**

**Text sections:**
"""
    # Personal info extraction text
    combined_prompt += text
    print(len(combined_prompt))

    # ---------- Call Gemini API once ----------
    api_url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-lite:generateContent"
    headers = {"Content-Type": "application/json", "X-goog-api-key": api_key}
    payload = {"contents": [{"parts": [{"text": combined_prompt}]}]}
    request_timeout = 60  # seconds; without a timeout a stalled call hangs forever

    # Bound before the try block so the error path can return it even when
    # the request itself fails (previously an UnboundLocalError).
    response = None
    try:
        response = requests.post(
            api_url, headers=headers, json=payload, timeout=request_timeout
        ).json()
        raw_text = response["candidates"][0]["content"]["parts"][0]["text"]
        # Strip surrounding whitespace first so the ^/$ anchors can match
        # the Markdown code fence the model may wrap the JSON in.
        clean_json_text = re.sub(r"^```[a-zA-Z]*|```$", "", raw_text.strip()).strip()
        data = json.loads(clean_json_text)
        data = clean_empty_lists_as_none(data)

        return data

    except Exception as e:
        # Best-effort extraction: report the problem and hand back whatever
        # the API returned (None if no response was received).
        print(f"Error extracting sections: {e}")
        return response

# ---------- Example Usage ----------
# Run the single-prompt extractor on the cleaned CV text; the bare name on
# the last line makes the notebook display the resulting dict.
API_KEY = API_KEY2
info_dict = extract_info(cleaned_text, API_KEY)
info_dict

10963


{'name': 'AFZAL A',
 'profession': 'Data Scientist',
 'phone_number': '+91 7356922047',
 'email': 'afzalkottukkal23@gmail.com',
 'location': None,
 'github_link': 'https://github.com/me-Afzal/',
 'linkedin_link': 'https://linkedin.com/in/afzal-a-0b1962325',
 'skills': ['Python',
  'TensorFlow',
  'PyTorch',
  'scikit-learn',
  'Pandas',
  'NumPy',
  'OpenCV',
  'Transformers (LLMs)',
  'NLP',
  'Computer Vision',
  'Generative AI (VAE, GAN)',
  'Apache Spark',
  'Hadoop',
  'Kafka',
  'AWS',
  'GCP',
  'Azure',
  'SQL',
  'PostgreSQL',
  'MySQL',
  'Query Optimization',
  'Database Design',
  'FastAPI',
  'Microservices',
  'Docker',
  'Kubernetes',
  'REST APIs',
  'JWT Authentication',
  'Matplotlib',
  'Seaborn',
  'Power BI',
  'Tableau',
  'Data Storytelling',
  'System Design',
  'Data Structures and Algorithms',
  'RAG (Retrieval-Augmented Generation)',
  'LangChain',
  'Gemini API',
  'ESRGAN',
  'Streamlit',
  'MLOps',
  'CI/CD pipelines'],
 'education': ['Data Science and AI/

### This gives a response much faster than the chunked setup and the local LLM setup: the response arrived within 4 seconds.

# We created a custom class from this final RAG setup for CV extraction, and a preprocess.py module to hold the data preprocessing functions.

In [1]:
from preprocess import extract_text,clean_text,get_gender,get_lat_lon
from rag_extractor import CvExtractor
import pandas as pd
# Single-CV pipeline: extract -> clean -> LLM extraction -> one-row DataFrame.
source="og_cv/afzal.pdf"
cleaned_text=clean_text(extract_text(source))
extractor=CvExtractor()
info_dict=extractor.extract(cleaned_text)

# Wrap the dict in a list so it becomes a single row.
df=pd.DataFrame([info_dict])
# get_lat_lon's result is expanded into three columns
# (presumably it returns latitude, longitude, country - confirm in preprocess.py).
df[['latitude', 'longitude', 'country']]=df['location'].apply(lambda loc: pd.Series(get_lat_lon(loc)))
# Derive a gender column from the extracted name via get_gender.
df['gender']=df['name'].apply(get_gender)
df.head()


Unnamed: 0,name,profession,phone_number,email,location,github_link,linkedin_link,skills,education,experience,projects,certifications,achievements,latitude,longitude,country,gender
0,AFZAL A,Data Scientist,+91 7356922047,afzalkottukkal23@gmail.com,,https://github.com/me-Afzal/,https://linkedin.com/in/afzal-a-0b1962325,"[Python, TensorFlow, PyTorch, scikit-learn, Pa...",[Brototype - Data Science and AI/ML Program (A...,,[{'name': 'CVAlyze - AI-Powered ETL + CV Analy...,,"[Deployed AI-powered apps using FastAPI, LangC...",20.5937,78.9629,India,male


In [None]:
import os
import pandas as pd
from preprocess import extract_text, clean_text, get_gender, get_lat_lon
from rag_extractor import CvExtractor

# Batch pipeline: run extraction on every supported file in a folder and
# collect the results into one DataFrame.

# Folder containing the CV files (.pdf, .docx and .txt are processed)
cv_folder = "cv"

# Initialize extractor
extractor = CvExtractor()

# List to store each CV's info
all_cv_data = []

# Iterate over all files in the folder
for file_name in os.listdir(cv_folder):
    if not file_name.lower().endswith((".pdf", ".docx", ".txt")):
        continue  # skip other file types
    file_path = os.path.join(cv_folder, file_name)
    try:
        # Extract and clean text (extract_text handles pdf, docx, txt)
        text = clean_text(extract_text(file_path))
        
        # Extract CV info
        info_dict = extractor.extract(text)
        
        # Append to list
        all_cv_data.append(info_dict)
    
    except Exception as e:
        # A failure on one file is reported and the loop moves on to the next file
        print(f"Error processing {file_name}: {e}")

df=pd.DataFrame(all_cv_data)
# Select and order the expected columns (raises KeyError if any is missing).
df=df[['name', 'profession', 'phone_number', 'email', 'location', 'github_link', 'linkedin_link', 'skills',
       'education', 'experience', 'projects', 'certifications', 'achievements']]
# get_lat_lon's result is expanded into three columns
# (presumably it returns latitude, longitude, country - confirm in preprocess.py).
df[['latitude', 'longitude', 'country']]=df['location'].apply(lambda loc: pd.Series(get_lat_lon(loc)))
# Derive a gender column from the extracted name via get_gender.
df['gender']=df['name'].apply(get_gender)
df.head(10)

Unnamed: 0,name,profession,phone_number,email,location,github_link,linkedin_link,skills,education,experience,projects,certifications,achievements,latitude,longitude,country,gender
0,Afzal A,Data Scientist & Machine Learning Engineer,+91-9876543210,afzal@email.com,"Bangalore, India",https://github.com/afzala,https://linkedin.com/in/afzala,"[Python, R, SQL, Spark, TensorFlow, PyTorch, s...","[M.Tech, Data Science – Indian Institute of Te...","[Data Scientist – Walmart Labs, Bangalore, Mac...","[{'name': 'Retail Analytics Dashboard', 'links...","[AWS Certified Machine Learning - Specialty, G...",[Won Top 10 AI Innovation Award at Walmart Hac...,12.976794,77.590082,India,male
1,John Smith,Data Scientist,+1 123-456-7890,john.smith@example.com,"New York, USA",,,"[Machine Learning (Scikit-learn, PyTorch, Tens...",[M.Sc. in Data Science – New York University (...,"[Data Scientist – Walmart, Data Analyst – Target]",,"[AWS Certified Machine Learning Specialist, Go...",[Developed demand forecasting models that impr...,40.712728,-74.006015,United States,male
2,John Doe,Data Scientist,+91-9000000000,john@email.com,"New York, USA",https://github.com/john,https://linkedin.com/in/john,"[R, Tableau, Power BI, Statistics, NLP, Time S...","[B.Sc, Computer Science – New York University ...",[Data Scientist – Company ABC],"[{'name': 'Customer Segmentation', 'links': No...","[Google Cloud Professional Data Engineer, Adva...","[Presented at Data Science Summit 2022, Publis...",40.712728,-74.006015,United States,male
3,Ivy Walker,Data Scientist,+91-9000000009,ivy@email.com,Singapore,https://github.com/ivy,https://linkedin.com/in/ivy,"[R, Tableau, Power BI, Statistics, NLP, Time S...","[B.Sc, Computer Science – New York University ...",[Data Scientist – Company ABC],"[{'name': 'Customer Segmentation', 'links': No...","[Google Cloud Professional Data Engineer, Adva...",[Published research paper on AI-driven Pricing...,1.357107,103.819499,Singapore,female
4,Jane Smith,Data Scientist,+91-9000000001,jane@email.com,"Bangalore, India",https://github.com/jane,https://linkedin.com/in/jane,"[AWS, GCP, Azure, Python, SQL, Machine Learnin...","[MBA, Business Analytics – London Business Sch...",[Data Scientist – Company ABC],"[{'name': 'Retail Analytics Dashboard', 'links...","[Google Cloud Professional Data Engineer, Adva...",[Developed award-winning ML model for customer...,12.976794,77.590082,India,female
5,Alice Johnson,Data Scientist,+91-9000000002,alice@email.com,"Delhi, India",https://github.com/alice,https://linkedin.com/in/alice,"[AWS, GCP, Azure, NLP, Time Series Forecasting...","[M.Sc, Statistics – University of California (...",[Data Scientist – Company ABC],"[{'name': 'Fraud Detection System', 'links': N...","[Advanced SQL for Data Science - Coursera, Goo...",[Won Top 10 AI Innovation Award at Walmart Hac...,28.632803,77.219771,India,female
6,Bob Brown,Data Scientist,+91-9000000003,bob@email.com,"San Francisco, USA",https://github.com/bob,https://linkedin.com/in/bob,"[AWS, GCP, Azure, Deep Learning, TensorFlow, P...","[MBA, Business Analytics – London Business Sch...",[Data Scientist – Company ABC],"[{'name': 'Retail Analytics Dashboard', 'links...","[Tableau Desktop Specialist, Google Cloud Prof...",[Published research paper on AI-driven Pricing...,37.779259,-122.419329,United States,male
7,Charlie Davis,Data Scientist,+91-9000000004,charlie@email.com,"Mumbai, India",https://github.com/charlie,https://linkedin.com/in/charlie,"[Deep Learning, TensorFlow, PyTorch, Cloud Com...","[M.Sc, Statistics – University of California (...",[Data Scientist – Company ABC],"[{'name': 'E-commerce Chatbot', 'links': None}...","[AWS Certified Machine Learning - Specialty, T...",[Recognized as Employee of the Year 2021 at Co...,19.054999,72.869203,India,male
8,Eva Wilson,Data Scientist,+91-9000000005,eva@email.com,"London, UK",https://github.com/eva,https://linkedin.com/in/eva,"[Python, SQL, Machine Learning, Data Analysis,...","[B.Sc, Computer Science – New York University ...",[Data Scientist – Company ABC],"[{'name': 'Fraud Detection System', 'links': N...","[AWS Certified Machine Learning - Specialty, M...",[Developed award-winning ML model for customer...,51.489334,-0.144055,United Kingdom,female
9,Frank Miller,Data Scientist,+91-9000000006,frank@email.com,"Toronto, Canada",https://github.com/frank,https://linkedin.com/in/frank,"[Python, SQL, Machine Learning, Data Analysis,...","[MBA, Business Analytics – London Business Sch...",[Data Scientist – Company ABC],"[{'name': 'Dynamic Pricing Engine', 'links': N...","[Microsoft Azure AI Fundamentals, Google Cloud...","[Presented at Data Science Summit 2022, Publis...",43.653482,-79.383935,Canada,male
