In [1]:
# imports

import os
import glob
from dotenv import load_dotenv
import gradio as gr
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go

## Website Data collector

In [14]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from dotenv import load_dotenv

# ------------------------
# Load env
# ------------------------
load_dotenv()

# ------------------------
# Config
# ------------------------
OUTPUT_FOLDER = "G:\\Github_Projects\\Ai_twin\\file\\CV_statement_details"
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

URLS = {
    "PersonalWebsite": "https://marufmullah50.github.io/",
    "LinkedIn": "https://www.linkedin.com/in/marufmullah50"
}

MAX_PAGES = 30
MIN_TEXT_LENGTH = 200   # skip tiny junk pages

# ------------------------
# Text Cleaner
# ------------------------
def clean_text(text):
    lines = [l.strip() for l in text.splitlines()]
    lines = [l for l in lines if l]

    # remove obvious navigation/footer noise
    blacklist = ["menu", "navigation", "login", "sign in", "cookie", "privacy"]
    filtered = []

    for line in lines:
        low = line.lower()
        if not any(b in low for b in blacklist):
            filtered.append(line)

    return "\n".join(filtered)

# ------------------------
# Crawl whole site
# ------------------------
def scrape_site(base_url, max_pages=20):
    visited = set()
    to_visit = [base_url]
    domain = urlparse(base_url).netloc
    collected = []

    while to_visit and len(visited) < max_pages:
        url = to_visit.pop(0)

        if url in visited:
            continue

        try:
            resp = requests.get(url, timeout=10, headers={"User-Agent": "Mozilla/5.0"})
            resp.raise_for_status()

            soup = BeautifulSoup(resp.text, "html.parser")

            # remove script/style
            for tag in soup(["script", "style", "noscript"]):
                tag.decompose()

            raw_text = soup.get_text("\n")
            cleaned = clean_text(raw_text)

            if len(cleaned) > MIN_TEXT_LENGTH:
                collected.append(cleaned)
                print(f"Scraped: {url}")

            visited.add(url)

            # find internal links
            for a in soup.find_all("a", href=True):
                full = urljoin(base_url, a["href"]).split("#")[0]
                parsed = urlparse(full)

                if parsed.netloc == domain and full not in visited:
                    to_visit.append(full)

        except Exception as e:
            print(f"Failed: {url} -> {e}")

    # deduplicate pages
    unique_pages = list(dict.fromkeys(collected))
    return "\n\n".join(unique_pages)

# ------------------------
# Single page scrape (fallback)
# ------------------------
def scrape_single(url):
    resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    resp.raise_for_status()

    soup = BeautifulSoup(resp.text, "html.parser")

    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()

    return clean_text(soup.get_text("\n"))

# ------------------------
# Run scraping
# ------------------------
scraped_data = {}

for label, url in URLS.items():
    try:
        if "github.io" in url:
            scraped_data[label] = scrape_site(url, MAX_PAGES)
        else:
            scraped_data[label] = scrape_single(url)

        print(f"Finished: {label}")

    except Exception as e:
        print(f"Failed scraping {url}: {e}")

Scraped: https://marufmullah50.github.io/
Scraped: https://marufmullah50.github.io/index.html
Scraped: https://marufmullah50.github.io/about.html
Scraped: https://marufmullah50.github.io/education.html
Scraped: https://marufmullah50.github.io/skills.html
Scraped: https://marufmullah50.github.io/research.html
Scraped: https://marufmullah50.github.io/projects.html
Scraped: https://marufmullah50.github.io/experience.html
Scraped: https://marufmullah50.github.io/contact.html
Finished: PersonalWebsite
Finished: LinkedIn


In [17]:
import requests
#requests.get("http://localhost:11434").content
#OLLAMA_BASE_URL = "http://localhost:11434/v1"
from openai import OpenAI
#ollama = OpenAI(base_url=OLLAMA_BASE_URL, api_key='ollama')
#MODEL_o = "llama3.2"


from openai import OpenAI
import os
from dotenv import load_dotenv

load_dotenv()

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
MODEL = "gpt-5-nano"

In [18]:
def summarize_text(text, label):
    system_prompt = (
        "You are an expert AI assistant helping build a structured knowledge base "
        "about a person from their website content. Your task is to extract factual, "
        "biographical, academic, professional, and personality-related information "
        "so it can be used for an AI twin and RAG system."
    )

    user_prompt = f"""
Extract and organize **all relevant personal and professional information** from the following content taken from {label}.

Format the output in **Markdown** with clear sections:

## Biography / Background  
## Education  
## Research Interests  
## Technical Skills  
## Projects  
## Publications / Work Experience  
## Achievements  
## Goals / Future Plans  
## Personal Values or Motivations  
## Writing Style or Tone Clues  
## Contact / Links (if present)  
## Other Important Notes  

Rules:
- Keep only factual information found in the text  
- Do NOT invent anything  
- Prefer concise bullet points  
- Preserve first-person meaning if written that way  
- Ignore menus, navigation, and boilerplate website text  

CONTENT:
{text[:4000]}
"""

    response = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]
    )

    return response.choices[0].message.content


In [19]:
from IPython.display import Markdown, display
cleaned_data = {}
for label, text in scraped_data.items():
    summary = summarize_text(text, label)
    cleaned_data[label] = summary
    print(f"\n--- {label} Info (Markdown) ---\n")
    display(Markdown(summary))



--- PersonalWebsite Info (Markdown) ---



## Biography / Background
- Md. Maruf Mullah is a Mechanical Engineer and Researcher with a focus on bridging classical engineering and computational intelligence to solve problems in materials science, manufacturing, and robotics.
- I am a dedicated Mechanical Engineer and researcher with a strong focus on bridging the gap between classical engineering and computational intelligence.
- I aim to connect traditional engineering fundamentals with cutting-edge computational approaches.
- I work on data-driven modeling, machine learning/deep learning, and computational methods to predict material behavior and support engineering decisions.
- My work spans from developing YOLO-based computer vision systems for industrial safety and medical imaging to researching the densification of tropical wood species.
- I am open to collaboration, research discussions, and innovation-oriented projects.

## Education
- B.Sc. in Mechanical Engineering, Military Institute of Science and Technology (MIST), Dhaka, Bangladesh
  - Apr 2021 â€“ May 2025
  - CGPA: 3.23 / 4.00
- Higher Secondary Certificate (HSC), Abdul Kadir Mollah City College
  - 2020
  - GPA: 5.00 / 5.00
- Secondary School Certificate (SSC), Siraj Nagar M.A. Pilot High School
  - 2018
  - GPA: 5.00 / 5.00

## Research Interests
- Machine Learning & Deep Learning in Mechanical Engineering
- Materials Science, Metamaterials
- Autonomous Systems & Robotics
- Smart Manufacturing
- Renewable Energy Applications with Data-Driven Methods

## Technical Skills
- Programming / Tools: Python, MATLAB
- CAD / Design: SOLIDWORKS; CAD/CAM design
- Simulation / Analysis: ANSYS; Finite Element Analysis (FEA) awareness
- Additive Manufacturing / Prototyping: FDM 3D printing workflows
- Other: Experience in surface roughness prediction, autonomous/robotic system development; experience with image-based classification (e.g., YOLO-based computer vision)

## Projects
- Casting Defect Classification (reported 99.9% accuracy)
- Tailstock Tool Holder Design
- Surface roughness prediction using machine learning (mentioned in About Me)
- Freshwater fish image classification with MobileNetV2 (mentioned in About Me)
- Wind forecasting using ANN/CatBoost (mentioned in About Me)
- Machining tool design and fabrication (mentioned in About Me)

## Publications / Work Experience
- Publications / Thesis
  - Thesis: Densification of Natural Wood
  - Paper: Impact Strength of Seasoned Wood (ICMEAS 2025)
- Work Experience
  - Industrial experience at IFAD Autos PLC
  - Industrial experience at PRAN-RFL Group
- Leadership / Other Roles
  - Science Fair Champion
  - Scout Volunteer

## Achievements
- Science Fair Champion
- Casting Defect Classification project achieving 99.9% accuracy
- Leadership as a Scout Volunteer

## Goals / Future Plans
- Connect materials science, manufacturing methods, and intelligent computational tools to create efficient and practical engineering solutions.
- Engage in collaboration, research discussions, and innovation-oriented projects.

## Personal Values or Motivations
- Passionate about solving complex engineering problems using data-driven approaches.
- Committed to bridging classical engineering with computational intelligence.

## Writing Style or Tone Clues
- Self-descriptive, outcome-focused, and enthusiastic about interdisciplinary integration.
- Emphasizes collaboration, data-driven methods, and practical engineering solutions.
- Uses action-oriented language (e.g., developing systems, leading projects, applying ML/DL).

## Contact / Links (if present)
- The content references navigation items like "Get In Touch" and "Contact," but no explicit contact details (email, phone, or social links) are provided in the text provided.

## Other Important Notes
- The content includes two overlapping sections titled "About Me," revealing similar information in multiple places.
- Notable technical focus areas mentioned: YOLO-based computer vision for industrial safety/medical imaging, wood densification research, and a mix of traditional mechanical engineering with ML/DL approaches.


--- LinkedIn Info (Markdown) ---



## Biography / Background
- No content provided in the supplied content.

## Education
- No content provided in the supplied content.

## Research Interests
- No content provided in the supplied content.

## Technical Skills
- No content provided in the supplied content.

## Projects
- No content provided in the supplied content.

## Publications / Work Experience
- No content provided in the supplied content.

## Achievements
- No content provided in the supplied content.

## Goals / Future Plans
- No content provided in the supplied content.

## Personal Values or Motivations
- No content provided in the supplied content.

## Writing Style or Tone Clues
- No content provided in the supplied content.

## Contact / Links (if present)
- No content provided in the supplied content.

## Other Important Notes
- Content block is empty; cannot extract information.

In [20]:
# Save Personal Knowledge Base as MARKDOWN
import os

output_folder = "G:\\Github_Projects\\Ai_twin\\file\\CV_statement_details"
os.makedirs(output_folder, exist_ok=True)

md_path = os.path.join(output_folder, "Personal_Knowledge_Base.md")

with open(md_path, "w", encoding="utf-8") as f:
    f.write("# Personal Knowledge Base\n")
    f.write("Structured personal, academic, and professional information extracted for AI Twin RAG system.\n\n")

    for label, text in cleaned_data.items():
        # Heading per source file
        f.write(f"## Source: {label}\n\n")
        f.write(text.strip())
        f.write("\n\n---\n\n")  # separator

print(f"Personal knowledge file saved to: {md_path}")

Personal knowledge file saved to: G:\Github_Projects\Ai_twin\file\CV_statement_details\Personal_Knowledge_Base.md
