In [1]:
import sys
import os

# Put the current working directory on sys.path so the project-local `ocr`
# and `nlp` packages imported below can be resolved.
# NOTE(review): assumes the notebook is started from the project root — confirm.
project_root = os.path.abspath(os.getcwd())
if project_root not in sys.path:
    sys.path.append(project_root)

print("Project Root:", project_root)

from ocr.ocr import extract_text_from_image
from nlp.preprocess import clean_text

# Folder holding the scanned question-paper images (relative to the working directory).
folder_path = "data/gate_qp"

# os.listdir() returns entries in arbitrary order. Sort the names so the corpus
# (and everything derived from it) is assembled in the same order on every run.
image_files = sorted(
    f for f in os.listdir(folder_path)
    if f.lower().endswith(('.png', '.jpg', '.jpeg'))
)

# Parallel lists: entry i of each list comes from image_files[i].
all_raw_texts = []
all_clean_texts = []

for img_name in image_files:
    img_path = os.path.join(folder_path, img_name)
    print(f"Processing: {img_name}")

    # OCR the image, then run the project's text cleaner on the result.
    raw_text = extract_text_from_image(img_path)
    all_raw_texts.append(raw_text)

    cleaned = clean_text(raw_text)
    all_clean_texts.append(cleaned)

# One blank line between the cleaned text of consecutive images.
final_corpus = "\n\n".join(all_clean_texts)

# Preview only the first 5000 characters to keep the cell output short.
print(final_corpus[:5000])


Project Root: c:\learnova_pyq
Processing: gate2017.jpg
Processing: gate2018.jpg
Processing: gate2019.jpg
Processing: gate2020.jpg
Processing: gate2021.jpg
Processing: gate2022.jpg
Processing: gate2023.jpg
Processing: gate2024.jpg
~ N w [NAT] [GATE-2014:2M] In an ideal Brayton cycle, atmospheric air (ratio of Le} specific heats, â€”-=1.4 , specific heat at constant cy pressure = 1.005 kJ/kg-K) at 1 bar and 300 K is compressed to 8 bar. The maximum temperature in the cycle is limited to 1280 K. If the heat is supplied at the rate of 80 MW, the mass flow rate (in kg/s) of air required in the cycle is [MCQ] [GATE-2014:1M] The thermal efficiency of an air-standard Brayton cycle in terms of pressure ratio rp and 1 Â®) is given by v (@) 1-5 ) 1-4 Â» D () 1- ay (@ 1- a [MCQ] [GATE-2014:1M] For a gas turbine power plant, identify the correct pair of statements. P. Smaller in size compared to steam power plant for same power output Q. Starts quickly compared to steam power plant R. Works on the 

In [2]:
import nltk
# Fetch the NLTK "stopwords" corpus into the local nltk_data directory.
# NOTE(review): if nlp.preprocess.clean_text relies on these stopwords, this
# cell should run before the first clean_text call — confirm.
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cthaw\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
from nlp.preprocess import clean_text

# final_corpus is built by joining text that already went through clean_text,
# so this is a second cleaning pass over the joined corpus.
# NOTE(review): presumably a no-op if clean_text is idempotent — confirm whether
# this pass is needed.
cleaned_text = clean_text(final_corpus)
# Show only the first 500 characters as a quick sanity check.
print(cleaned_text[:500])


~ N w [NAT] [GATE-2014:2M] In an ideal Brayton cycle, atmospheric air (ratio of Le} specific heats, â€”-=1.4 , specific heat at constant cy pressure = 1.005 kJ/kg-K) at 1 bar and 300 K is compressed to 8 bar. The maximum temperature in the cycle is limited to 1280 K. If the heat is supplied at the rate of 80 MW, the mass flow rate (in kg/s) of air required in the cycle is [MCQ] [GATE-2014:1M] The thermal efficiency of an air-standard Brayton cycle in terms of pressure ratio rp and 1 Â®) is given by


In [4]:
from nlp.topic_extraction import extract_topics_rake

# Collect the RAKE candidates, keeping only non-empty entries whose
# whitespace-trimmed form is longer than 3 characters.
phrases = []
for candidate in extract_topics_rake(cleaned_text):
    if not candidate:
        continue
    trimmed = candidate.strip()
    if len(trimmed) > 3:
        phrases.append(trimmed)

# Display the first 20 surviving phrases.
phrases[:20]


['stroke direct injection diesel pa ic aiconitant ppecca â„¢ Â©',
 'engine generates bek consuming 103 kee',
 'spark ignition engine follows air standard otto cycle',
 'efficiencies fi es mee via oh',
 '4n insulated mixing chamber located upstream',
 'ideal intercooled brayton perfect intercooing cycle',
 'constant tnelh oy pote specific heats',
 '918 superheated r â€” 134a p',
 '5 g water vapour per kg dry air',
 'qng7 pn tem ee one',
 '30 x 10 Â° kels',
 'exhaust proc heat rejection process',
 'Â© tera ibl 9',
 'ideal vapour compression refrigeration cycles',
 'parameters fica ne ie',
 'hemi go aa wee',
 '23007 common data questions',
 'rankine cycle ie te 14',
 'capa soicieivy refrigeration cycle',
 'least one isothermal process']

In [None]:
from analysis.semantic_clustering import cluster_topics_semantic

# Group the extracted phrases with the SBERT-based clustering helper.
clusters = cluster_topics_semantic(phrases, distance_threshold=1.2, min_cluster_size=2)

print(f"\nTotal clusters found: {len(clusters)}")

# Report each cluster: a header line with its id and size, then one line per member phrase.
for cluster_id, members in clusters.items():
    print(f"\n Cluster {cluster_id} (size = {len(members)})")
    for member in members:
        print("   -", member)


  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2/2 [00:00<00:00,  3.40it/s]



ðŸ”¥ Total clusters found: 16

ðŸ”µ Cluster 9 (size = 2)
   - spark ignition engine follows air standard otto cycle
   - air standard diesel cycle

ðŸ”µ Cluster 6 (size = 2)
   - efficiencies fi es mee via oh
   - parameters fica ne ie

ðŸ”µ Cluster 7 (size = 2)
   - 4n insulated mixing chamber located upstream
   - exit duct mixing chamber assuming

ðŸ”µ Cluster 0 (size = 4)
   - ideal intercooled brayton perfect intercooing cycle
   - rankine cycle ie te 14
   - old efficienc dual cycles
   - â€˜ esel cycle consists

ðŸ”µ Cluster 4 (size = 3)
   - constant tnelh oy pote specific heats
   - least one isothermal process
   - constant volume heat addition process

ðŸ”µ Cluster 11 (size = 2)
   - 918 superheated r â€” 134a p
   - saturated r â€” 134a

ðŸ”µ Cluster 2 (size = 5)
   - 5 g water vapour per kg dry air
   - superheated water vapour flowing
   - ay kg dry air stream
   - water vapor per kg
   - water vapour per kg

ðŸ”µ Cluster 5 (size = 2)
   - qng7 pn tem ee one
   - 5 apps 

In [None]:
from analysis.frequency_analysis import compute_topic_frequency

# Rank the extracted phrases and keep the top 10. Each entry of `ranked` is a
# tuple of (topic_phrase, raw_frequency_count, weighted_score).
ranked = compute_topic_frequency(phrases, top_n=10)
ranked


[('two decimal places ).', 4, 8.0),
 ('stroke direct injection diesel pa ic aiconitant ppecca â„¢ Â©', 1, 2.0),
 ('engine generates bek consuming 103 kee', 1, 2.0),
 ('spark ignition engine follows air standard otto cycle', 1, 2.0),
 ('efficiencies fi es mee via oh', 1, 2.0),
 ('4n insulated mixing chamber located upstream', 1, 2.0),
 ('ideal intercooled brayton perfect intercooing cycle', 1, 2.0),
 ('constant tnelh oy pote specific heats', 1, 2.0),
 ('918 superheated r â€” 134a p', 1, 2.0),
 ('5 g water vapour per kg dry air', 1, 2.0)]

In [None]:
import json
import os

# `questions` is not assigned by any earlier cell in this notebook, so on a
# fresh kernel the export used to fail with NameError. Reuse the value when a
# previous step has defined it; otherwise export an empty list.
questions = globals().get("questions", [])

# Build the topic records from the ranked (phrase, freq, weight) tuples.
topics = []
for i, (phrase, freq, weight) in enumerate(ranked, start=1):
    phrase = phrase.strip()
    if not phrase:
        # Skip blank phrases; the id of a skipped entry is simply not used.
        continue

    # Priority bucket derived from the raw frequency count.
    if freq >= 2:
        priority = "high"
    elif freq == 1:
        priority = "medium"
    else:
        priority = "low"

    topics.append({
        "id": i,
        "text": phrase,
        "freq": int(freq),
        "weight": float(weight),
        "priority": priority,
        "years": [],  # not populated by this notebook
    })

# Build the clusters list: one record per cluster id, carrying its member phrases.
cluster_list = []
for cid, topic_list in clusters.items():
    cluster_list.append({
        "id": int(cid),
        "label": f"Cluster {cid}",
        "freq": len(topic_list),
        "tags": [],  # not populated by this notebook
        "phrases": topic_list,
    })

data = {
    "topics": topics,
    "clusters": cluster_list,
    "questions": questions,
}

# Save the JSON into the React app's public folder, creating the folder if needed.
out_path = os.path.join("pyq-radar", "public", "pyq-data.json")
os.makedirs(os.path.dirname(out_path), exist_ok=True)

# ensure_ascii=False keeps non-ASCII characters as-is in the UTF-8 output file.
with open(out_path, "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print("Saved to:", os.path.abspath(out_path))


Saved to: c:\learnova_pyq\pyq-radar\public\pyq-data.json
