In [None]:
!apt-get install tesseract-ocr -y
!pip install pytesseract Pillow opencv-python

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [None]:
from PIL import Image
import pytesseract
from google.colab import files
from typing import List, Tuple, Dict
import random

# --- Text Extraction ---
def extract_text(image_path: str) -> str:
    """Extract text from an image using OCR.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The text returned by ``pytesseract.image_to_string`` for the image.
    """
    # Open the image in a ``with`` block so the underlying file handle is
    # released as soon as OCR finishes, even if OCR raises.
    with Image.open(image_path) as image:
        return pytesseract.image_to_string(image)

# --- Diagram Type Detection ---
def infer_diagram_type(text: str) -> str:
    """Identify the diagram type from keywords found in the extracted text.

    Matching is case-insensitive (the text is upper-cased first) and the
    rules are tried from most specific to least specific; the first rule
    that matches wins.

    Args:
        text: Text extracted from a diagram image.

    Returns:
        One of "DIRECT_ARRAY", "ARRAY_DECLARATION", "ARRAY_REFERENCE",
        "RAILROAD_9", "RAILROAD_10", "DEFINE_DECLARATION", "TRUTHSET",
        "FORMAT", "BOOLEAN", "DINTEGER", or "UNKNOWN" when no rule matches.
    """
    text_upper = text.upper()

    # Whole alphabetic words present in the text. The short FOR-loop keywords
    # are looked up here instead of with a substring test, so that "FOR" is
    # not found inside "FORMAT" and "DO" is not found inside "DOUBLE".
    words = set(
        "".join(ch if ch.isalpha() else " " for ch in text_upper).split()
    )

    # Check for specific diagram titles/headers first (most reliable)
    if "DIRECT ARRAY DECLARATION" in text_upper:
        return "DIRECT_ARRAY"
    if "ARRAY DECLARATION" in text_upper and "DIRECT" not in text_upper:
        return "ARRAY_DECLARATION"
    if "ARRAY REFERENCE" in text_upper:
        return "ARRAY_REFERENCE"

    # Check for Railroad-9 and 10 (specific patterns)
    if "AWAITOPEN" in text_upper:
        return "RAILROAD_9"
    if {"FOR", "DO"} <= words and words & {"STEP", "UNTIL", "WHILE"}:
        return "RAILROAD_10"

    # Check for other specific diagram patterns
    if "DEFINE DECLARATION" in text_upper:
        return "DEFINE_DECLARATION"
    if "DEFINE" in text_upper and "#" in text_upper:  # Define has # delimiter
        return "DEFINE_DECLARATION"
    if "TRUTHSET" in text_upper:
        return "TRUTHSET"
    # NOTE(review): "IN"/"OUT" are still substring tests here, so "IN" also
    # matches inside longer words; confirm whether whole-word matching is
    # wanted for this rule as well.
    if "FORMAT" in text_upper and ("IN" in text_upper or "OUT" in text_upper):
        return "FORMAT"

    # Check for simple type declarations (least specific, check last)
    if "BOOLEAN" in text_upper and "ARRAY" not in text_upper:
        return "BOOLEAN"
    if "DINTEGER" in text_upper and "ARRAY" not in text_upper:
        return "DINTEGER"

    # Fallback check for DIRECT ARRAY if the title wasn't caught
    if ("DIRECT" in text_upper and "ARRAY" in text_upper
            and "REFERENCE" not in text_upper):
        return "DIRECT_ARRAY"

    return "UNKNOWN"

# --- Combined Syntax Generator ---
class RailroadSyntaxGenerator:
    """Produce a textual syntax rule and sample statements per diagram type.

    Each ``_generate_*`` method returns a ``(syntax, examples)`` pair made of
    fixed strings; ``generate_syntax`` selects the method from the diagram
    type name.
    """

    def __init__(self):
        # Sample building blocks for Railroad-9 and Railroad-10. Stored on the
        # instance; none of the methods of this class read it.
        railroad_9_parts = {
            "file_parts": ["datafile", "config", "logfile"],
            "designators": ["MAINFILE", "SECFILE"],
            "subfile_indices": ["1", "2", "3"],
            "options": ["PARTICIPATE TRUE", "CONNECTTIMELIMIT 30"],
        }
        railroad_10_parts = {
            "variables": ["i", "j", "k"],
            "init_values": ["0", "IX", "IX + 7"],
            "steps": ["1", "2", "3"],
            "until_values": ["255", "LIM"],
            "conditions": ["NOT DONE", "TARGET LEQ RANGE"],
            "statements": ["PRINT(i)", "PROCESS(data)"],
        }
        self.rr_components = {
            "RAILROAD_9": railroad_9_parts,
            "RAILROAD_10": railroad_10_parts,
        }

    def generate_syntax(self, diagram_type: str) -> Tuple[str, List[str]]:
        """Return the syntax rule and examples for ``diagram_type``.

        Unrecognised types yield ``("Unknown diagram type", [])``.
        """
        # Ordered (type name, builder) pairs, compared with ``==`` one by one.
        dispatch = (
            ("RAILROAD_9", self._generate_railroad_9),
            ("ARRAY_REFERENCE", self._generate_array_reference),
            ("RAILROAD_10", self._generate_railroad_10),
            ("DINTEGER", self._generate_dinteger),
            ("DIRECT_ARRAY", self._generate_direct_array),
            ("TRUTHSET", self._generate_truthset),
            ("FORMAT", self._generate_format),
            ("BOOLEAN", self._generate_boolean),
            ("DEFINE_DECLARATION", self._generate_define_declaration),
            ("ARRAY_DECLARATION", self._generate_array_declaration),
        )
        for type_name, builder in dispatch:
            if diagram_type == type_name:
                return builder()
        return "Unknown diagram type", []

    def _generate_railroad_9(self) -> Tuple[str, List[str]]:
        """Syntax and examples for the AWAITOPEN diagram."""
        rule = "\n".join((
            "AWAITOPEN(<file_part>) [→ <file_designator>]",
            "    [SUBFILE <subfile_index>]",
            "    [, <awaitopen_option>]*",
        ))
        samples = [
            "AWAITOPEN(datafile) → MAINFILE",
            "AWAITOPEN(config) SUBFILE 1, PARTICIPATE TRUE",
        ]
        return rule, samples

    def _generate_railroad_10(self) -> Tuple[str, List[str]]:
        """Syntax and examples for the FOR ... DO diagram."""
        rule = "\n".join((
            "FOR <variable> := <initial_part>",
            "    [<iteration_part>] DO <statement>",
            "",
            "initial_part ::= <arithmetic_expression>",
            "iteration_part ::= ",
            "    STEP <step> UNTIL <until>",
            "    | WHILE <condition>",
            "    | UNTIL <until>",
        ))
        samples = [
            "FOR i := 0 STEP 1 UNTIL 255 DO PRINT(i)",
            "FOR j := IX + 7 WHILE NOT DONE DO PROCESS(data)",
        ]
        return rule, samples

    def _generate_dinteger(self) -> Tuple[str, List[str]]:
        """Syntax and examples for the DINTEGER declaration."""
        rule = "[PRIVATE OWN | PUBLIC OWN] DINTEGER <identifier> [, <identifier>]*"
        samples = [
            "DINTEGER A",
            "PRIVATE OWN DINTEGER A, B",
            "PUBLIC OWN DINTEGER A, B, C",
        ]
        return rule, samples

    def _generate_direct_array(self) -> Tuple[str, List[str]]:
        """Syntax and examples for the DIRECT ARRAY declaration."""
        rule = "\n".join((
            " DIRECT [OWN] ARRAY [<array class>] <identifier> [ [<bound pair list>] ]",
            "        [, <identifier> [ [<bound pair list>] ] ]*",
            "        [ <identifier> [<lower bound>] = <direct array row> ]",
        ))
        samples = [
            "DIRECT ARRAY A",
            "OWN DIRECT ARRAY MYCLASS A [1:10]",
            "A[1] = B[row]",
        ]
        return rule, samples

    def _generate_truthset(self) -> Tuple[str, List[str]]:
        """Syntax and examples for the TRUTHSET declaration."""
        rule = "\n".join((
            "TRUTHSET <identifier> ( <membership expression> ) [, <identifier> ( <membership expression> )]*",
            "membership expression ::= [NOT] <membership primary> [AND|OR|!|IMP|EQV <membership primary>]*",
            "membership primary ::= <string literal> | <truth set identifier> | ( <membership expression> ) | ALPHA | ALPHA7 | ALPHA8",
        ))
        samples = [
            "TRUTHSET A ( ALPHA AND ALPHA7 )",
            "TRUTHSET B ( ! A )",
            "TRUTHSET X ( ( ALPHA OR ALPHA8 ) IMP B )",
        ]
        return rule, samples

    def _generate_format(self) -> Tuple[str, List[str]]:
        """Syntax and examples for the FORMAT declaration."""
        rule = "\n".join((
            "FORMAT [IN | OUT] <identifier> ( <editing specifications> )",
            "editing specifications ::= <simple string literal> | <editing phrase> | <repeat part> ( <editing specifications> ) [, ...]",
            "editing phrase ::= G | O | A <field width> | H | K | ... | S <scale factor> | <editing modifier>",
            "repeat part ::= <unsigned integer> *",
        ))
        samples = [
            "FORMAT IN FMT1 ( 'Hello' )",
            "FORMAT OUT FMT2 ( 2*('Hi', G, 3*H) )",
            "FORMAT FMT ( A5, /, V10.2 )",
        ]
        return rule, samples

    def _generate_array_reference(self) -> Tuple[str, List[str]]:
        """Syntax and examples for the ARRAY REFERENCE declaration."""
        rule = "\n".join((
            "[PRIVATE | PUBLIC] DIRECT <array class> ARRAY REFERENCE <identifier> [ <lower bounds> ]",
            "[, <identifier> [ <lower bounds> ]]*",
            "",
            "<array class> ::= <word type> | <character type>",
            "<word type> ::= BOOLEAN | COMPLEX | DINTEGER | DOUBLE | INTEGER | REAL",
            "<character type> ::= ASCII | EBCDIC | HEX",
            "<lower bounds> ::= <arithmetic expression> [, <arithmetic expression>]*",
        ))
        samples = [
            "DIRECT INTEGER ARRAY REFERENCE A[1,2]",
            "PUBLIC DIRECT BOOLEAN ARRAY REFERENCE X, Y[2]",
            "PRIVATE DIRECT ASCII ARRAY REFERENCE ARR[1,2,3]",
        ]
        return rule, samples

    def _generate_boolean(self) -> Tuple[str, List[str]]:
        """Syntax and examples for the BOOLEAN declaration."""
        rule = "[PRIVATE OWN | PUBLIC OWN] BOOLEAN <identifier> [ = <identifier> ] [, <identifier> [ = <identifier> ] ]*"
        samples = [
            "BOOLEAN A",
            "BOOLEAN A = B",
            "PRIVATE OWN BOOLEAN A, B = C",
            "PUBLIC OWN BOOLEAN X, Y = Z",
        ]
        return rule, samples

    def _generate_define_declaration(self) -> Tuple[str, List[str]]:
        """Syntax and examples for the DEFINE declaration."""
        rule = "\n".join((
            "DEFINE <definition> [, <definition>]*",
            "",
            "<definition> ::= <identifier> [<formal symbol part>] = <text> #",
            "<formal symbol part> ::= ( <symbol> [, <symbol>]* )",
        ))
        samples = [
            "DEFINE PI = 3.14159 #",
            "DEFINE ADD(x, y) = x + y #",
            "DEFINE MSG = 'Hello World' #",
            "DEFINE MAX(a, b) = a > b ? a : b #",
        ]
        return rule, samples

    def _generate_array_declaration(self) -> Tuple[str, List[str]]:
        """Syntax and examples for the ARRAY declaration."""
        rule = "\n".join((
            "[PRIVATE | PUBLIC] [LONG] [OWN] <array class> ARRAY <identifier> [ [<bound pair list>] ]",
            "[, <identifier> [ [<bound pair list>] ] ]*",
            "[<array row equivalence>]",
            "",
            "<array class> ::= <word type> | <character type>",
            "<word type> ::= BOOLEAN | COMPLEX | DINTEGER | DOUBLE | INTEGER | REAL",
            "<character type> ::= ASCII | EBCDIC | HEX",
            "<bound pair list> ::= <bound pair> [, <bound pair>]*",
            "<bound pair> ::= <lower bound> : <upper bound>",
            "<lower bound> ::= <arithmetic expression>",
            "<upper bound> ::= <arithmetic expression>",
            "<array row equivalence> ::= <identifier> [ <lower bound> ] = <array row>",
        ))
        samples = [
            "PRIVATE OWN INTEGER ARRAY A[1:10, 1:5]",
            "PUBLIC COMPLEX ARRAY B[0:9]",
            "LONG HEX ARRAY H[1:4]",
            "A[1] = ROWDATA",
        ]
        return rule, samples

# --- Main Processing Function ---
def process_diagram(diagram_type: str) -> Tuple[str, List[str]]:
    """Return the (syntax, examples) pair for ``diagram_type``.

    Creates a fresh ``RailroadSyntaxGenerator`` on every call and delegates
    to its ``generate_syntax`` method.
    """
    return RailroadSyntaxGenerator().generate_syntax(diagram_type)

# --- Upload and Process Images ---
def main():
    """Upload images, classify each diagram, and print its syntax summary.

    For every uploaded file name: run OCR on it, infer the diagram type from
    the text, then print the inferred type, the generated syntax rule and
    the example statements.
    """
    # presumably a mapping keyed by uploaded file name, with the files also
    # saved under those names -- confirm against google.colab's files.upload
    uploads = files.upload()

    for name in uploads:
        print(f"\n========== Processing: {name} ==========")
        kind = infer_diagram_type(extract_text(name))

        # Underscores are replaced only for display; ``kind`` itself is
        # passed on unchanged.
        label = kind.replace('_', ' ')
        print(f"\n=== Inferred Diagram Type: {label} ===")

        rule, samples = process_diagram(kind)

        print("\n=== Generated Syntax ===")
        print(rule)

        print("\n=== Example Statements ===")
        for sample in samples:
            print(f"- {sample}")

# Entry point: run the upload-and-process workflow only when this code is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()

Saving rr10.png to rr10.png


=== Inferred Diagram Type: RAILROAD 10 ===

=== Generated Syntax ===
FOR <variable> := <initial_part>
    [<iteration_part>] DO <statement>

initial_part ::= <arithmetic_expression>
iteration_part ::= 
    STEP <step> UNTIL <until>
    | WHILE <condition>
    | UNTIL <until>

=== Example Statements ===
- FOR i := 0 STEP 1 UNTIL 255 DO PRINT(i)
- FOR j := IX + 7 WHILE NOT DONE DO PROCESS(data)
