In [None]:
import html
import io
import re
from typing import Any, Dict, List, Optional

import ipywidgets as widgets
from google.colab import files
from IPython.display import display


class RISConverter:
    """
    Convert one-line bibliographic entries to RIS (Research Information Systems) format.

    Each entry is expected on its own line and must start with three or more
    Latin capitals (the first author's surname), for example
    ``SURNAME, I. et al. Title / Authors // Journal, volume, ...``.
    A single '/' separates the title from the detailed author list and a
    double '//' separates the authors from the journal information.
    """

    # A valid entry starts with three or more Latin uppercase letters.
    _ENTRY_START = re.compile(r'[A-Z]{3,}')
    # Leading author citation "SURNAME, I." with optional "et al."; compound
    # surnames joined by a hyphen or a space are accepted as well.
    _LEAD_AUTHOR = re.compile(r'[A-Z]+(?:[- ][A-Z]+)*,\s+[A-Z]\.(?:\s+et\s+al\.)?')
    _LATIN_LETTER = re.compile(r'[a-zA-Z]')
    _CYRILLIC_LETTER = re.compile(r'[\u0400-\u04FF]')
    # Journal name: text between '//' and the next comma.
    _JOURNAL = re.compile(r'//\s*([^,]+)')
    # Volume: the number right after the comma that follows the journal name.
    _VOLUME = re.compile(r'//[^,]+,\s*(\d+)')
    _YEAR = re.compile(r'\b(?:19|20)\d{2}\b')
    # Issue marker "N", "No", "No." or the numero sign. The word boundary keeps
    # the trailing N of "ISSN" (or the N in "H1N1") from being taken as a marker.
    _ISSUE = re.compile(r'(?:\b(?:No\.?|N)|\u2116)\s*(\d+)')
    # ISSN digits with an optional trailing 'X' check character.
    _ISSN = re.compile(r'ISSN\s*:?\s*(\d[\d-]*(?:[Xx]\b)?)')
    # Page marker: Cyrillic "с." (U+0441), its Latin look-alike "c.", or
    # "p."/"pp.", not preceded by a letter (so "etc." or "Dec." do not match).
    _PAGES = re.compile(
        r'(?<![^\W\d_])(?:\u0441|c|pp?)\.\s*(\d+(?:\s*[-\u2013\u2014]\s*\d+)?)'
    )
    # Hyphen, en dash or em dash with optional surrounding whitespace.
    _DASH = re.compile(r'\s*[-\u2013\u2014]\s*')

    def __init__(self):
        # Not read or written by any method of this class.
        self.ris_records: List[Dict[str, Any]] = []

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _split_at_last_slash(text: str):
        """Split *text* at its last '/'; return (before, after) or (text, None) if there is none."""
        before, slash, after = text.rpartition('/')
        if not slash:
            return text, None
        return before, after

    @staticmethod
    def _search_scopes(line: str):
        """Return the texts to search for journal data: the part after '//' first, then the whole line."""
        _, separator, journal_part = line.partition('//')
        return (journal_part, line) if separator else (line,)

    def _is_latin_dominant(self, text: str) -> bool:
        """Return True when *text* has more Latin letters than Cyrillic letters."""
        latin = len(self._LATIN_LETTER.findall(text))
        cyrillic = len(self._CYRILLIC_LETTER.findall(text))
        return latin > cyrillic

    def _pick_latin_title(self, title_part: str) -> str:
        """Drop the non-Latin parts of a '; '-separated parallel title."""
        parts = [part.strip() for part in title_part.split('; ')]
        latin_parts = [part for part in parts if self._is_latin_dominant(part)]
        if len(latin_parts) == len(parts):
            # Every part is Latin: the semicolon is ordinary punctuation inside
            # a single title, so nothing must be dropped.
            return title_part
        if latin_parts:
            return '; '.join(latin_parts)
        # No Latin part could be identified: default to the last title.
        return parts[-1]

    @staticmethod
    def _page_lines(pages: str) -> List[str]:
        """Render a page string as SP/EP lines (a single SP line when it is not a simple range)."""
        pages = pages.strip()
        bounds = pages.split('-')
        if len(bounds) != 2:
            # No hyphen, or more than one: treat the whole string as the start page.
            return [f"SP  - {pages}"]
        tagged = zip(('SP', 'EP'), (bound.strip() for bound in bounds))
        return [f"{tag}  - {value}" for tag, value in tagged if value]

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def filter_valid_lines(self, text: str) -> List[str]:
        """
        Keep the lines that start with three or more Latin uppercase characters.

        Args:
            text (str): Multi-line input text

        Returns:
            List[str]: Stripped valid lines, in input order
        """
        stripped_lines = (raw.strip() for raw in text.strip().split('\n'))
        return [line for line in stripped_lines if self._ENTRY_START.match(line)]

    def extract_title(self, line: str) -> Optional[str]:
        """
        Extract the title from a bibliographic line.

        The title is the text after the leading author citation
        ("SURNAME, I." optionally followed by "et al.") up to the last single
        '/' before '//'. For parallel titles separated by "; " only the parts
        that are mostly Latin are kept; a title whose parts are all Latin is
        returned unchanged.

        Args:
            line (str): Bibliographic line

        Returns:
            Optional[str]: Extracted title, or None when the line does not
            start with an author citation or the title is empty
        """
        lead = self._LEAD_AUTHOR.match(line)
        if lead is None:
            return None

        # Text before the first '//' cannot contain '//', so its last '/' is
        # the single slash that starts the detailed author list.
        before_journal = line[lead.end():].strip().split('//')[0]
        title_part, _ = self._split_at_last_slash(before_journal)
        title_part = title_part.strip()

        if '; ' in title_part:
            title_part = self._pick_latin_title(title_part)

        return title_part or None

    def extract_authors(self, line: str) -> List[str]:
        """
        Extract author names from a bibliographic line.

        Authors are the comma-separated names between the last single '/' and
        the double '//' (journal separator).

        Args:
            line (str): Bibliographic line

        Returns:
            List[str]: Author names (empty when there is no author section)
        """
        _, author_part = self._split_at_last_slash(line.split('//')[0])
        if author_part is None:
            return []

        authors = []
        for name in author_part.split(','):
            name = name.strip()
            if len(name) <= 1:  # skip empty pieces and single characters
                continue
            name = name.rstrip('.')
            if name:  # a piece made only of periods is not a name
                authors.append(name)
        return authors

    def extract_journal(self, line: str) -> Optional[str]:
        """
        Extract the journal name: the text between '//' and the next comma.

        Args:
            line (str): Bibliographic line

        Returns:
            Optional[str]: Journal name
        """
        found = self._JOURNAL.search(line)
        return found.group(1).strip() if found else None

    def extract_volume(self, line: str) -> Optional[str]:
        """
        Extract the volume: the number right after the journal name's comma.

        When that number looks like a year and is the only year in the journal
        part (e.g. "// Journal, 2018, N 4"), it is the publication year rather
        than a volume, and None is returned.

        Args:
            line (str): Bibliographic line

        Returns:
            Optional[str]: Volume number
        """
        found = self._VOLUME.search(line)
        if found is None:
            return None

        candidate = found.group(1)
        if self._YEAR.fullmatch(candidate):
            journal_part = self._ISSN.sub(' ', line[found.start():])
            if len(self._YEAR.findall(journal_part)) <= 1:
                return None
        return candidate

    def extract_year(self, line: str) -> Optional[str]:
        """
        Extract the publication year (a 4-digit number starting with 19 or 20).

        The journal part (after '//') is searched first so that a year quoted
        in the title is not mistaken for the publication year; ISSN digits are
        ignored. The whole line is used as a fallback.

        Args:
            line (str): Bibliographic line

        Returns:
            Optional[str]: Publication year
        """
        for scope in self._search_scopes(line):
            found = self._YEAR.search(self._ISSN.sub(' ', scope))
            if found:
                return found.group(0)
        return None

    def extract_issue(self, line: str) -> Optional[str]:
        """
        Extract the issue number (after "N", "No", "No." or the numero sign).

        The journal part (after '//') is searched first, then the whole line.

        Args:
            line (str): Bibliographic line

        Returns:
            Optional[str]: Issue number
        """
        for scope in self._search_scopes(line):
            found = self._ISSUE.search(scope)
            if found:
                return found.group(1)
        return None

    def extract_issn(self, line: str) -> Optional[str]:
        """
        Extract the ISSN, including a trailing 'X' check character if present.

        Args:
            line (str): Bibliographic line

        Returns:
            Optional[str]: ISSN number
        """
        found = self._ISSN.search(line)
        return found.group(1) if found else None

    def extract_pages(self, line: str) -> Optional[str]:
        """
        Extract the page or page range (after "с.", "c.", "p." or "pp.").

        The journal part (after '//') is searched first, then the whole line.
        En and em dashes in a range are normalised to a plain hyphen.

        Args:
            line (str): Bibliographic line

        Returns:
            Optional[str]: Page ("12") or page range ("12-15")
        """
        for scope in self._search_scopes(line):
            found = self._PAGES.search(scope)
            if found:
                return self._DASH.sub('-', found.group(1))
        return None

    def parse_line_to_ris(self, line: str) -> Dict[str, Any]:
        """
        Parse a single bibliographic line into RIS fields.

        Args:
            line (str): Bibliographic line

        Returns:
            Dict[str, Any]: RIS fields keyed by tag; 'AU' is a list of names,
            'PG' is the raw page string, missing values are None
        """
        return {
            'TY': 'JOUR',  # Always journal article
            'TI': self.extract_title(line),
            'AU': self.extract_authors(line),
            'JO': self.extract_journal(line),
            'VL': self.extract_volume(line),
            'PY': self.extract_year(line),
            'IS': self.extract_issue(line),
            'SN': self.extract_issn(line),
            'PG': self.extract_pages(line),
        }

    def format_ris_record(self, record: Dict[str, Any]) -> str:
        """
        Format a single RIS record as a string.

        Tags are written in the order TY, TI, AU (one line per author), JO,
        VL, PY, IS, SN, SP/EP, ER. Missing or empty fields are skipped.

        Args:
            record (Dict[str, Any]): RIS record dictionary

        Returns:
            str: Formatted RIS record
        """
        # Type of reference is always first.
        ris_lines = [f"TY  - {record.get('TY') or 'JOUR'}"]

        if record.get('TI'):
            ris_lines.append(f"TI  - {record['TI']}")

        ris_lines.extend(
            f"AU  - {author}" for author in (record.get('AU') or []) if author
        )

        for tag in ('JO', 'VL', 'PY', 'IS', 'SN'):
            value = record.get(tag)
            if value:
                ris_lines.append(f"{tag}  - {value}")

        # Pages: SP/EP for a range, otherwise a single SP.
        pages = record.get('PG')
        if pages:
            ris_lines.extend(self._page_lines(pages))

        # End of record
        ris_lines.append("ER  - ")
        return '\n'.join(ris_lines)

    def process_text_to_ris(self, input_text: str) -> str:
        """
        Process multi-line input text and generate RIS output.

        Args:
            input_text (str): Multi-line bibliographic text

        Returns:
            str: RIS records separated by a blank line; an empty string for
            empty input; an explanatory message (not RIS) when no line
            qualifies as an entry
        """
        if not input_text or not input_text.strip():
            return ""

        valid_lines = self.filter_valid_lines(input_text)
        if not valid_lines:
            return "No valid bibliographic entries found. Please ensure entries start with 3+ uppercase letters."

        return '\n\n'.join(
            self.format_ris_record(self.parse_line_to_ris(line))
            for line in valid_lines
        )


class ColabRISInterface:
    """
    Interactive Google Colab interface for RIS conversion.

    Builds an ipywidgets form (input text area, Convert / Download / Clear
    buttons, a status line and an output area) around a RISConverter and
    keeps the most recent RIS text in ``self.ris_output`` for download.
    """

    # Status line shown initially and after "Clear All".
    _READY_STATUS = '<p style="color: #666; text-align: center;">Ready to convert bibliographic text to RIS format</p>'

    def __init__(self):
        self.converter = RISConverter()
        self.ris_output = ""  # last successfully generated RIS text
        self.setup_interface()

    def setup_interface(self):
        """
        Create the widgets, wire the button callbacks and assemble the layout.
        """
        # Title
        title_html = widgets.HTML("""
        <div style="text-align: center; margin-bottom: 20px;">
            <h2 style="color: #1f77b4;">📚 Bibliographic Text to RIS Converter</h2>
            <p style="color: #666;">Paste your bibliographic text below and convert it to RIS format</p>
        </div>
        """)

        # Input text area
        self.text_input = widgets.Textarea(
            value='',
            placeholder='Paste your bibliographic text here...\n\nExample:\nZAYKOVA, K. et al. Antibiotic prescribing practices...',
            description='Input Text:',
            layout=widgets.Layout(width='100%', height='200px'),
            style={'description_width': 'initial'}
        )

        # Buttons
        self.convert_button = widgets.Button(
            description='🔄 Convert to RIS',
            button_style='info',
            layout=widgets.Layout(width='200px', margin='10px 5px')
        )

        # Download stays disabled until a conversion has produced records.
        self.download_button = widgets.Button(
            description='📥 Download RIS File',
            button_style='success',
            layout=widgets.Layout(width='200px', margin='10px 5px'),
            disabled=True
        )

        self.clear_button = widgets.Button(
            description='🗑️ Clear All',
            button_style='warning',
            layout=widgets.Layout(width='200px', margin='10px 5px')
        )

        # Button box
        button_box = widgets.HBox([
            self.convert_button,
            self.download_button,
            self.clear_button
        ], layout=widgets.Layout(justify_content='center'))

        # Output area
        self.output_area = widgets.Output(
            layout=widgets.Layout(
                width='100%',
                height='400px',
                border='1px solid #ddd',
                margin='10px 0px',
                padding='10px'
            )
        )

        # Status label
        self.status_label = widgets.HTML(value=self._READY_STATUS)

        # Bind events
        self.convert_button.on_click(self.on_convert_click)
        self.download_button.on_click(self.on_download_click)
        self.clear_button.on_click(self.on_clear_click)

        # Layout
        self.interface = widgets.VBox([
            title_html,
            self.text_input,
            button_box,
            self.status_label,
            self.output_area
        ])

    def on_convert_click(self, button):
        """
        Handle a click on the convert button.

        Converts the pasted text, prints the result into the output area and
        enables the download button only when at least one record was
        produced.
        """
        with self.output_area:
            self.output_area.clear_output()

            input_text = self.text_input.value.strip()

            if not input_text:
                self.status_label.value = '<p style="color: #d9534f;">⚠️ Paste some text first</p>'
                return

            self.status_label.value = '<p style="color: #5bc0de;">🔄 Converting to RIS format...</p>'

            try:
                result = self.converter.process_text_to_ris(input_text)
            except Exception as e:
                # UI boundary: report the failure instead of letting the
                # widget callback die silently.
                self.ris_output = ""
                # The message is placed into HTML, so escape it.
                self.status_label.value = f'<p style="color: #d9534f;">❌ Error: {html.escape(str(e))}</p>'
                self.download_button.disabled = True
                print(f"Error during conversion: {str(e)}")
                return

            # Count records
            record_count = result.count('TY  - JOUR')

            if record_count == 0:
                # The converter returns an explanatory message (not RIS) when
                # no line qualifies; that must not be reported as a success
                # or offered for download.
                self.ris_output = ""
                self.status_label.value = '<p style="color: #d9534f;">❌ No valid entries found</p>'
                self.download_button.disabled = True
                if result:
                    print(result)
                return

            self.ris_output = result

            # Display output
            print("Generated RIS Output:")
            print("=" * 60)
            print(self.ris_output)
            print("=" * 60)
            print(f"Total records converted: {record_count}")

            # Update status and enable download
            self.status_label.value = f'<p style="color: #5cb85c;">✅ Successfully converted {record_count} record(s) to RIS format</p>'
            self.download_button.disabled = False

    def on_download_click(self, button):
        """
        Handle a click on the download button.

        Writes the current RIS text to ``bibliography.ris`` and hands the
        file to ``files.download``.
        """
        if not self.ris_output:
            self.status_label.value = '<p style="color: #d9534f;">⚠️ No RIS data to download. Please convert first.</p>'
            return

        try:
            filename = 'bibliography.ris'

            # files.download() takes a file name, so write the text first.
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(self.ris_output)

            files.download(filename)

            self.status_label.value = '<p style="color: #5cb85c;">📥 RIS file downloaded successfully!</p>'

        except Exception as e:
            # UI boundary: surface the problem in the status line.
            self.status_label.value = f'<p style="color: #d9534f;">❌ Download error: {html.escape(str(e))}</p>'
            print(f"Download error: {str(e)}")

    def on_clear_click(self, button):
        """
        Handle a click on the clear button: reset input, output and status.
        """
        self.text_input.value = ''
        self.ris_output = ''
        self.download_button.disabled = True
        self.output_area.clear_output()
        self.status_label.value = self._READY_STATUS

    def display(self):
        """
        Display the interface.
        """
        display(self.interface)


# Instructions for Google Colab users
def show_instructions():
    """
    Display the usage instructions as an HTML panel.

    The wording does not assume where the panel sits relative to the input
    area: create_ris_converter() shows the instructions before the
    interface, so the input area is not "above" them.
    """
    instructions_markup = """
    <div style="background-color: #f8f9fa; padding: 20px; border-radius: 10px; margin: 20px 0;">
        <h3 style="color: #495057; margin-top: 0;">📋 Instructions</h3>
        <ol style="color: #6c757d; line-height: 1.6;">
            <li><strong>Paste your bibliographic text</strong> into the "Input Text" area</li>
            <li>The text should contain entries starting with <strong>3+ uppercase letters</strong> (e.g., "ZAYKOVA, K. et al...")</li>
            <li>Click <strong>"Convert to RIS"</strong> to process the text</li>
            <li>Review the generated RIS output in the area below</li>
            <li>Click <strong>"Download RIS File"</strong> to save the result</li>
        </ol>

        <h4 style="color: #495057; margin-top: 20px;">✨ Features</h4>
        <ul style="color: #6c757d; line-height: 1.6;">
            <li>Automatically handles dual-language titles (keeps English version)</li>
            <li>Extracts authors, journal names, volumes, years, issues, ISSN, and page numbers</li>
            <li>Generates proper RIS format compatible with reference managers</li>
            <li>Real-time processing with error handling</li>
        </ul>
    </div>
    """
    display(widgets.HTML(instructions_markup))


# Main function to run in Google Colab
def create_ris_converter():
    """
    Build the RIS converter UI, show it, and hand it back to the caller.

    Prints a start-up banner, displays the usage instructions, then creates
    and displays a ColabRISInterface.

    Returns:
        ColabRISInterface: The displayed interface instance
    """
    print("🚀 Initializing RIS Converter for Google Colab...")

    # Instructions go first, the interactive form follows.
    show_instructions()

    ui = ColabRISInterface()
    ui.display()
    return ui


# Entry point: build and show the interface when this code is executed
# directly (as a script, or as a notebook cell, where __name__ is "__main__").
if __name__ == "__main__":
    converter_interface = create_ris_converter()
else:
    # Imported as a module: do not build the UI automatically; only tell the
    # caller how to start it.
    print("📚 RIS Converter loaded! Run create_ris_converter() to start.")

🚀 Initializing RIS Converter for Google Colab...


HTML(value='\n    <div style="background-color: #f8f9fa; padding: 20px; border-radius: 10px; margin: 20px 0;">…

VBox(children=(HTML(value='\n        <div style="text-align: center; margin-bottom: 20px;">\n            <h2 s…

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>