# NPS Corporate

In [1]:
#import pandas as pd
import re
from IPython.display import Markdown, HTML, display, FileLink
from ThaiTextPrepKit import polars_pretextkit as preprocess, typo_patterns as TYPO
import polars as pl
import pythainlp

In [2]:
def highlight_patterns(patterns, text, html=True, highlight_color_replace="#FFFF00", highlight_color_match="#00FFFF"):
    """
    Highlight every regex match found in ``text``, wrapping each matched span exactly once.

    Matches are located on the original ``text`` and the result is assembled in a
    single left-to-right pass, so markup inserted for one match is never re-scanned
    or rewritten by a later one.

    :param patterns: An iterable of ``(regex_pattern, replacement)`` tuples. A replacement may be
        wrapped in ``<IGNORE>...</IGNORE>`` markers; the markers are removed before comparison.
    :param text: The text to search within.
    :param html: If True, wrap matches in ``<span style="background-color: ...">``;
        otherwise wrap them in ``<typo>...</typo>`` (the colors are then unused).
    :param highlight_color_replace: Background color for matches that do not contain
        their replacement text (default is yellow).
    :param highlight_color_match: Background color for matches that already contain
        their replacement text (default is cyan).
    :return: The text with highlighted matches.
    """
    ignore_open, ignore_close = '<IGNORE>', '</IGNORE>'

    def add_highlight(matched_text, color):
        if html:
            return f'<span style="background-color: {color};">{matched_text}</span>'
        return f'<typo>{matched_text}</typo>'

    def strip_ignore_markers(replacement):
        # str.lstrip/rstrip remove any of the given *characters*, not the marker
        # string, which would eat replacements such as "ERROR" entirely.
        if replacement.startswith(ignore_open):
            replacement = replacement[len(ignore_open):]
        if replacement.endswith(ignore_close):
            replacement = replacement[:-len(ignore_close)]
        return replacement

    # Accepted (start, end, color) spans; earlier patterns win when spans overlap.
    spans = []
    for pattern, replacement in patterns:
        replacement = strip_ignore_markers(replacement)
        for match in re.finditer(pattern, text):
            start, end = match.span()
            if start == end:
                # A zero-length match has nothing to highlight.
                continue
            if any(start < taken_end and taken_start < end for taken_start, taken_end, _ in spans):
                continue
            if replacement in match.group(0):
                color = highlight_color_match
            else:
                color = highlight_color_replace
            spans.append((start, end, color))

    spans.sort()

    pieces = []
    cursor = 0
    for start, end, color in spans:
        pieces.append(text[cursor:start])
        pieces.append(add_highlight(text[start:end], color))
        cursor = end
    pieces.append(text[cursor:])

    return ''.join(pieces)

In [3]:
# Typo rules taken from ThaiTextPrepKit's typo_patterns module; the cells below
# iterate them as (regex, replacement) pairs.
patterns = TYPO.patterns

In [4]:
def get_highlight_texts(patterns, texts: list) -> list:
    """
    Apply ``highlight_patterns`` (HTML mode, default colors) to every text.

    :param patterns: The ``(regex_pattern, replacement)`` tuples passed through to ``highlight_patterns``.
    :param texts: The texts to highlight.
    :return: A new list with one highlighted string per input text, in the same order.
    """
    return [highlight_patterns(patterns, text) for text in texts]

In [5]:
# Names of the text columns to process.
text_cols = ['Text']

In [6]:
# Location of the Excel workbook to analyse.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run elsewhere without editing it. Consider a configurable data directory.
path = r"C:\Users\patom\OneDrive\Documents\Repo\DataPrepKit\Samples\(QC) Social_4.xlsx"
#path = "/Users/pa/Documents/GitHub Repository Clone/DataPrepKit/Samples/NPS Reason HY'24-textprocessing test1.xlsx"

# Read the workbook with polars using the calamine engine.
df = pl.read_excel(path,
                   engine='calamine')

# Gather the columns named in text_cols into a single list-typed column 'text'.
concat_df = df.with_columns(
    pl.concat_list(text_cols).alias('text')
)

In [7]:
# Expand the 'text' column to one row per element, then discard rows whose 'text' is null.
concat_df = concat_df.explode('text').drop_nulls('text')

In [8]:
# Sanity check: (rows, columns) of the frame after exploding and dropping nulls.
concat_df.shape

(14304, 46)

In [9]:
# Plain Python list of the 'text' column values, used as input to highlighting and preprocessing.
original_texts = concat_df.get_column('text').to_list()

In [10]:
# Highlight pattern matches in every original text.
html_texts = get_highlight_texts(patterns, original_texts)

# Render the first five highlighted texts inline as a quick visual check.
for text in html_texts[:5]:
    display(Markdown(text))

In [None]:
#skip_patterns = [re.compile(rf'(เบอ[ร]*[์]*โท[ร]*[สศ]ั[พบ][ท]*[์]*)|(เบอ[ร์]*โท[ร]*)|(เบอ(?!ะ)[ร]*[์]*)', re.IGNORECASE),]

In [None]:
# Run the ThaiTextPrepKit batch preprocessor over the original texts.
# NOTE(review): the meaning of each keyword argument is defined by
# preprocess_text_batches in ThaiTextPrepKit, not here -- confirm against that
# library before changing any of them. Presumably `patterns` supplies the typo
# rules to apply.
series = pl.Series(original_texts)
pre_series = preprocess.preprocess_text_batches(series=series,
                                   custom_dict=None,
                                   keep_stopwords=True,
                                   keep_format=True,
                                   return_token_list=False,
                                   lower_case=False,
                                   include_pattern='[/()]',
                                   skip_patterns=None,
                                   patterns=patterns)

# Print the first five preprocessed texts as a quick check.
for t in pre_series[:5]:
    print(t)

‡πÅ‡∏≠‡∏õ‡∏û‡∏•‡∏¥‡πÄ‡∏Ñ‡∏ä‡∏±‡∏ô lotus ‡πÄ‡∏Ç‡πâ‡∏≤‡πÄ‡∏á‡∏∑‡πà‡∏≠‡∏ô‡πÑ‡∏Ç
‡∏ú‡∏π‡∏Å‡∏Å‡∏±‡∏ö‡πÅ‡∏≠‡∏õ‡∏û‡∏•‡∏¥‡πÄ‡∏Ñ‡∏ä‡∏±‡∏ô ‡πÅ‡∏Å‡∏£‡∏ö ‡πÅ‡∏•‡∏∞‡πÅ‡∏≠‡∏õ‡∏û‡∏•‡∏¥‡πÄ‡∏Ñ‡∏ä‡∏±‡∏ô lotus ‡πÄ‡∏Ç‡πâ‡∏≤‡πÄ‡∏á‡∏∑‡πà‡∏≠‡∏ô‡πÑ‡∏Ç cashback 5% ‡∏£‡∏∂‡πÑ‡∏°‡πà‡∏Ñ‡πà‡∏∞


In [None]:
def generate_html_table(*args):
    """
    Generate an HTML table with one column per text list, using the given names as headers.

    Cell and header values are inserted verbatim (no HTML escaping), so cells may
    carry their own markup.

    Args:
    - *args (tuple): Variable number of ``(text_list, name)`` tuples. Lists may have
      different lengths; shorter columns are padded with empty cells. With no
      arguments, a table containing only an empty header row is returned.

    Returns:
    - html_content (str): String containing the HTML table.
    """
    header_cells = "".join(f"<th>{name}</th>" for _, name in args)

    # default=0 keeps the call valid when no columns are supplied.
    row_count = max((len(text_list) for text_list, _ in args), default=0)

    # Collect fragments and join once instead of growing a string in the loop.
    parts = ["<table border='1'>\n", f"<tr>{header_cells}</tr>\n"]
    for i in range(row_count):
        row_cells = "".join(
            f"<td>{text_list[i] if i < len(text_list) else ''}</td>"
            for text_list, _ in args
        )
        parts.append(f"<tr>{row_cells}</tr>\n")
    parts.append("</table>")

    return "".join(parts)

In [None]:
# Preview only the first few preprocessed texts; showing every row floods the notebook output.
pre_series[:5].to_list()

['‡∏ó‡∏µ‡πà‡πÅ‡∏≠‡∏õ‡∏û‡∏•‡∏¥‡πÄ‡∏Ñ‡∏ä‡∏±‡∏ô‡∏Ç‡∏≠‡∏á AEON ‡∏à‡∏∞‡∏°‡∏µ‡∏Å‡∏≤‡∏£‡πÉ‡∏´‡πâ‡πÄ‡∏£‡∏≤‡πÄ‡∏•‡∏∑‡∏≠‡∏Å‡∏Ñ‡∏£‡∏±‡∏ö‡∏ß‡πà‡∏≤‡∏à‡∏∞‡∏à‡πà‡∏≤‡∏¢‡∏Ç‡∏±‡πâ‡∏ô‡∏ï‡πà‡∏≥‡∏´‡∏£‡∏∑‡∏≠‡πÑ‡∏°‡πà',
 '‡∏™‡∏°‡∏±‡∏Ñ‡∏£ ‡∏ö‡∏±‡∏ï‡∏£‡πÄ‡∏Ñ‡∏£‡∏î‡∏¥‡∏ï ‡∏≠‡∏¥‡∏≠‡∏≠‡∏ô ‡∏ú‡πà‡∏≤‡∏ô‡πÅ‡∏≠‡∏õ‡∏û‡∏•‡∏¥‡πÄ‡∏Ñ‡∏ä‡∏±‡∏ô ‡∏£‡∏±‡∏ö‡∏ö‡∏±‡∏ï‡∏£‡∏™‡∏≤‡∏Ç‡∏≤‡πÑ‡∏î‡πâ‡πÑ‡∏´‡∏° ‡∏™‡∏°‡∏±‡∏Ñ‡∏£ ‡∏ö‡∏±‡∏ï‡∏£‡πÄ‡∏Ñ‡∏£‡∏î‡∏¥‡∏ï ‡∏≠‡∏¥‡∏≠‡∏≠‡∏ô ‡∏ú‡πà‡∏≤‡∏ô‡πÅ‡∏≠‡∏õ‡∏û‡∏•‡∏¥‡πÄ‡∏Ñ‡∏ä‡∏±‡∏ô‡∏Ç‡∏∂‡πâ‡∏ô‡∏ú‡πà‡∏≤‡∏ô‡πÅ‡∏•‡πâ‡∏ß ‡∏£‡∏±‡∏ö‡∏ö‡∏±‡∏ï‡∏£‡∏™‡∏≤‡∏Ç‡∏≤‡πÑ‡∏î‡πâ‡πÑ‡∏´‡∏°',
 '‡∏™‡∏°‡∏±‡∏Ñ‡∏£‡∏ö‡∏±‡∏ï‡∏£ Aeon ‡πÉ‡∏ô‡πÅ‡∏≠‡∏õ‡∏û‡∏•‡∏¥‡πÄ‡∏Ñ‡∏ä‡∏±‡∏ô ‡∏Ç‡∏∂‡πâ‡∏ô‡∏≠‡∏ô‡∏∏‡∏°‡∏±‡∏ï‡∏¥‡πÅ‡∏•‡πâ‡∏ß‡∏à‡∏∞‡πÑ‡∏î‡πâ‡∏ö‡∏±‡∏ï‡∏£‡∏†‡∏≤‡∏¢‡πÉ‡∏ô‡∏Å‡∏µ‡πà‡∏ß‡∏±‡∏ô‡∏Ñ‡∏£‡∏±‡∏ö ‡∏™‡∏°‡∏ö‡∏±‡∏ï‡∏£‡∏≠‡∏µ‡∏≠‡πâ‡∏≠‡∏ô‡∏ú‡πà‡∏≤‡∏ô‡πÅ‡∏≠‡∏õ‡∏û‡∏•‡∏¥‡πÄ‡∏Ñ‡∏ä‡∏±‡∏ô ‡πÅ‡∏•‡πâ‡∏ß‡∏Ç‡∏∂‡πâ‡∏ô‡∏ß‡πà‡∏≤‡∏ö‡∏±‡∏ï‡∏£‡∏≠‡∏ô‡∏∏‡∏°‡∏±‡∏ï‡∏¥‡πÅ‡∏•‡πâ‡∏ß ‡∏à‡∏∞‡πÑ‡∏î‡πâ‡∏ö‡∏±‡∏ï‡∏£‡∏´‡∏£‡∏∑‡∏≠‡πÄ‡∏•‡∏Ç‡∏ö‡∏±‡∏ï‡∏£‡∏†‡∏≤‡∏¢‡πÉ‡∏ô‡∏Å‡∏µ‡πà‡∏ß

In [None]:
# Build a side-by-side comparison table: highlighted raw texts next to the
# highlighted preprocessed texts.
html_table = generate_html_table((html_texts, 'raw text'), 
                                 (get_highlight_texts(patterns, pre_series.to_list()), "PAREPA"),
                                 #(pn_raw_token_texts, 'pn engine detection'),
                                 #(pn_pre_token_texts, 'pn engine correction')
                                 )

# Optional inline rendering of the table (left disabled).
# NOTE(review): HTML and display are already imported in the first cell, so this
# re-import is redundant; imports are best kept together at the top.
from IPython.display import HTML, display
#display(HTML(html_table))

In [None]:
# Write the comparison table to an HTML file (UTF-8) in the working directory,
# then show a clickable link to it as the cell output.
with open('table_social_pre_text.html', 'w', encoding='utf-8') as f:
    f.write(html_table)

FileLink('table_social_pre_text.html')