In [1]:
import os
import warnings

import gradio as gr
import jinja2
import pandas as pd
import pdfkit
import torch
import transformers

from intel_npu_acceleration_library import NPUModelForCausalLM, int4
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer, pipeline

In [2]:
# Step 1: Load CSV database
def load_csv(file):
    """Read a CSV file into a DataFrame.

    Parameters:
    file: Either a filesystem path (``str`` or any ``os.PathLike``) or an
        upload-style object that exposes the path of the file on disk
        through a ``name`` attribute.

    Returns:
    pandas.DataFrame: The parsed contents of the CSV file.
    """
    # Paths are passed straight through. Note that pathlib.Path also has a
    # `.name` attribute (the basename only), so path-like objects must be
    # detected before falling back to `.name`.
    if isinstance(file, str) or hasattr(file, "__fspath__"):
        csv_path = file
    else:
        csv_path = file.name
    return pd.read_csv(csv_path)

# Step 1.1
def convert_binary_columns(df):
    """Return a copy of ``df`` with binary 0/1 columns rewritten as 'Yes'/'No'.

    A column counts as binary when its non-missing values form a non-empty
    subset of {0, 1}. Missing values in such a column stay missing.

    Parameters:
    df (pandas.DataFrame): The frame to convert. It is not modified.

    Returns:
    pandas.DataFrame: A new frame in which every binary column holds
    'Yes' (for 1) / 'No' (for 0); all other columns are unchanged.
    """
    # Work on a copy so re-running the calling cell never sees a frame that
    # was already converted by a previous run.
    converted = df.copy()
    for col in converted.columns:
        observed = set(converted[col].dropna().unique())
        # An all-missing column has an empty value set, and the empty set is
        # a subset of {0, 1}; require at least one observed value so such
        # columns are left alone.
        if observed and observed <= {0, 1}:
            converted[col] = converted[col].map({1: 'Yes', 0: 'No'})
    return converted

#Step 1.2
def generate_data_summaries(df):
    """Produce plain-text summaries of a DataFrame.

    Parameters:
    df (pandas.DataFrame): The frame to summarise.

    Returns:
    tuple: ``(numerical_summary, categorical_summary)`` where
        numerical_summary (str) is the text rendering of ``df.describe()``
        and categorical_summary (dict) maps each object/category column
        name to the text rendering of that column's value counts.
    """
    # Text rendering of the describe() table
    described = df.describe()
    numerical_summary = described.to_string()

    # Value counts for every object / category column
    categorical_summary = {}
    for name in df.select_dtypes(include=['object', 'category']).columns:
        counts = df[name].value_counts()
        categorical_summary[name] = counts.to_string()

    return numerical_summary, categorical_summary


In [3]:
# Step 2: Option NPU - set up the local LLM model (used through a Hugging Face pipeline)
model_id = "microsoft/Phi-3-mini-128k-instruct"

# Load the checkpoint through the NPU acceleration library, requesting int4
# as the dtype, and switch the model to eval mode.
model = NPUModelForCausalLM.from_pretrained(
    model_id,
    use_cache=True,
    dtype=int4,
).eval()
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    use_default_system_prompt=True,
)

# Streamer configured to skip the prompt and any special tokens.
streamer = TextStreamer(
    tokenizer,
    skip_prompt=True,
    skip_special_tokens=True,
)

# Sampling settings forwarded to the pipeline on every call.
generation_kwargs = {
    "streamer": streamer,
    "do_sample": True,
    "top_k": 50,
    "top_p": 0.9,
}

# Step 3: Create the HF pipe
# The streamer is supplied here and again at call time via generation_kwargs.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=2048,
    streamer=streamer,
)

  return torch.load(model_path)


In [4]:
#Step 4.1: Generate final user prompt
def generate_response(user_prompt, numerical_summary, categorical_summary):
    """Build the chat messages sent to the model.

    Despite its name, this function does not call the model; it only
    assembles the prompt.

    Parameters:
    user_prompt (str): The question the report should answer.
    numerical_summary (str): Text inserted after "Numerical Summary:".
    categorical_summary (str): Text inserted after "Categorical Summary:".

    Returns:
    list: Two dicts with "role"/"content" keys - the system instructions
    followed by the user message.
    """
    # System turn: fixed instructions describing the report structure and the
    # HTML formatting rules the answer must follow.
    system_prompt = """You are an expert business and data analyst. Using the details provided in the numerical summary and categorical summary, generate a detailed business analysis report based on the user's question.
    Your report should include a report title, introduction, methodology, description of the data provided by user, findings and insights, detailed and elaborate analysis supported by analytical evidence, recommendations and a closing summary.
    Make sure the response is focused and avoid unnecessary repetition. Your response should be exclusively provided in English language.
    End your response once you have provided a closing summary. Notes or disclaimers are not required.
    Provide your report in a HTML document format following the rules as listed below:
        - <h2> tags For headers and <h4> tags for sub-headers
        - <ul> tags for unordered listings and <ol> for ordered listings
        - <b> tag for bolding of text
        - Follow all the other HTML syntax
    """

    # User turn: both data summaries followed by the actual question.
    user_message = f"""Numerical Summary: {numerical_summary}
        Categorical Summary: {categorical_summary}
        Generate a business report based on the following question
        
        Question: {user_prompt}"""

    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_message},
    ]

#Step 4.2 Function to generate report from the DataFrame
def generate_report_from_dataframe(user_prompt, file):
    """Generate a report for an uploaded CSV file with the local LLM.

    The CSV is loaded, binary columns are converted to Yes/No, the data is
    summarised, and the summaries plus the user's question are sent to the
    text-generation pipeline. The content of the first assistant message in
    the pipeline output is returned.

    Parameters:
    user_prompt (str): The question / focus area for the analysis.
    file: The uploaded CSV - either a string path or an object exposing the
        path through a ``name`` attribute.

    Returns:
    str: The stripped content of the assistant's reply.

    Raises:
    gr.Error: If no file was uploaded, or if the pipeline output contains
        no assistant message.
    """
    if file is None:
        raise gr.Error("Please upload a CSV file before generating a report.")

    # Load the CSV; a plain string is used as the path, otherwise the path is
    # read from the upload object's `.name`.
    csv_path = file if isinstance(file, str) else file.name
    df = pd.read_csv(csv_path)

    # Step to convert binary columns to Yes/No
    df = convert_binary_columns(df)

    # Generate summaries; the per-column dict is flattened to one text block
    numerical_summary, categorical_summary = generate_data_summaries(df)
    categorical_summary = "\n".join(f"{key}:\n{value}" for key, value in categorical_summary.items())

    # Generate the report using the LLM
    messages = generate_response(user_prompt, numerical_summary, categorical_summary)
    output = pipe(messages, **generation_kwargs)

    # Pick the first assistant message out of the returned conversation.
    # `report` stays None when the output has an unexpected shape, so the
    # failure is reported explicitly instead of as an UnboundLocalError.
    report = None
    if isinstance(output, list) and output and 'generated_text' in output[0]:
        report = next(
            (
                message.get('content', '').strip()
                for message in output[0]['generated_text']
                if message.get('role') == 'assistant'
            ),
            None,
        )

    if report is None:
        raise gr.Error("The model did not return a report. Please try again.")

    return report

In [5]:
# Step 5: Function to convert html report output to pdf 
def convert_html_to_pdf(
    html_string: str,
    template_folder: str = "./templates",
    base_html_file: str = "base.html",
    output_file: str = "generate_pdf.pdf",
    output_folder: str = "./generated_pdf_files",
) -> str:

    """
    Renders an HTML report into a Jinja2 template and saves it as a PDF file.

    The template 'base_html_file' is loaded from 'template_folder' and rendered with 'html_string' passed as the template variable 'ai_generated_content'. The rendered HTML is converted with pdfkit (wkhtmltopdf) and written to 'output_folder'/'output_file'. The output folder is created if it does not exist. This is a regular synchronous function.

    Parameters:
    html_string (str): The HTML content to embed in the template.
    template_folder (str, optional): The path to the folder containing the HTML template. Defaults to "./templates".
    base_html_file (str, optional): The name of the base HTML file within the template folder. Defaults to "base.html".
    output_file (str, optional): The name of the output PDF file. Defaults to "generate_pdf.pdf".
    output_folder (str, optional): The path to the folder where the generated PDF file will be saved. Must not end with '/'. Defaults to "./generated_pdf_files".

    Returns:
    str: A success message containing the path of the generated PDF, or an empty string if rendering or conversion failed (the exception is printed).

    Raises:
    ValueError: If 'output_folder' ends with '/'.

    Note:
    - The wkhtmltopdf executable is taken from the WKHTMLTOPDF_PATH environment variable if set, otherwise from the default Windows install location; if that file does not exist, pdfkit is left to locate the executable itself.
    """

    if output_folder.endswith("/"):
        raise ValueError("Wrong output folder name, should not end with '/'")
    pdf_file_name = f"{output_folder}/{output_file}"

    try:
        template_loader = jinja2.FileSystemLoader(template_folder)
        template_env = jinja2.Environment(loader=template_loader)

        basic_template = template_env.get_template(base_html_file)

        # Render once, injecting the AI generated report into the template
        output_html_code = basic_template.render(
            ai_generated_content=html_string
        )

        options = {
            'page-size': 'A4',
            'margin-top': '0.75in',
            'margin-bottom': '0.75in',
            'margin-right': '0.55in',
            'margin-left': '0.55in',
            'encoding': "UTF-8",
            'footer-right': '[page] of [topage]',
            'footer-font-size': "9",
            'custom-header': [
                ('Accept-Encoding', 'gzip')
            ],
            # NOTE(review): pdfkit appears to emit options with a falsy value
            # as bare flags, so this entry may ENABLE local file access
            # rather than disable it - confirm the intent against pdfkit.
            'enable-local-file-access': False,
            'no-outline': None
        }

        # Prefer an explicitly configured executable; fall back to pdfkit's
        # own lookup when the configured file is not present on this machine.
        wkhtmltopdf_path = os.environ.get(
            "WKHTMLTOPDF_PATH",
            "C:\\Program Files\\wkhtmltopdf\\bin\\wkhtmltopdf.exe",
        )
        if os.path.isfile(wkhtmltopdf_path):
            config = pdfkit.configuration(wkhtmltopdf=wkhtmltopdf_path)
        else:
            config = pdfkit.configuration()

        # Make sure the destination folder exists before writing the PDF
        os.makedirs(output_folder, exist_ok=True)

        pdfkit.from_string(
            input=output_html_code,
            output_path=pdf_file_name,
            options=options,
            configuration=config
        )

    except Exception as e:
        # good to log this exception instead
        print(e)
        return ""

    return f"PDF generated successfully at {pdf_file_name}"

In [None]:
def _notify_pdf_result(html_content):
    """Convert the displayed report to PDF once and show the outcome as a toast.

    Parameters:
    html_content: The current value of the report HTML component.

    Returns:
    None: The click event has no output components.
    """
    if not html_content:
        gr.Warning("Generate a report before converting it to PDF.")
        return

    # convert_html_to_pdf returns a success message, or "" on failure
    message = convert_html_to_pdf(html_content)
    if message:
        gr.Info(message)
    else:
        gr.Warning("PDF generation failed. Check the notebook output for details.")


with gr.Blocks(theme=gr.Theme.from_hub('HaleyCH/HaleyCH_Theme'), css=".column-form .wrap {flex-direction: column;}") as app:
    with gr.Row():
        gr.Markdown("""<h1><center>Report Generator</center></h1>""")

    with gr.Row():
        with gr.Column(visible=True, min_width=350, scale=0) as sidebar:
            with gr.Row():
                # CSV file uploader
                csv_file_input = gr.File(label="Upload your CSV file", file_types=[".csv"])


            with gr.Row():
                # Textbox for user prompt
                user_prompt_input = gr.Textbox(
                    label="Enter focus area for analysis",
                    placeholder="e.g., What are the 3 main reasons for customer churn?\nPress `Enter` to generate report",
                    lines=1
                )

            with gr.Row():
                generate_pdf = gr.Button(
                    value="Press to convert report to PDF"
                )
        
        with gr.Column() as main:
            with gr.Row():
                # Output area for the generated report
                output_html = gr.HTML(label="Generated Report")
    
    # Generate the report when the prompt textbox is submitted
    user_prompt_input.submit(
        fn=generate_report_from_dataframe,
        inputs=[user_prompt_input, csv_file_input],
        outputs=output_html
    )

    # Convert HTML to PDF (a single conversion per click) and display an info message
    generate_pdf.click(
        fn=_notify_pdf_result,
        inputs=output_html,
        outputs=None
    )

# Launch the Gradio app (share=True also requests a public share link)
app.launch(share=True, debug=True)

Running on local URL:  http://127.0.0.1:7861

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.


2024/10/07 17:59:05 [W] [service.go:132] login to server failed: dial tcp 44.237.78.176:7000: i/o timeout


Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


Title: Main Reasons for Customer Churn and Related Insights

Executive Summary:

The objective of this report is to analyze and understand the three main reasons for customer churn within our business or any relevant context. For the analysis, we will examine provided categorical and numerical data related to various business aspects such as card ownership, demographics, payment history, etc. This will enable us to identify patterns and correlations which can be translated into insights to combat customer churn.

Step one: Organize and prepare data
-------------------------
To begin, we shall merge, sort, and clean the data to prepare it for analysis, removing any outliers or duplicates that may bias our results.

Step two: Numerical and Categorical Data Examination
---------------------------
We will examine the numerical (age, balance, payment history, etc.) and categorical (country, gender, card type, etc.) data. This examination will involve creating visual tools like histogoks (fo