The dataset will contain four kinds of text for a particular product: background information (obtained from the Components section of the Developer Portal), poorly written tutorials, annotations for correcting their mistakes, and properly written tutorials.
Each text component (page/document) will then be pre-processed into a single large string that is fed to the model.

The pre-processing step uses the Beautiful Soup library to parse each web page and extract the necessary blocks of text. Two functions (listed below) gather the text: one for the Components page and one for the tutorials.

Birst will be the product selected to run a small version of InDevA.

# Installing & importing libraries

In [3]:
# Installing Beautiful Soup library
!pip install beautifulsoup4
!pip install beautifulsoup4 requests



In [4]:
!pip install --upgrade pyarrow



In [5]:
!pip install datasets



In [6]:
!pip install transformers torch
!pip install sentencepiece



In [3]:
# Import Beautiful Soup library
from bs4 import BeautifulSoup
import requests
# Import regex library
import re
# Import sentence tokenizer
from nltk.tokenize import sent_tokenize
# Import json library
import json
# Import libraries for model
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
import datasets
from datasets import Dataset, load_dataset

# Data pre-processing 

In [5]:
# Pre-processing the Components page
def process_components(url):
    """Scrape a Components page into a JSON string describing the application.

    The text of every ``<p>`` tag and of every two-column table row
    (formatted as ``term: definition.``) is concatenated, in the order the
    tags are visited, into one description string. The stripped text of the
    last ``<h1>`` visited becomes the title.

    Args:
        url: Address of the Components page to download.

    Returns:
        On a 200 response, a JSON string with the keys ``application_title``,
        ``application_description`` and ``tutorials`` (always an empty list
        here). For any other status code an empty ``dict`` is returned --
        note that this is NOT a JSON string, so callers must check the result
        before handing it to ``json.loads``.
    """
    # Send a GET request to the URL
    r = requests.get(url)

    # Anything other than a plain 200 is treated as a failed download.
    if r.status_code != 200:
        return {}

    # Parse HTML content
    soup = BeautifulSoup(r.content, 'html.parser')

    # Start with an empty title so that a page without an <h1> produces an
    # empty title instead of raising UnboundLocalError further down.
    title = ''
    # Text fragments are collected here and joined once at the end.
    parts = []

    # Walk over every tag on the page.
    for tag in soup.find_all():

        # Get the title of the application -> <h1>
        if tag.name == 'h1':
            title = tag.text.strip()

        # Paragraph text is kept verbatim, followed by a separating space.
        elif tag.name == 'p':
            parts.append(tag.text + ' ')

        # Tables are flattened row by row into "vocab: definition." sentences.
        elif tag.name == 'table':
            for row in tag.find_all('tr'):
                cols = row.find_all('td')

                # Header rows (only <th> cells) and malformed rows do not have
                # a term and a definition cell; skip them instead of failing
                # with IndexError.
                if len(cols) < 2:
                    continue

                text = cols[0].text.strip() + ": " + cols[1].text.strip()

                # Make every row read as a complete sentence.
                if not text.endswith('.'):
                    text += '.'

                parts.append(text + ' ')

    # Return the result as a json string
    components_dict = {
        'application_title': title,
        'application_description': ''.join(parts),
        'tutorials': []
    }

    return json.dumps(components_dict)

In [6]:
# Pre-processing the Tutorials page
def process_tutorial(url):
    """Scrape one tutorial page into a JSON string.

    The page is walked tag by tag while three flags track which section the
    walk is currently in:

    * ``<h1>``: its text becomes the tutorial title (the last one wins).
    * ``<h2>``: the exact text ``'Business Problem'`` starts the business
      problem section; any other ``<h2>`` ends it and starts the tutorial
      section, which then stays active for the rest of the page.
    * ``<h3>``: the exact text ``'Requirements'`` switches requirement
      collection on; any other ``<h3>`` switches it off.
    * ``<ul>``: before the tutorial section, and while requirement collection
      is on, every ``<li>`` becomes one requirement. Inside the tutorial
      section every ``<li>`` is added to the tutorial steps.
    * ``<ol>`` / ``<p>``: inside the tutorial section their text is added to
      the tutorial steps; ``<p>`` text inside the business problem section is
      added to the business problem.

    Text is only added to the tutorial steps when it is not already contained
    in the steps collected so far.

    Args:
        url: Address of the tutorial page to download.

    Returns:
        On a 200 response, a JSON string with the keys ``tutorial_title``,
        ``business_problem``, ``requirements`` (list of strings),
        ``tutorial_steps`` (list of sentences produced by ``sent_tokenize``)
        and ``feedback`` (always ``'None.'``). For any other status code an
        empty ``dict`` is returned -- note that this is NOT a JSON string.
    """
    # Send a GET request to the URL
    r = requests.get(url)

    # Anything other than a plain 200 is treated as a failed download.
    if r.status_code != 200:
        return {}

    # Parse HTML content
    soup = BeautifulSoup(r.content, 'html.parser')

    # Start with an empty title so that a page without an <h1> produces an
    # empty title instead of raising UnboundLocalError further down.
    title = ''

    # Section flags and the text gathered for each section.
    is_business_problem = False
    business_problem = ''

    is_tutorial = False
    tutorial_steps = ''

    is_requirements = False
    requirements = []

    # Walk over every tag on the page.
    for tag in soup.find_all():

        # Get tutorial title
        if tag.name == 'h1':
            title = tag.text

        # Business Problem and Tutorial sections both start with an <h2>.
        elif tag.name == 'h2':
            if tag.text == 'Business Problem':
                is_business_problem = True
            else:
                # Any other <h2> closes the business problem and opens the
                # tutorial section; is_tutorial is never reset afterwards.
                is_business_problem = False
                is_tutorial = True

        # Requirements and Components sections have <h3> tags, but only the
        # Requirements section is collected.
        elif tag.name == 'h3':
            is_requirements = tag.text == 'Requirements'

        elif tag.name == 'ul':

            # The Requirements list comes before the tutorial section, so it
            # is only collected while is_tutorial is still False.
            if is_requirements and not is_tutorial:
                for item in tag.find_all('li'):
                    requirements.append(item.text.strip() + '.')

            elif is_tutorial:
                for item in tag.find_all('li'):
                    text = item.text

                    # Skip list items mentioning these phrases (presumably
                    # page-footer links rather than tutorial content).
                    if re.search('Why OS|Terms|Privacy Policy', text):
                        continue

                    # Skip text that was already collected. An empty string
                    # is contained in every string, so empty items stop here.
                    if text in tutorial_steps:
                        continue

                    if not text.endswith('.'):
                        text += '. '

                    tutorial_steps += text + ' '

        elif tag.name == 'ol':

            if is_tutorial:
                for item in tag.find_all('li'):
                    text = item.text

                    if text not in tutorial_steps:
                        tutorial_steps += text + ' '

        # <p> text goes to whichever section is currently active.
        elif tag.name == 'p':

            if is_business_problem:
                business_problem += tag.text + ' '

            elif is_tutorial:
                text = tag.text

                # Don't add Difficulty and Estimated completion time
                if re.search('Difficulty:|Estimated Completion Time:', text):
                    continue

                if text not in tutorial_steps:
                    tutorial_steps += text + ' '

    # Put together elements of json string
    tutorial_dict = {
        'tutorial_title': title,
        'business_problem': business_problem.strip(),
        'requirements': requirements,
        # Split the collected step text into individual sentences.
        'tutorial_steps': sent_tokenize(tutorial_steps.strip()),
        # Placeholder value stored for every tutorial.
        'feedback': 'None.'
    }

    return json.dumps(tutorial_dict)

# Get all associated URLs

In [8]:
# Get all the links listed on the tutorials page and returns a list of them
def get_tutorial_urls(url):
    """Return the tutorial links listed on a tutorials index page.

    Args:
        url: Address of the page that lists the tutorials.

    Returns:
        A list with the ``href`` of every ``<a>`` tag whose class is
        ``wp-block-pages-list__item__link``. The list is empty when the
        request does not come back with status code 200.
    """
    response = requests.get(url)

    # A failed download simply yields no links.
    if response.status_code != 200:
        return []

    page = BeautifulSoup(response.content, 'html.parser')

    # The tutorial links are the anchors carrying this class.
    anchors = page.find_all('a', class_='wp-block-pages-list__item__link')

    return [anchor['href'] for anchor in anchors]

In [9]:
# Get all of the urls that will be used in the dataset
def get_urls(url_lst):
    """Expand a list of seed URLs into every URL used for the dataset.

    Args:
        url_lst: Seed addresses. An address containing ``tutorials`` is
            treated as a tutorials index page; every other address is kept
            as it is.

    Returns:
        A new list in which each tutorials index page has been replaced by
        the links returned by ``get_tutorial_urls`` for it, and all other
        addresses appear unchanged, in their original order.
    """
    expanded = []

    for address in url_lst:
        if "tutorials" in address:
            # Index page: swap it for the individual tutorial links.
            expanded.extend(get_tutorial_urls(address))
        else:
            # Not a tutorials page (e.g. the Components url): keep as-is.
            expanded.append(address)

    return expanded

In [10]:
# Compile all the text from web pages to create the dataset.
def get_dataset(url_lst):
    """Compile the text of all given web pages into one JSON dataset string.

    Each URL is classified by its address: one containing ``tutorials`` is
    scraped with ``process_tutorial``; otherwise one containing
    ``components`` is scraped with ``process_components``; anything else is
    ignored. The tutorials check comes first, matching ``get_urls``, so a
    tutorial whose address also happens to contain ``components`` is not
    mistaken for the Components page.

    Pages that could not be scraped (the scrapers return an empty ``dict``
    for them) are skipped. The order of ``url_lst`` does not matter: tutorials
    are collected separately and attached to the application data at the end.

    Args:
        url_lst: Addresses of the Components page and the tutorial pages.

    Returns:
        A JSON string with the keys ``application_title``,
        ``application_description`` and ``tutorials``. When no Components
        page was scraped successfully, title and description are empty
        strings; when no tutorial was scraped, ``tutorials`` is an empty list.
    """
    application = {}
    tutorials = []

    for u in url_lst:
        if re.search("tutorials", u):
            scrape, is_tutorial = process_tutorial, True
        elif re.search("components", u):
            scrape, is_tutorial = process_components, False
        else:
            continue

        raw = scrape(u)
        # The scrapers return a JSON string on success and an empty dict on
        # failure; json.loads would raise TypeError on the dict.
        page = json.loads(raw) if isinstance(raw, str) else raw
        if not page:
            continue

        if is_tutorial:
            tutorials.append(page)
        else:
            # With several Components pages the last successful one wins.
            application = page

    # Start from a complete skeleton so the expected keys always exist, even
    # when the Components page is missing from url_lst or failed to download.
    result_json = {
        'application_title': '',
        'application_description': '',
        'tutorials': [],
    }
    result_json.update(application)
    result_json['tutorials'] = list(result_json.get('tutorials') or []) + tutorials

    return json.dumps(result_json)

# Assembling dataset

In [12]:
# URL to Tutorials page and Components page
url_lst = ['https://developer.infor.com/components/analytics/', 'https://developer.infor.com/tutorials/analytics/']

# Get all the URLs relating to the application
# BIRST example: 9 tutorials and 1 components
master_lst = get_urls(url_lst)

# Build the dataset (get_dataset returns a json string, so parse it back)
master_dataset = json.loads(get_dataset(master_lst))

# Hand-written feedback for each tutorial, keyed by the exact tutorial title.
# Two tutorials share the same feedback, so that text is defined once.
formal_tone_feedback = "The steps are easy to follow and understand. You must capitalize the word's first letter directly after a colon. The tone needs to remain formal. Refrain from using words, like we, ours, our, us, and let's."

feedback_by_title = {
    'Birst Cloud Agent Installation': formal_tone_feedback,
    'How to connect to a database': formal_tone_feedback,
    'How to connect to a file': "The steps are easy to follow and understand. Fix the wording for the Visualizer description because 'Visualizer: Once you have created your model and processed the data once, you can update your data by uploading a new file and have Birst process it, and then take you directly to Visualizer to continue working on visualizations and reports.' sounds odd. The fixed sentence should be 'Visualizer: Once you have created your model and processed the data once, you can update your data by uploading a new file and have Birst process it. It then takes you directly to Visualizer to continue working on visualizations and reports.'. Fix the wording for the Dashboards description because 'Dashboards: Once you have created your model and processed the data once, you can update your data by uploading a new file and have Birst process it, and then take you directly to Dashboards to continue your work.' sounds odd. The fixed sentence should be 'Dashboards: Once you have created your model and processed the data once, you can update your data by uploading a new file and have Birst process it. It then takes you directly to Dashboards to continue your work.'.",
    'Changing visualizations based on user selection': "To maintain a professional tone, refrain from using words, like us or we.",
    'Dashboard filters of dynamically changing Measures and Dimensions': "To improve clarity, explain what the variables do, such as the 'GetPromptValue' variable has a description to explain its function. Provide more information about the difference between a 'measure' and a 'dimension' and when to use them.",
    'Highlight the Performance on a Geographical Map': "To maintain a professional tone, refrain from using words, like us or we. Provide more information for when to use a 'Professional' space and when to use an 'Enterprise' space. Also preface the tutorial by stating 'The tutorial is for a Professional space' instead of at the end.",
    'Dynamically change Time Series Types in Reports': "Define 'trailing half year, trailing four months' for better understanding. Define 'Running Sum' and what it is used for. To maintain a professional tone, refrain from using words, like us or we.",
    'Keep Analytics Data Updated': "To maintain a professional tone, refrain from using words, like us or we. New terms are introduced and defined nicely.",
    'Limit number of years/quarters/months visible in a filter': "The business problem needs to be defined better because it is too vague. Best practice tips are helpful and make the tutorial easier to understand. When a feature has different options, explain what the options are so that the user knows which to pick.",
}

# Attach the feedback; tutorials without an entry keep the feedback they
# already have.
for tutorial in master_dataset['tutorials']:
    tutorial_title = tutorial['tutorial_title']
    if tutorial_title in feedback_by_title:
        tutorial['feedback'] = feedback_by_title[tutorial_title]

# Last expression of the cell: the notebook displays the dataset as json
json.dumps(master_dataset)




In [13]:
master_dataset['tutorials']

[{'tutorial_title': 'Birst Cloud Agent Installation',
  'business_problem': 'You wish to use cloud analytics for insights on your on-premise data but don’t know how to connect them. Birst Cloud Agent (Birst) simplifies the process of transferring, accessing, and integrating data into a cloud-based analytics platform. Not only is the process more secure, reliable, and efficient, but Birst also eliminates the need for manual data extraction and manipulation. Birst serves as the bridge between on-premises data sources and the Birst cloud environment, which ensures that your data is up-to-date and accessible for accurate reporting and analysis in real-time. This way, you and your organization can make data-driven decisions quicker, improve operational efficiency, and gain valuable insights from your data across different systems and databases.',
  'requirements': ['Internet Connection.',
   'Birst Analytics Platform Account Credentials.',
   'Birst Cloud Agent.',
   'Data Sources.',
   'Ja

# Setting up and running the model

In [21]:
def preprocess_pre_data(examples):
    """Tokenize a batch of application records for pre-training.

    Relies on the module-level ``tokenizer`` being defined before this
    function is called.

    Args:
        examples: A batch with the columns ``application_title`` and
            ``application_description``, each holding one value per record.

    Returns:
        The tokenizer output for the texts ``"<title> - <description>"``
        (truncated and padded to 512 tokens) with an added ``labels`` entry.
        The labels repeat the input ids, except that padding positions are
        set to -100 so that they are ignored when the loss is computed.
    """
    # Combine title and description for input text
    inputs = [
        f"{title} - {description}"
        for title, description in zip(examples["application_title"], examples["application_description"])
    ]

    # Tokenize the inputs
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    # Create labels for unsupervised pre-training. Every sequence is padded
    # to max_length, so a plain copy of input_ids would make the model learn
    # to predict pad tokens; -100 marks positions the loss should skip.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in ids]
        for ids in model_inputs["input_ids"]
    ]

    return model_inputs


In [22]:
!pip install accelerate -U



In [None]:
# Load the tokenizer and model from the 't5-small' checkpoint.
# `tokenizer` is read as a global by preprocess_pre_data, so this cell has to
# run before the dataset is mapped below.
tokenizer = T5Tokenizer.from_pretrained('t5-small', legacy=False)
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Prepare the pre-training dataset: a single record holding the application
# title and description scraped from the Components page.
pretraining_data = [
    {
    'application_title': master_dataset['application_title'],
    'application_description': master_dataset['application_description']
    }
]


# batched=True hands preprocess_pre_data whole columns (one value per
# record), which is what its zip() over the two columns expects.
pre_dataset = Dataset.from_list(pretraining_data)
tokenized_pre_dataset = pre_dataset.map(preprocess_pre_data, batched=True)

# Prepare the training dataset
# NOTE(review): training_data stays empty and is not used by the Trainer
# below in the visible code -- the tutorial/feedback pairs still need to be
# built here.
training_data = []

    # output_text = tutorial['feedback']
    # training_data.append((input_text, output_text))


# Run the model (pre-train): 3 epochs, batch size 8 per device; output is
# written to ./results and logs to ./logs.
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=8,
    num_train_epochs=3,
    logging_dir='./logs',
)

# Only the tokenized pre-training dataset is passed to the Trainer; no
# evaluation dataset is configured.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_pre_dataset,
)

trainer.train()


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
