The dataset will contain a particular product's background information, which can be obtained from the Components section of the Developer Portal, poorly written tutorials, annotations for correcting mistakes, and properly written tutorials. 
Then, each text component (page/document) will be pre-processed as a large string fed to the model. 

The pre-processing step includes using the Beautiful Soup library to parse and extract necessary blocks of text from each web page. There will be 2 functions (listed below) that will gather the text from the Components page and the tutorials. 

Birst will be the product selected to run a small version of InDevA.

# Installing & importing libraries

In [3]:
# Installing Beautiful Soup library
!pip install beautifulsoup4
!pip install beautifulsoup4 requests

!pip install pyarrow

!pip install datasets

!pip install transformers torch
!pip install sentencepiece

!pip install --upgrade pyarrow
!pip install --upgrade datasets
!pip install --upgrade transformers

!pip install transformers[torch]

!pip install tensorboard

!pip install tensorflow

!pip install accelerate -U
!pip install tf-keras

Collecting transformers
  Downloading transformers-4.44.0-py3-none-any.whl.metadata (43 kB)
     ---------------------------------------- 0.0/43.7 kB ? eta -:--:--
     ----------------- -------------------- 20.5/43.7 kB 640.0 kB/s eta 0:00:01
     -------------------------------------- 43.7/43.7 kB 711.0 kB/s eta 0:00:00
Downloading transformers-4.44.0-py3-none-any.whl (9.5 MB)
   ---------------------------------------- 0.0/9.5 MB ? eta -:--:--
   ---------------------------------------- 0.1/9.5 MB 3.3 MB/s eta 0:00:03
   -- ------------------------------------- 0.5/9.5 MB 6.4 MB/s eta 0:00:02
   ---- ----------------------------------- 1.0/9.5 MB 8.0 MB/s eta 0:00:02
   ------ --------------------------------- 1.5/9.5 MB 8.5 MB/s eta 0:00:01
   -------- ------------------------------- 2.0/9.5 MB 9.0 MB/s eta 0:00:01
   ---------- ----------------------------- 2.6/9.5 MB 9.6 MB/s eta 0:00:01
   ------------ --------------------------- 3.0/9.5 MB 9.5 MB/s eta 0:00:01
   --------------

In [4]:
!pip install evaluate



In [5]:
!pip install rouge_score



In [6]:
!pip install vaderSentiment



In [7]:
!pip install tk



In [8]:
# Import Beautiful Soup library
from bs4 import BeautifulSoup
import requests
# Import regex library
import re
# Import sentence tokenizer
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
# Import json library
import json
# Import libraries for model
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import datasets
from datasets import Dataset, load_dataset, DatasetDict
from evaluate import load
from sklearn.model_selection import train_test_split
# Import libraries for visualizing training process
import tensorflow as tf
from torch.utils.tensorboard import SummaryWriter
# Sentiment analysis 
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# User interface
import tkinter as tk
from tkinter import messagebox

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mhan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!





# Data pre-processing 

In [10]:
# Pre-processing the Components page
def process_components(url):
    # Result string that stores extracted paragraphs and tables 
    result = ""
    
    # Send a GET request to the URL
    r = requests.get(url)

    # Check if the request was successful (status code == 200), otherwise print error message.
    if r.status_code == 200:
        # Parse HTML content
        soup = BeautifulSoup(r.content, 'html.parser')
        
        # Get all the tags on the page
        body = soup.find_all()

        # Iterate through each tag that is stored in body
        for tag in body:
            
            # Get the title of the application -> <h1>
            if tag.name == 'h1':
                title = tag.text.strip()
                
            # If the tag is a paragraph, then concat to the result string
            elif tag.name == 'p':
                result += tag.text
                result += ' '

            # If the tag is a table, then concat all the values in the table, except for headers
            elif tag.name == 'table':
                # Iterate through each row in the table
                for row in tag.find_all('tr'):
                    # Get all the columns in each row -> should be 2 columns, one for term and other for definition
                    cols = row.find_all('td')
                    # Stitch together the columns, so it follows this formatting -> vocab: definition
                    text = cols[0].text.strip() + ": " + cols[1].text.strip()

                    if text[-1] != '.':
                        text += '.'
                    text += ' '
                    
                    # Concat text to result string
                    result += text

        # Return the result string as a json string
        components_dict = {
            'application_title': title,
            'application_description': result,
            'tutorials': []
        }
        
        return json.dumps(components_dict)
    else: 
        return { }

In [11]:
# Pre-processing the Tutorials page
def process_tutorial(url):
    # Result string that stores extracted paragraphs and tables 
    result = ""
    
    # Send a GET request to the URL
    r = requests.get(url)

    # Check if the request was successful (status code == 200), otherwise print error message.
    if r.status_code == 200:
        # Parse HTML content
        soup = BeautifulSoup(r.content, 'html.parser')

        # print(soup.prettify())

        # Set variables for getting the business problem
        is_business_problem = False
        business_problem = ''

        # Set variables for getting the tutorial title and steps
        is_tutorial = False        
        tutorial_steps = ''
        
        # Set variables for getting requirements 
        is_requirements = False
        requirements = []
        
        # Get all the tags on the page
        body = soup.find_all()

        for tag in body:
            
            # Get tutorial title
            if tag.name == 'h1':
                title = tag.text

            # Business Problem and Tutorial sections have <h2> tag. So depending on the text associated with each tag, process accordingly. 
            elif tag.name == 'h2':

                # Tag == Business Problem, then set status as true
                if tag.text == 'Business Problem':
                    is_business_problem = True

                # Otherwise, set status as false and set the status for is_tutorial as true
                else:
                    is_business_problem = False
                    is_tutorial = True

            # Requirements and Components sections have <h3> tag. But only process the Requirements section and set status as true. 
            elif tag.name == 'h3':
                
                if tag.text == 'Requirements':
                    is_requirements = True
                else:
                    is_requirements = False

            elif tag.name == 'ul':

                # Requirements list is before Tutorials, so is_tutorial status must be false.
                if is_requirements and is_tutorial == False:
                    # Each requirement is listed as a <li> element
                    elements = tag.find_all('li')

                    # Incorporate data cleaning and add to requirements list
                    for e in elements:
                        requirements.append(e.text.strip() + '.')

                elif is_tutorial:
                    elements = tag.find_all('li')

                    for e in elements:
                        text = e.text
                            
                        eop = re.search('Why OS|Terms|Privacy Policy', text)

                        if not eop:
                            
                            if not text in tutorial_steps:
                                if text[-1] != '.':
                                    text += '. '  
                                    
                                tutorial_steps += text
                                tutorial_steps += ' '

            elif tag.name == 'ol':
                
                if is_tutorial:
                    steps = tag.find_all('li')

                    for s in steps:
                        
                        text = s.text
                        
                        if not text in tutorial_steps:
                            tutorial_steps += text
                            tutorial_steps += ' '
                        
            # When processing <p> tags, check statuses and store the text corresponding to whichever status is set to true
            elif tag.name == 'p':
                
                if is_business_problem:
                    business_problem += tag.text
                    business_problem += ' '

                elif is_tutorial:
                    # The text that appears are all warning boxes
                    text = tag.text
                    # Don't add Difficulty and Estimated completion time
                    diff = re.search('Difficulty:|Estimated Completion Time:', text)
                    
                    if not diff:
                        if not text in tutorial_steps:
                            tutorial_steps += text
                            tutorial_steps += ' '

        # Put together elements of json string
        business_problem = business_problem.strip()
        cleaned_tutorial_steps = sent_tokenize(tutorial_steps.strip())
            
        tutorial_dict = {
            'tutorial_title': title,
            'business_problem': business_problem,
            'requirements': requirements,
            'tutorial_steps': cleaned_tutorial_steps,
            'feedback': 'None.'
        }


        return json.dumps(tutorial_dict)
        
    else: 
        return {}

# Get all associated URLs

In [13]:
# Get all the links listed on the tutorials page and returns a list of them
def get_tutorial_urls(url):
    
    tutorials_url_lst = []
    
    # Send a GET request to the URL
    r = requests.get(url)

    # Check if the request was successful (status code == 200), otherwise print error message.
    if r.status_code == 200:
        
        # Parse HTML content
        soup = BeautifulSoup(r.content, 'html.parser')

        # Get all the <a class=...> tags (tutorial links)
        links = soup.find_all('a', class_='wp-block-pages-list__item__link')

        # Store the href part of the tag
        for l in links:
            tutorials_url_lst.append(l['href'])

    # Return the list
    return tutorials_url_lst

In [14]:
# Get all of the urls that will be used in the dataset
def get_urls(url_lst):
    new_lst = []

    # Go through each url in the list and check if it links to the Tutorials section 
    for u in url_lst:
        is_tutorial = re.search("tutorials", u)

        # If the url contains 'tutorials' in its address, then get all the tutorial links and add to list
        if is_tutorial:
            new_lst = new_lst + get_tutorial_urls(u)

        # Otherwise, just add the Components url
        else:
            new_lst.append(u)

    # Return list
    return new_lst

In [15]:
# Compile all the text from web pages to create the dataset.
def get_dataset(url_lst):

    # For every url in the list, check if it is a components or a tutorial page.
    for u in url_lst:
        
        is_components = re.search("components", u)
        is_tutorial = re.search("tutorials", u)

        # If it is a component, then call process_components
        if is_components:
            result_json = json.loads(process_components(u))

        # If it is a tutorial, then call process_tutorial
        if is_tutorial:
            tutorial_json = json.loads(process_tutorial(u))
            result_json['tutorials'].append(tutorial_json)

    return json.dumps(result_json)

# Assembling dataset

In [17]:
# URL to Tutorials page and Components page
url_lst = ['https://developer.infor.com/components/analytics/', 'https://developer.infor.com/tutorials/analytics/']

# Get all the URLs relating to the application 
# BIRST example: 9 tutorials and 1 components
master_lst = get_urls(url_lst)

# Build the json string 
master_dataset = json.loads(get_dataset(master_lst))

for tutorial in master_dataset['tutorials']:
    if tutorial['tutorial_title'] == 'Birst Cloud Agent Installation':
        tutorial['feedback'] = "The steps are easy to follow and understand. You must capitalize the word's first letter directly after a colon. The tone needs to remain formal. Refrain from using words, like we, ours, our, us, and let's."

    elif tutorial['tutorial_title'] == 'How to connect to a database':
        tutorial['feedback'] = "The steps are easy to follow and understand. You must capitalize the word's first letter directly after a colon. The tone needs to remain formal. Refrain from using words, like we, ours, our, us, and let's."

    elif tutorial['tutorial_title'] == 'How to connect to a file':
        tutorial['feedback'] = "The steps are easy to follow and understand. Fix the wording for the Visualizer description because 'Visualizer: Once you have created your model and processed the data once, you can update your data by uploading a new file and have Birst process it, and then take you directly to Visualizer to continue working on visualizations and reports.' sounds odd. The fixed sentence should be 'Visualizer: Once you have created your model and processed the data once, you can update your data by uploading a new file and have Birst process it. It then takes you directly to Visualizer to continue working on visualizations and reports.'. Fix the wording for the Dashboards description because 'Dashboards: Once you have created your model and processed the data once, you can update your data by uploading a new file and have Birst process it, and then take you directly to Dashboards to continue your work.' sounds odd. The fixed sentence should be 'Dashboards: Once you have created your model and processed the data once, you can update your data by uploading a new file and have Birst process it. It then takes you directly to Dashboards to continue your work.'." 

    elif tutorial['tutorial_title'] == 'Changing visualizations based on user selection':
        tutorial['feedback'] = "To main a professional tone, refrain from using words, like us or we. Otherwise, steps are easy to follow and understand."

    elif tutorial['tutorial_title'] == 'Dashboard filters of dynamically changing Measures and Dimensions':
        tutorial['feedback'] = "To improve clarity, explain what the variables do, such as the 'GetPromptValue' variable has a description to explain its function. Provide more information about the difference between a 'measure' and a 'dimension' and when to use them."

    elif tutorial['tutorial_title'] == 'Highlight the Performance on a Geographical Map':
        tutorial['feedback'] = "To main a professional tone, refrain from using words, like us or we. Provide more information for when to use a 'Professional' space and when to use an 'Enterprise' space. Also preface the tutorial by stating 'The tutorial is for a Professional space' instead of at the end."
        
    elif tutorial['tutorial_title'] == 'Dynamically change Time Series Types in Reports':
        tutorial['feedback'] = "Define 'trailing half year, trailing four months' for better understanding. Define 'Running Sum' and what it is used for. To main a professional tone, refrain from using words, like us or we."

    elif tutorial['tutorial_title'] == 'Keep Analytics Data Updated':
        tutorial['feedback'] = "To main a professional tone, refrain from using words, like us or we. New terms are introduced and defined nicely."
        
    elif tutorial['tutorial_title'] == 'Limit number of years/quarters/months visible in a filter':
        tutorial['feedback'] = "The business problem needs to be defined better because it is too vague. Best practice tips are helpful and make the tutorial easier to understand. When a feature has different options, explain what the options are so that the user knows which to pick."
        
json.dumps(master_dataset)




In [18]:
master_dataset['tutorials']

[{'tutorial_title': 'Birst Cloud Agent Installation',
  'business_problem': 'You wish to use cloud analytics for insights on your on-premise data but don’t know how to connect them. Birst Cloud Agent (Birst) simplifies the process of transferring, accessing, and integrating data into a cloud-based analytics platform. Not only is the process more secure, reliable, and efficient, but Birst also eliminates the need for manual data extraction and manipulation. Birst serves as the bridge between on-premises data sources and the Birst cloud environment, which ensures that your data is up-to-date and accessible for accurate reporting and analysis in real-time. This way, you and your organization can make data-driven decisions quicker, improve operational efficiency, and gain valuable insights from your data across different systems and databases.',
  'requirements': ['Internet Connection.',
   'Birst Analytics Platform Account Credentials.',
   'Birst Cloud Agent.',
   'Data Sources.',
   'Ja

# Setting up and running the model

In [20]:
def preprocess_pre_data(examples):

    # Combine title and description for input text
    inputs = [
        f"{title} - {description}" 
        for title, description in zip(examples["application_title"], examples["application_description"])
    ]

    # print(inputs)

    # Tokenize the inputs
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    # Create labels for unsupervised pre-training
    model_inputs["labels"] = model_inputs["input_ids"]

    return model_inputs


In [21]:
def preprocess_train_data(examples):

    # Extract individual lists from examples dictionary
    business_problems = examples["business_problem"]
    tutorial_titles = examples["tutorial_title"]
    tutorial_steps = examples["tutorial_steps"]
    feedback = examples["feedback"]

    # Format inputs to include business problem, tutorial title, and steps
    inputs = [
        f"Business Problem: {problem} Tutorial Title: {title} Tutorial Steps: {steps}" 
        for problem, title, steps in zip(business_problems, tutorial_titles, tutorial_steps)
    ]

    # Format targets to just include the feedback 
    targets = feedback

    # Tokenize the inputs
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    model_targets = tokenizer(targets, max_length=512, truncation=True, padding="max_length")

    model_inputs['labels'] = model_targets['input_ids']


    return model_inputs


In [22]:
# Load the tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('t5-small', legacy=False)
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Prepare the pre-training dataset
pretraining_data = [
    {
    'application_title': master_dataset['application_title'],
    'application_description': master_dataset['application_description']
    }
]

pre_dataset = Dataset.from_list(pretraining_data)
tokenized_pre_dataset = pre_dataset.map(preprocess_pre_data, batched=True)

# Run the model (pre-train)
pre_training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=32,
    num_train_epochs=25,
    logging_dir='./project_logs',
    logging_steps=2
)

pre_trainer = Trainer(
    model=model,
    args=pre_training_args,
    train_dataset=tokenized_pre_dataset,
)

pre_trainer.train()

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Step,Training Loss
2,11.7392
4,7.0816
6,4.4561
8,3.3403
10,2.9423
12,2.6634
14,2.4884
16,2.4378
18,2.263
20,2.2436


TrainOutput(global_step=25, training_loss=3.7452583789825438, metrics={'train_runtime': 66.9911, 'train_samples_per_second': 0.373, 'train_steps_per_second': 0.373, 'total_flos': 3383545036800.0, 'train_loss': 3.7452583789825438, 'epoch': 25.0})

In [23]:
def preprocess_function(examples):
    
    # Extract individual lists from examples dictionary
    # business_problems = examples["business_problem"]
    # tutorial_titles = examples["tutorial_title"]
    tutorial_steps = examples["tutorial_steps"]
    targets = examples["feedback"]

    custom_prefix = "Review the tutorial:"

    # Format inputs to include business problem, tutorial title, and steps
    inputs = [
        # f"Business Problem: {problem} Tutorial Title: {title} Tutorial Steps: {steps}"
        f"{custom_prefix} {steps}"
        for steps in tutorial_steps
        # for problem, title, steps in zip(business_problems, tutorial_titles, tutorial_steps)
    ]

    # Tokenize the inputs
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding='max_length')

    # Set up the tokenizer for targets
    labels = tokenizer(text_target=targets, max_length=512, truncation=True, padding='max_length')

    model_inputs['labels'] = labels['input_ids']


    return model_inputs


In [24]:
# Prepare the training dataset
training_data = []

# Preparing training dataset
for entry in master_dataset['tutorials']:
    business_problem = entry['business_problem']
    title = entry['tutorial_title']
    steps = entry['tutorial_steps']
    feedback = entry['feedback']
          
    input = {
        # 'business_problem': business_problem,
        # 'tutorial_title': title,
        'tutorial_steps': steps,
        'feedback': feedback
    }
    
    training_data.append(input)

# Add more tutorials to dataset 
steps = "Set up your Birst environment and create a space that is most appropriate for your project. Connect to Data sources by clicking on the 'add new source' option in the data sources section. We can connect to databases, cloud services, or other files. Extract and import the data from the connected source. Go to the data preparation section and perform data cleansing actions. Apply transformations to normalize the data and validate the transformed data for accuracy and consistency. Create data models and define relationships between datasets for a more unified data model. Build dashboards and reports in the dashboards section. Be sure to regularly monitor the performance of the data integration and reporting processes."
input_tutorial = {
        # 'business_problem': 'Different departments have their own databases and data sources, making it difficult to get a unified view of the business. Birst can integrate data from multiple sources to provide a single, cohesive view, enabling better decision-making.',
        # 'tutorial_title': 'How to resolve Data Silos.',
        'tutorial_steps': sent_tokenize(steps),
        'feedback': "Explain specific aspects of Birst features, such as explain what the differences are between different spaces. Refrain from using 'we' for a more formal tone."
    }
training_data.append(input_tutorial)

steps =  "Set up your Birst environment and create a space that is most appropriate for your project. Connect to Data sources by clicking on the 'add new source' option in the data sources section. You can connect to databases, cloud services, or other files. Go to the data preparation section and perform data cleansing actions. Apply transformations to normalize the data and validate the transformed data for accuracy and consistency. Create data models and define relationships between datasets for a more unified data model. Set up self-service portals, provide training sessions and resources, and create templates for reports and dashboards. Build reports and dashboards within your space. Share them with relevant stakeholders via links and emails."
input_tutorial2 = {
        # 'business_problem': 'Generating reports and performing data analysis is time-consuming and often requires IT intervention. Birst offers a self-service BI platform, allowing users to create reports and dashboards quickly without relying on IT.',
        # 'tutorial_title': 'Improving slow reporting and analysis.',
        'tutorial_steps': sent_tokenize(steps),
        'feedback': "The steps maintain a professional tone. Steps are too vague. Provide explanations for why the user has to perform these actions. Define the specifications of Birst features."
    }
training_data.append(input_tutorial2)

steps = "Set up your Birst environment and create a space that is most appropriate for your project. Connect to Data sources by clicking on the 'add new source' option in the data sources section. You can connect to databases, cloud services, or other files. Initiate data extraction from each connected source. Go to the data preparation section and perform data cleansing actions. Apply transformations to normalize the data and validate the transformed data for accuracy and consistency. Set up validation rules to ensure data integrity and accuracy and execute them. Create data models and define relationships. Build reports and dashboards within your space. Regularly monitor the data quality to ensure it remains accurate."
input_tutorial3 = {
        # 'business_problem': 'Inconsistent and inaccurate data leading to poor business decisions. Birst’s data preparation tools help clean and standardize data, ensuring that decisions are based on accurate and reliable information.',
        # 'tutorial_title': 'Decreasing inaccurate data with Birst.',
        'tutorial_steps': sent_tokenize(steps),
        'feedback': "The steps maintain a professional tone. Steps are too vague. Provide explanations for why the user has to perform these actions. Define the specifications of Birst features."
    }
training_data.append(input_tutorial3)

steps = "Connect to real-time data sources. Use Birst's data connectors to integrate real-time data with existing datasets. Create dashboards that visualize the data with live updates. Set up alerts and notifications to inform stakeholders of important real-time changes. Continuously monitor the performance of real-time data streams and optimize as needed."
input_tutorial4 = {
        # 'business_problem': 'Decisions are made based on outdated information. Birst can provide real-time analytics and dashboards, allowing businesses to make timely decisions based on the most current data available.',
        # 'tutorial_title': 'How to view real-time insights in Birst.',
        'tutorial_steps': sent_tokenize(steps),
        'feedback': "Steps are too vague, explain why the user has to perform these steps and what features to utilize. Steps should also include opening the Birst application and how to set up a workspace."
    }
training_data.append(input_tutorial4)

steps = "Establish clear data governance policies and document them. Use Birst's data governance tools to enforce access controls, data lineage, and data quality rules. Assign roles and permissions. Set up auditing and logging mechanisms to track data access and changes. Educate users on data governance policies."
input_tutorial5 = {
        # 'business_problem': 'Lack of data governance leading to compliance issues and data misuse. Birst includes data governance features to ensure that data is managed, controlled, and used correctly.',
        # 'tutorial_title': 'Managining poor data governance in Birst.',
        'tutorial_steps': sent_tokenize(steps),
        'feedback': "Steps are too vague, explain why the user has to perform these steps and what features to utilize. Specify where the user should establish clear data governance policies."
    }
training_data.append(input_tutorial5)

steps = "Connect and import data from various sources. Use Birst's automated data modeling tools to create initial data models. Make manual adjustments to the data model as needed to fit specific business requirements. Establish relationships between tables and datasets. Validate the data models to ensure accuracy."
input_tutorial6 = {
        # 'business_problem': "Difficulty in creating and maintaining complex data models. Birst's automated data modeling capabilities simplify the process, making it easier to create and manage data models. Explain how the user can set up auditing and logging mechanisms.",
        # 'tutorial_title': 'How to use Birst for complex data modeling.',
        'tutorial_steps': sent_tokenize(steps),
        'feedback': "Steps are too vague, explain why the user has to perform these steps and what features to utilize. Do not skip over steps in between, such as creating a work space when the user opens the Birst application."
    }
training_data.append(input_tutorial6)

steps = "Collaborate with stakeholders to identify key performance indicators relevant to the business. Define the metrics and data sources needed to calculate these indicators. Build customizable dashboards to visualize and track them. Establish targets and benchmarks for each KPI. Regularly review the dashboards to monitor performance."
input_tutorial7 = {
        # 'business_problem': 'Inability to track key performance indicators (KPIs) effectively. Birst allows businesses to define, track, and visualize KPIs through customizable dashboards and reports.',
        # 'tutorial_title': 'Improving ineffective performance metrics in Birst.',
        'tutorial_steps': sent_tokenize(steps),
        'feedback': "Steps are too vague, explain why the user has to perform these steps and what features to utilize. Do not skip over steps in between, such as creating a work space when the user opens the Birst application. Specify where the user can define the metrics and how to establish targets and benchmarks."
    }
training_data.append(input_tutorial7)

steps = "Connect to various customer data sources. Use Birst to unify customer data into a single dataset. Create segments based on customer behavior and preferences. Use the insights to drive marketing sales and customer service strategies."
input_tutorial8 = {
        # 'business_problem': 'Incomplete view of customer behavior and preferences. Birst can integrate customer data from various channels to provide a comprehensive view, aiding in better customer relationship management.',
        # 'tutorial_title': 'How to integrate customer data in Birst.',
        'tutorial_steps': sent_tokenize(steps),
        'feedback': "Steps are too vague, explain why the user has to perform these steps and what features to utilize. Do not skip over steps in between, such as creating a work space when the user opens the Birst application. Specify where the user can unify data and create segments in Birst."
    }
training_data.append(input_tutorial8)

steps = "Enable Birst's collaboration features. Create and share dashboards and reports with team members. Use annotations and comments to discuss insights and share feedback. Implement version control to manage changes and updates to reports and dashboards. Provide training on collaboration tools and best practices."
input_tutorial9 = {
        # 'business_problem': 'Difficulty in sharing insights and collaborating on data analysis across teams. Birst supports collaborative analytics, allowing team members to share reports, dashboards, and insights easily.',
        # 'tutorial_title': 'How to collaborate in Birst.',
        'tutorial_steps': sent_tokenize(steps),
        'feedback': "Steps are too vague, explain why the user has to perform these steps and what features to utilize. Do not skip over steps in between, such as creating a work space when the user opens the Birst application. Specify how the user can use annotations and how to implement version control."
    }
training_data.append(input_tutorial9)

steps = "Assess the current and future scalability needs of the business. Implement Birst in a cloud environment to leverage its scalable infrastrucutre. Monitor and manage cloud resources to ensure they are allocated efficiently. Set up automated scaling rules to handle increased data volumes and user numbers. Continuously monitor performance and optimize as needed."
input_tutorial10 = {
        # 'business_problem': "Existing business intelligence infrastructure can't keep up with the business growth. Birst is a scalable solution that can grow with the business, handling increasing data volumes and user numbers efficiently.",
        # 'tutorial_title': 'How to scale a solution in Birst.',
        'tutorial_steps': sent_tokenize(steps),
        'feedback': "Steps are too vague, explain why the user has to perform these steps and what features to utilize. Do not skip over steps in between, such as creating a work space when the user opens the Birst application. Specify how the user can implement Birst in a cloud environment."
    }
training_data.append(input_tutorial10)

steps = "Collaborate with stakeholders to identify areas where predictive analytics can add value. Gather historical data required for predictive modeling. Integrate Birst with advanced analytics tools. Develop predictive models using historical data and advanced analytics tools. Create dashboards to visualize future trends and forecasts. Continuously refine predictive models based on new data and feedback."
input_tutorial11 = {
        # 'business_problem': 'Inability to forecast future trends and performance. Birst integrates with advanced analytics tools to provide predictive analytics capabilities, helping businesses anticipate trends and plan accordingly.',
        # 'tutorial_title': 'How to make predictions for future trends and performance in Birst.',
        'tutorial_steps': sent_tokenize(steps),
        'feedback': "Steps are too vague, explain why the user has to perform these steps and what features to utilize. Do not skip over steps in between, such as creating a work space when the user opens the Birst application. Define specific criteria for historical data and how to develop predictive models using that data. Explain how to create dashboards with more detail and how to refine the models."
    }
training_data.append(input_tutorial11)

steps = "Emphasize Birst's user-friendly during training sessions. Provide comprehensive training on Birst's self-service capabilities. Develop onboarding programs to assist new users in getting started with Birst. Keep users informed about new features and updates. Offer ongoing support resources to help users navigate and use the business intelligence tools effectively."
input_tutorial12 = {
        # 'business_problem': "Low adoption rates of business intelligence tools among employees. Birst's user-friendly interface and self-service capabilities can drive higher adoption rates among non-technical users.",
        # 'tutorial_title': "Using Birst's user-friendly interface.",
        'tutorial_steps': sent_tokenize(steps),
        'feedback': "Define what onboarding programs will consist of. Provide examples for how the user can discuss new features and updates."
    }
training_data.append(input_tutorial12)

steps = "Conduct a security assessment to identify potential risks and compliance requirements. Implement robust access controls to restrict data access to authorized users only. Enable data encryption for data at rest and in transit. Configure Birst to comply with relevant industry standards and regulations. Set up audit logs to track data access and modifications for compliance purposes. Provide security training to users to ensure they understand best practices and compliance requirements."
input_tutorial13 = {
        # 'business_problem': 'Difficulty in ensuring data security and compliance with regulations. Birst offers robust security features and compliance controls to protect sensitive data and comply with industry standards.',
        # 'tutorial_title': 'Analyzing the supply chain in Birst.',
        'tutorial_steps': sent_tokenize(steps),
        'feedback': "Explain what does a security assessment look like. Also define how to use Birst to unify supply chain data into a single dataset. Specify where the user can identify key supply chain metrics to monitor in Birst and what analytical tools that can be used."
    }
training_data.append(input_tutorial13)

steps = "Connect to supply chain data sources. Use Birst to unify supply chain data into a single dataset. Identify key supply chain metrics to monitor. Create dashboards to visualize supply chain performance and track key metrics. Use Birst's analytical tools to identify bottlenecks and inefficiencies. Implement data-driven strategies to optimize suplly chain processes based on insights gained from Birst."
input_tutorial14 = {
        # 'business_problem': "Lack of visibility and inefficiencies in the supply chain. Birst can analyze supply chain data to identify bottlenecks, optimize processes, and improve overall efficiency.",
        # 'tutorial_title': 'How to view real-time insights in Birst.',
        'tutorial_steps': sent_tokenize(steps),
        'feedback': "Explain in detail how the user can connect to a supply chain data sources and what analytical tools that Birst has to offer. Also provide more instructions for how to use those tools in Birst. Define what bottlenecks are and how they affect the supply chain."
    }
training_data.append(input_tutorial14)


In [25]:
OUT_DIR='results_t5small'
# model_checkpoint = "t5-small"
# model2 = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
# tokenizer2 = T5Tokenizer.from_pretrained(model_checkpoint, legacy=False)

metric = load("rouge")

# 90% for training data and 10% for validation
train_data, validation_data = train_test_split(training_data, test_size=0.1)

dataset_dict = DatasetDict({
    'train': Dataset.from_list(train_data),
    'validation': Dataset.from_list(validation_data),
})

tokenized_data = dataset_dict.map(preprocess_function, batched=True)

training_args = TrainingArguments(
    output_dir=OUT_DIR,
    per_device_train_batch_size=32,
    num_train_epochs=50,
    logging_dir=OUT_DIR,
    logging_steps=5, 
    save_steps=10_000,
    save_total_limit=3
)


trainer = Trainer(
    model=model,
    args=training_args, 
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['validation']
)

trainer.train()


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Step,Training Loss
5,19.521
10,14.0747
15,10.3134
20,7.4265
25,5.044
30,3.4572
35,2.5159
40,2.0889
45,1.8268
50,1.7484


TrainOutput(global_step=50, training_loss=6.80167932510376, metrics={'train_runtime': 2235.1671, 'train_samples_per_second': 0.447, 'train_steps_per_second': 0.022, 'total_flos': 135341801472000.0, 'train_loss': 6.80167932510376, 'epoch': 50.0})

In [26]:
# Sample tutorial
problem = "Using a professional space gives business users access to Birst's self-service data preparation interface."
title = "How to use a professional space"
steps = "Collaborate with stakeholders to identify areas where predictive analytics can add value. Gather historical data required for predictive modeling. Integrate Birst with advanced analytics tools. Develop predictive models using historical data and advanced analytics tools. Create dashboards to visualize future trends and forecasts. Continuously refine predictive models based on new data and feedback."

custom_prefix = "Review the tutorial:"
input_text = f"{custom_prefix} {steps}"

# inputs = preprocess_input(problem, title, steps)
inputs = tokenizer(input_text, return_tensors='pt', max_length=512, truncation=True, padding='max_length')

# Get output (feedback)
outputs = model.generate(
    inputs.input_ids, 
    max_length=150, 
    num_beams=5,
    early_stopping=True
)

# Print output if it does not match the given input
for o in outputs:
    generated_sent = tokenizer.decode(o, skip_special_tokens=True)

    if (generated_sent.lower() != steps.lower()):
        print(generated_sent)
        
# generated_text = tokenizer2.decode(outputs[0], skip_special_tokens=True)
# print(generated_text)

Collaborate with stakeholders to identify areas where predictive analytics can add value. Gather historical data required for predictive modeling. Create dashboards to visualize future trends and forecasts. Continuously refine predictive models based on new data and feedback.


In [27]:
# Sentiment Analysis
def get_output():
    sentiment = SentimentIntensityAnalyzer()
    steps = "We can collaborate with stakeholders to identify areas where predictive analytics can add value."
    sent_res = sentiment.polarity_scores(steps)

    result_str = ""
    
    if "we" in steps.lower().split():
        result_str += "Refrain from using terms, such as 'we' and 'our'."
        result_str += " "
        
    if sent_res['neu'] == 1.0 or sent_res['neu'] >= 0.8:
        result_str +="Tutorial maintains a professional tone and clarity."
        result_str += " "
        
    else:
        result_str += "Tutorial should be written in a neutral tone to maintain professionalism."
        result_str += " "

    return result_str

In [58]:
m = tk.Tk()
m.title('Infor Developer Assistant (InDevA)')
m.geometry("600x400")

# Stores input tutorial
input = tk.StringVar() 

# Sentiment Analysis
def get_output(steps):
    sentiment = SentimentIntensityAnalyzer()
    sent_res = sentiment.polarity_scores(steps)

    result_str = ""
    
    if "we" in steps.lower().split():
        result_str += "Refrain from using terms, such as 'we' and 'our'."
        result_str += " "
        
    if sent_res['neu'] == 1.0 or sent_res['neu'] >= 0.8:
        result_str +="Tutorial maintains a professional tone and clarity."
        result_str += " "
        
    else:
        result_str += "Tutorial should be written in a neutral tone to maintain professionalism."
        result_str += " "

    return result_str

def submit():
    steps = input.get()
    result = get_output(steps)
    messagebox.showinfo("Result", result)
    
# Input label
input_label = tk.Label(m, text='Tutorial Input').grid(row=0)
# Entry box
entry = tk.Entry(m, textvariable=input, width=75)
entry.grid(row=0, column=1)
# Submit button
sub_bttn = tk.Button(m, text='Enter', command=submit)
sub_bttn.grid(row=1, column=1)
m.mainloop()

# DEMO INPUT
We can collaborate with stakeholders to identify areas where predictive analytics can add value.
