## Environment Setup

In [38]:
# Install a blanket "ignore" filter so that no Python warning is shown for the
# rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas / openai /
# langchain — consider narrowing the filter to specific categories.
import warnings
warnings.filterwarnings("ignore")

import re
import pandas as pd
import json
import openai
import random
import time
from tenacity import (retry, stop_after_attempt, wait_random_exponential)
from langchain.output_parsers import ResponseSchema, StructuredOutputParser

# Remove pandas' display limits: no cap on the number of rows or columns
# shown, and no fixed output width.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.width = None

# Set LangChain's module-level `debug` flag to False (debugging mode off).
# NOTE(review): this import sits apart from the main import block above.
import langchain
langchain.debug = False

In [39]:
# Load the raw dataset from its JSON file into a DataFrame.
df = pd.read_json(path_or_buf="../01.input_files/datafinal.json")

# Write a CSV copy of the freshly loaded data (row index omitted).
df.to_csv(path_or_buf='../01.input_files/datafinal_csv.csv', index=False)

In [40]:
# OPENAI API KEY
# Read the key from the OPENAI_API_KEY environment variable so that no secret
# is ever typed into (or committed with) the notebook. When the variable is
# not set the key stays an empty string, exactly like the original
# placeholder, and must be supplied before any API call is made.
import os
openai.api_key = os.environ.get('OPENAI_API_KEY', '')

## Preprocessing

In [41]:
def add_backslash(text):
    """Return ``text`` with a backslash inserted before every double quote,
    single quote and forward slash; all other characters are left untouched."""
    escaped_text = re.sub(r'([\"\'/])', r'\\\1', text)
    return escaped_text

In [42]:
# Escape quotes and slashes in every text, then turn real newline and tab
# characters into their two-character literal forms (backslash-n, backslash-t).
# NOTE(review): df['Text'] is overwritten in place, so running this cell a
# second time escapes the already-escaped text again.
df['Text'] = df['Text'].apply(
    lambda raw_text: add_backslash(raw_text).replace('\n', '\\n').replace('\t', '\\t')
)

In [43]:
# Keep only the rows whose CodeList is not an empty string and draw a random
# sample from them. The seed is fixed so the same rows are selected on every
# Restart & Run All, which keeps the LLM experiment reproducible.
SAMPLE_SIZE = 50
SAMPLE_SEED = 42
rows_with_code = df[df['CodeList'] != '']
# Never request more rows than exist: sample() raises ValueError otherwise.
filtered_df = rows_with_code.sample(
    n=min(SAMPLE_SIZE, len(rows_with_code)),
    random_state=SAMPLE_SEED,
)

In [44]:
# Display the (rows, columns) shape of the sampled frame.
filtered_df.shape

(50, 4)

## Prompt Testing

In [34]:
@retry(wait=wait_random_exponential(min=60, max=65), stop=stop_after_attempt(6))
def text_extraction(text_var):
    """Ask gpt-3.5-turbo to extract code and commands from ``text_var``.

    The text is placed between <text> tags inside a fixed, step-by-step prompt
    that requests a fenced JSON object with the single key 'code'. The
    decorator re-runs the call when it raises: at most 6 attempts, with a
    randomized exponential wait configured with min=60 and max=65.

    Args:
        text_var: The text to search for code; it is concatenated into the
            prompt as-is.

    Returns:
        The message content of the first choice in the API response, i.e. the
        raw reply of the model.

    NOTE(review): a later cell defines another function with this same name,
    which replaces this one in the kernel once that cell has run.
    """
    instructions_before_text = """
    Extract the code and command from text delimited by <text> tag. Let's work this out in a step by step way to be sure we have the right answer.
	STEP 1: Identify all special characters and program language specific keyword specified in the provided text.
	STEP 2: Examine the words surrounding the special characters closely to determine the programming language and tools being referenced.
    STEP 3: Extract all programming code and tool-specific commands that belong to the all programming languages and tools identified in STEP 2.if there are multiple codes and commands, separate them by a comma.
    Format the response as a JSON object with the key 'code'. If no code is found, the 'code' key should contain an empty string. Don't output the programming language.
    Keywords: code
    text:<text> 
    """
    instructions_after_text = """</text>
    ````
    The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "json```":
    ```json
    {
    "code": "executable programming code, tool specific command"
    }```
    """
    full_prompt = instructions_before_text + text_var + instructions_after_text

    # temperature=0 is passed to the API together with a single user message.
    completion = openai.ChatCompletion.create(
        temperature=0,
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": full_prompt}],
    )
    # Pause for one second after each completed request.
    time.sleep(1)
    first_choice = completion["choices"][0]
    return first_choice["message"]["content"]

# Send every sampled text to the LLM and keep the raw reply. Only the 'Text'
# column is needed, so the function is applied to that Series directly rather
# than row-wise over the whole frame: this avoids building a Series per row
# and still yields a Series (not a DataFrame) when the frame is empty.
filtered_df['llm_response'] = filtered_df['Text'].apply(text_extraction)

In [35]:
def text_extraction(text_var):
    """Pull the 'code' field out of a raw LLM reply.

    A StructuredOutputParser is built from a single 'code' ResponseSchema and
    used to parse ``text_var``; the value stored under 'code' is returned.

    Args:
        text_var: The raw reply text produced by the LLM.

    Returns:
        The parsed 'code' value, or ``text_var`` unchanged when building the
        parser or parsing the reply raises an ordinary exception (best-effort
        fallback so one malformed reply does not abort the whole column).

    NOTE(review): this definition reuses the name of the LLM-calling function
    defined in an earlier cell and replaces it once this cell has run.
    """
    try:
        code_schema = ResponseSchema(name="code", description="executable programming code with comment, tool specific command")
        output_parser = StructuredOutputParser.from_response_schemas([code_schema])
        parsed_reply = output_parser.parse(text_var)
        return parsed_reply.get('code')
    except Exception:
        # Catch Exception rather than using a bare except, so KeyboardInterrupt
        # and SystemExit still propagate and the run can be interrupted.
        return text_var

In [36]:
# Run every raw LLM reply through text_extraction and store the result in a
# new 'code' column.
filtered_df['code'] = filtered_df['llm_response'].apply(text_extraction)

In [37]:
# Write the sampled rows, including the added columns, to a CSV file in the
# current working directory (row index omitted).
filtered_df.to_csv(path_or_buf='filtered_result_csv.csv', index=False)

# Inference

The prompt does what we expected it to do. It works best when it contains no extra blank lines and is sent as a single piece. To make it work even better, single and double quote characters (' and ") in the input text should be escaped with a backslash so that they are treated as part of the string itself. How the prompt was developed:

When I simply asked the Large Language Model (LLM) to find code, it didn't work very well. When I instead asked the LLM to identify the programming language and then the related code, it worked well.
To improve its performance even further, I instructed the LLM to locate special characters and keywords, and to consider the words around them for identifying the programming language and related code. This approach worked very well.