In [None]:
#Supporting function for analysis 
import os
import re
import pandas as pd
import time


#agent id from mistral console
agent = "ag:ac256ded:20250525:impact-research:a7aac9eb"

def process_change_requests(model_path, change_requests_path, client, cycle_number, pause = 0):
    """
    Run the impact-analysis agent on every change request and collect the results.

    The model file is read as plain text and embedded into the prompt. The change
    requests file is read line by line; blank lines are ignored and the remaining
    lines are numbered from 1. For each request the agent identified by the
    module-level ``agent`` id is called once, the ``&&&``-delimited block of the
    answer is parsed into (component, reasoning) pairs, and one row per pair is
    produced. Nothing is written to disk apart from creating the ``outputs``
    directory.

    Args:
        model_path (str): Path to the model data file (read as UTF-8 text)
        change_requests_path (str): Path to the change requests text file, one request per line
        client: Chat client exposing ``client.agents.complete(agent_id=..., messages=...)``
        cycle_number: Value stored in the ``Cycle`` column of every returned row
        pause (float): Seconds to sleep after each request (default 0)
    Returns:
        pd.DataFrame: Rows for all change requests with the columns
        ``Component``, ``Reasoning``, ``Change`` (1-based request number) and ``Cycle``.
        Empty (but with these columns) when no answer could be parsed.
    """
    result_columns = ["Component", "Reasoning", "Change", "Cycle"]

    # Function to extract final answer block from response
    def extract_suggestions(text):
        code_blocks = re.findall(r"&&&(.*?)&&&", text, re.DOTALL)
        return code_blocks[0].strip() if code_blocks else ""

    # Function to turn the answer block into (component, reasoning) pairs
    def parse_entries(block_text):
        pairs = []
        # Entries are separated by a blank line; the separator line may contain stray spaces
        for entry in re.split(r"\n\s*\n", block_text):
            component_match = re.search(r'Component:\s*(.*)', entry)
            reasoning_match = re.search(r'Reasoning:\s*(.*)', entry, re.DOTALL)
            if component_match and reasoning_match:
                component = component_match.group(1).strip()
                reasoning = reasoning_match.group(1).strip().replace('\n', ' ')
                pairs.append((component, reasoning))
        return pairs

    # Load model data
    with open(model_path, "r", encoding="utf-8") as f:
        model_data = f.read()

    # Load change requests
    with open(change_requests_path, "r", encoding="utf-8") as f:
        change_requests = [line.strip() for line in f if line.strip()]

    # Create outputs directory if it doesn't exist
    os.makedirs("outputs", exist_ok=True)

    # Rows are collected in a plain list and converted to a dataframe once at the end
    records = []

    # Process each change request
    for idx, change_request in enumerate(change_requests, start=1):
        print(f"Processing request {idx}/{len(change_requests)}...")

        prompt = f"""
        You are the system engineer. You have to conduct impact analysis of incoming change request. You have model data as a text. In the backtick brackets the elements with attributes and the explicit connections with other elements are provided. Model data:
        ```{model_data}```
        The change request:`{change_request}`. List the components in the system that could be influenced by that change with the reasoning. Please, be specific! Please, provide at first the components that directly affected by that change, and then components affected by the change of the first set. So, in the intermediate answer I expect 1st level of change and then 2nd level of change caused by 1st level changes. After that provide the final output with format provided in Final Output Example with ONLY 1st level of changes
        Final Output Example:
        &&&
        Component: <component_name>  
        Reasoning: <reasoning_text>  

        Component: <component_name>  
        Reasoning: <reasoning_text>  
        &&&
        Provide no other text that required by your task.
        """
        chat_response = client.agents.complete(
            agent_id=agent,
            messages=[{"role": "user", "content": prompt}]
        )

        # An empty/missing content is treated like an answer without a result block
        response_text = chat_response.choices[0].message.content or ""

        data_string = extract_suggestions(response_text)

        if data_string:
            for component, reasoning in parse_entries(data_string):
                records.append({
                    "Component": component,
                    "Reasoning": reasoning,
                    "Change": idx, # Add change request ID
                    "Cycle": cycle_number
                })
        else:
            print(f"No valid output for change request {idx}")

        # Pause after every request, including unparsable ones, to respect rate limits
        time.sleep(pause)

    return pd.DataFrame(records, columns=result_columns)

In [None]:
import os
import pandas as pd
import time
from mistralai import Mistral

# Initialize
api_key = os.getenv("MISTRAL_API_KEY")
if not api_key:
    # Fail before the first request rather than with an authentication error mid-run
    raise RuntimeError("MISTRAL_API_KEY environment variable is not set")
client = Mistral(api_key=api_key)

# Paths
model_path = r"C:\Users\Valentin\YandexDisk\Учеба\skoltech\research\data_for_impact_analysis\Gekta\gekta-export.json"
change_requests_path = r"C:\Users\Valentin\YandexDisk\Учеба\skoltech\research\data_for_impact_analysis\Gekta\change_requests.txt"

num_cycles = 20
pause_between_cycles = 2  # seconds between full cycles
pause_between_requests = 1  # seconds between individual change requests

# One dataframe per finished cycle; they are concatenated once after the loop
cycle_frames = []

try:
    for cycle in range(1, num_cycles + 1):
        print(f"\n=== Running cycle {cycle}/{num_cycles} ===")

        # Process requests with cycle number tracking
        df = process_change_requests(
            model_path,
            change_requests_path,
            client,
            cycle_number=cycle,
            pause=pause_between_requests  # Pass delay to processing function
        )
        cycle_frames.append(df)

        # Pause between cycles (except after last one)
        if cycle < num_cycles:
            print(f"Waiting {pause_between_cycles} seconds before next cycle...")
            time.sleep(pause_between_cycles)
finally:
    # Runs even when a request fails or the run is interrupted, so the cycles
    # completed so far are still available in all_runs_df
    all_runs_df = pd.concat(cycle_frames, ignore_index=True) if cycle_frames else pd.DataFrame()


=== Running cycle 1/20 ===
Processing request 1/7...
Processing request 2/7...
Processing request 3/7...
Processing request 4/7...
Processing request 5/7...
Processing request 6/7...
Processing request 7/7...
Waiting 2 seconds before next cycle...

=== Running cycle 2/20 ===
Processing request 1/7...
Processing request 2/7...
Processing request 3/7...
Processing request 4/7...
Processing request 5/7...
Processing request 6/7...
Processing request 7/7...
Waiting 2 seconds before next cycle...

=== Running cycle 3/20 ===
Processing request 1/7...
Processing request 2/7...
Processing request 3/7...
Processing request 4/7...
Processing request 5/7...
Processing request 6/7...
Processing request 7/7...
Waiting 2 seconds before next cycle...

=== Running cycle 4/20 ===
Processing request 1/7...
Processing request 2/7...
Processing request 3/7...
Processing request 4/7...
Processing request 5/7...
Processing request 6/7...
Processing request 7/7...
Waiting 2 seconds before next cycle...

===

In [9]:
# Write the accumulated rows of every cycle to a single CSV file
output_dir = "outputs"
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "all_runs_combined.csv")
all_runs_df.to_csv(output_path, index=False)
print(f"\nSaved combined results to {output_path}")


Saved combined results to outputs\all_runs_combined.csv


In [None]:
# Build and print one sample prompt exactly as it would be sent to the LLM
change_request = "The microplate well volume has been changed from 400 ml to 600 ml."

# The model export is embedded into the prompt as raw text
with open(model_path, "r", encoding="utf-8") as f:
    model_data = f.read()

prompt = f"""
        You are the system engineer. You have to conduct impact analysis of incoming change request. You have model data as a text. In the backtick brackets the elements with attributes and the explicit connections with other elements are provided. Model data:
        ```{model_data}```
        The change request:`{change_request}`. List the components in the system that could be influenced by that change with the reasoning. Please, be specific! Please, provide at first the components that directly affected by that change, and then components affected by the change of the first set. So, in the intermediate answer I expect 1st level of change and then 2nd level of change caused by 1st level changes. After that provide the final output with format provided in Final Output Example with ONLY 1st level of changes
        Final Output Example:
        &&&
        Component: <component_name>  
        Reasoning: <reasoning_text>  

        Component: <component_name>  
        Reasoning: <reasoning_text>  
        &&&
        Provide no other text that required by your task.
        """
print(prompt)

In [None]:
# Replace the numeric change ids with the text of the corresponding change request
import pandas as pd
import os
change_requests_path = r"C:\Users\Valentin\YandexDisk\Учеба\skoltech\research\data_for_impact_analysis\Gekta\change_requests.txt"

# Non-blank lines of the requests file, numbered from 1, give the id -> text lookup
with open(change_requests_path, "r", encoding="utf-8") as f:
    change_text = [line.strip() for line in f if line.strip()]
change_map = dict(enumerate(change_text, start=1))

# Swap the numeric id in the "Change" column for the request text and export the result
df = pd.read_csv(r"C:\Users\Valentin\YandexDisk\Учеба\skoltech\research\data_for_impact_analysis\Gekta\outputs\impacts1.csv")
df["Change"] = df["Change"].map(change_map)
df.to_csv(os.path.join("outputs", "to-the-form.csv"), index = False)

In [None]:
#Calculate the scores
import pandas as pd

# LLM data: only the rows of cycle 1, reduced to the two columns used for scoring
df1 = pd.read_csv(r".\outputs\all_runs_combined.csv").loc[
    lambda frame: frame["Cycle"] == 1, ["Component", "Change"]
]

# Human data, reduced to the same two columns
df2 = pd.read_csv(r".\team_data\lead.csv")[["Component", "Change"]]



{'Pump', 'Microcomputer', 'Alarm sub-system', 'Reagent volume', 'Software', 'Hardware', 'Gekta', 'Camera for macro imaging', 'RPR reaction analyzer', 'Maintanance personel', 'CNC base frame', 'Railings', 'Laboratory'}


In [46]:
def common_components_for_change(change_val, df1, df2):
    """
    Calculate the alignment score for one change value.

    The score is the share of the engineer's distinct components for this change
    that also appear among the LLM's components for the same change.

    Args:
        change_val: id of the change
        df1: llm dataframe with "Component" and "Change" columns
        df2: engineer dataframe with "Component" and "Change" columns

    Returns:
        float: value in [0, 1]; NaN when the engineer dataframe has no component
        for this change, because the ratio is undefined in that case.
    """
    # Distinct components per source; missing component names are ignored
    llm_components = set(df1.loc[df1['Change'] == change_val, 'Component'].dropna())
    engineer_components = set(df2.loc[df2['Change'] == change_val, 'Component'].dropna())

    if not engineer_components:
        return float("nan")

    # Both sides are sets, so duplicated engineer rows cannot deflate the score
    common = llm_components & engineer_components
    return len(common) / len(engineer_components)


change_requests_path = r"C:\Users\Valentin\YandexDisk\Учеба\skoltech\research\data_for_impact_analysis\Gekta\change_requests.txt"
with open(change_requests_path, "r", encoding="utf-8") as f:
    change_text = [line.strip() for line in f if line.strip()]

# Change ids to score. NOTE(review): hard-coded to ids 1..8 — confirm this matches
# the number of change requests present in the engineer data.
change_ids = range(1, 9)

# Named alignment_scores rather than `list`: rebinding the builtin would break any
# later cell that calls or passes `list`.
alignment_scores = [common_components_for_change(change, df1, df2) for change in change_ids]

# Last expression of the cell: display the scores in change-id order
alignment_scores

[0.5,
 0.6666666666666666,
 0.3333333333333333,
 0.0,
 0.0,
 0.3333333333333333,
 0.0,
 0.0]

In [28]:
import pandas as pd
import re

# Load the CSV data
csv_file = r"C:\Users\Valentin\YandexDisk\Учеба\skoltech\research\data_for_impact_analysis\Gekta\team_data\lead_survey.csv"  # Replace with your actual file path

# Read without a header row: row 0 is used for the question text, row 1 for the answers.
# The raw frame gets its own name so re-running this cell always starts from the file.
survey_raw = pd.read_csv(csv_file, header=None)

# Extract the participant name from the second row, column 1
participant_name = survey_raw.iloc[1, 1]

# Prepare a list to hold the processed data
processed_data = []

# Iterate over columns 2 to 25 (indexes 2 to 25)
for col_idx in range(2, 26):
    # Extract the change description from the first row
    change_text = survey_raw.iloc[0, col_idx]

    # Extract the score from the second row
    score = survey_raw.iloc[1, col_idx]

    # Each pattern captures the rest of the line after its label; a missing label
    # falls back to "" (change) or "Unknown" (component)
    change_match = re.search(r"Change:\s*(.*)", change_text)
    component_match = re.search(r"Component:\s*(.*)", change_text)

    # Append the structured data
    processed_data.append({
        "Component": component_match.group(1) if component_match else "Unknown",
        "Change": change_match.group(1) if change_match else "",
        "Score": score,
        "Participant name": participant_name
    })

# Convert the list to a DataFrame
df = pd.DataFrame(processed_data)

# With header=None the score cells share a column with the question text and are
# therefore read as strings; convert them so numeric comparisons work.
# Values that are not numbers become NaN.
df["Score"] = pd.to_numeric(df["Score"], errors="coerce")



In [None]:


# Read the change requests from the text file.
# UTF-8 is stated explicitly (as in the other reads of change_requests.txt in this
# notebook) so the result does not depend on the platform's default encoding.
with open('./change_requests.txt', 'r', encoding='utf-8') as file:
    change_requests = [line.strip() for line in file if line.strip()]

# Create a dictionary mapping change descriptions to their IDs (1-based position
# among the non-blank lines)
change_to_id = {change: idx for idx, change in enumerate(change_requests, start=1)}

# Function to find the matching change ID
# Function to find the matching change ID
def find_change_id(change_description, mapping=None):
    """
    Return the id of the change request that matches a description.

    An exact match is tried first. Otherwise the first mapping entry whose text
    contains the description, or is contained in it, wins.

    Args:
        change_description: Text to look up. Surrounding whitespace is ignored.
        mapping (dict | None): Change text -> id lookup. Defaults to the
            module-level ``change_to_id``.

    Returns:
        The matching id, or None when the description is not a string, is blank,
        or matches no entry.
    """
    if mapping is None:
        mapping = change_to_id

    # Missing values (None / NaN) cannot be matched
    if not isinstance(change_description, str):
        return None

    wanted = change_description.strip()
    # A blank string is a substring of every entry and would always hit the first one
    if not wanted:
        return None

    # Try exact match first
    if wanted in mapping:
        return mapping[wanted]

    # Fall back to substring containment in either direction
    for change, change_id in mapping.items():
        if change and (change in wanted or wanted in change):
            return change_id

    return None

# Look up the change id of every row and store it in a new column
df['change_id'] = df["Change"].apply(find_change_id)

# Rows for which no id was found
unmatched_rows = df['change_id'].isnull()

if not unmatched_rows.any():
    print("All change requests successfully matched with IDs")
else:
    unmatched_count = unmatched_rows.sum()
    print(f"Warning: Could not match {unmatched_count} change requests to IDs")
    print(df.loc[unmatched_rows, 'Change'].unique())


All change requests successfully matched with IDs


Unnamed: 0,Component,Change,Score,Participant name,change_id
0,Pump,The pipetting precision has been increased fro...,5,1,1
1,CNC base frame,The pipetting precision has been increased fro...,1,1,1
2,Reagent volume,The pipetting precision has been increased fro...,2,1,1
3,CNC and pump control algorithm,The pipetting precision has been increased fro...,5,1,1
4,G-code generator from path defined,The pipetting precision has been increased fro...,1,1,1
5,Railings,The microplate size has been changed from 130x...,5,1,2
6,CNC base frame,The microplate size has been changed from 130x...,5,1,2
7,Pump,The microplate size has been changed from 130x...,1,1,2
8,Camera for macro imaging,The microplate size has been changed from 130x...,2,1,2
9,CNC and pump control algorithm,The required analysis time for one sample has ...,5,1,3


In [None]:
# Create dataframes
df1 = df.copy()
df2 = pd.read_csv(r"C:\Users\Valentin\YandexDisk\Учеба\skoltech\research\data_for_impact_analysis\Gekta\team_data\lead.csv")

# Step 1: Get components with score >=3 for each change_id
# Scores are coerced to numbers first so the comparison also works when they were read as text
numeric_scores = pd.to_numeric(df1['Score'], errors='coerce')
high_score_components = (
    df1[numeric_scores >= 3]
    .groupby('change_id')['Component']
    .apply(lambda components: components.tolist())
    .reset_index()
)
print("Components with score >=3 by change_id:")
print(high_score_components)

# Step 2: Find components in high_score_components not present in df2
# NOTE(review): the reference "Change" column is assumed to hold either numeric
# change ids or the change text — confirm against lead.csv.
reference_uses_ids = pd.api.types.is_numeric_dtype(df2['Change'])

missing_components = []

for change_id, components in high_score_components.values:
    for component in components:
        change_description = df1[(df1['change_id'] == change_id) &
                               (df1['Component'] == component)]['Change'].iloc[0]

        # Compare the whole change, not only its first word
        if reference_uses_ids:
            same_change = df2['Change'] == change_id
        else:
            same_change = df2['Change'].astype(str).str.contains(str(change_description), regex=False)

        match = df2[(df2['Component'] == component) & same_change]

        if match.empty:
            missing_components.append({
                'change_id': change_id,
                'Component': component,
                'Change': change_description
            })

# Explicit columns keep the frame well-formed when nothing is missing
missing_df = pd.DataFrame(missing_components, columns=['change_id', 'Component', 'Change'])
print("\nComponents not found in reference dataframe:")
print(missing_df)