# Chat with your data

### This file is for educational purpose, it explains the flow from data to chat.

### The easiest way to run this file is in Google Colab.
### https://colab.research.google.com/
### Log in with your Google-account and add this file in "Ladda Upp".

### In Colab you can run all cells with Ctrl+F9 or press Shift+Enter to run chosen cell.


In [None]:
# In this cell we install our packages

!pip install -qU \
    langchain \
    openai==0.28.0 \
    pinecone-client==2.2.4 \
    tiktoken==0.5.1

In [21]:

import os


# Add your unique key here.
os.environ["OPENAI_API_KEY"] ="ENTER-YOUR-OPENAI-KEY-HERE"

### If you have the data in a excel-file you can use the read_csv function. We have removed the top line of the excel-file and and saved it as a csv before we insert the the path in the function.
### All the columns are used as the data. We choose to do this so it would be more easier to use any machine data. All the missing values will be representated by 'null'. This is not necessary but it will help to ai if the data is consistent.

In [31]:


import pandas as pd
import numpy as np

# If you are running this in Google Colab you have to add the file to a folder. Press the folder icon on the left side of screen. Add the path to your file below.
data = pd.read_csv(r"data\2830612_VMMS_EWO_KPI_clean.csv", sep=";")


data.replace({np.nan: 'null', 'NA': 'null', 'N/A': 'null', 'Missing': 'null'}, inplace=True)

In [None]:
# Take a look at the data. This code only shows the first five rows.
data.head()

Site                                                                                                                    SDE
EWO WO Object Id                                                                                                    2830612
EWO WO Object Description                                                                                     Sandberedning
EWO WO Machine Class PM                                                                                                   A
EWO WO Machine Class AM                                                                                                null
                                                                                                                  ...      
DT (Hours between EWO WO Break Down and Restart)                                                                       2,68
DTOP (Hours between EWO WO Break Down and EWO WO Reg Date +  Root Cause WO Real Finish and EWO WO Restart)             0,21
WTOP  (H

In [33]:
# We have custom made our own Documentloader.

from typing import Any, Iterator, List

import pandas as pd

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


class DataFrameLoader2(BaseLoader):
    """Load from a Pandas DataFrame."""

    def __init__(self, data_frame: Any, ewo_wo_no_column: str = "EWO WO No"):
        """Initialize with a Pandas DataFrame.

        Args:
            data_frame: Pandas DataFrame.
            ewo_wo_no_column: Name of the column to include as metadata. Defaults to "EWO WO No".
        """
        if not isinstance(data_frame, pd.DataFrame):
            raise ValueError(
                f"Expected data_frame to be a pd.DataFrame, got {type(data_frame)}"
            )
        self.data_frame = data_frame
        self.ewo_wo_no_column = ewo_wo_no_column

    def lazy_load(self) -> Iterator[Document]:
        """Lazy load records from the Pandas DataFrame."""

        for _, row in self.data_frame.iterrows():
            text = "\n".join([f"{key}: {value}" for key, value in row.items()])
            metadata = row.to_dict()
            ewo_wo_no_value = row[self.ewo_wo_no_column]
            metadata["EWO WO No"] = ewo_wo_no_value
            yield Document(page_content=text, metadata=metadata, column_names=list(self.data_frame.columns))

    def load(self) -> List[Document]:
        """Load full Pandas DataFrame."""
        return list(self.lazy_load())

### There ara many types of loaders that create the data into a Document. A Document consist of page_content and metadata
### In the different types of loaders you can specify which data/column is page_content and which is metadata.

### Read all about it here https://python.langchain.com/docs/modules/data_connection/document_loaders

In [34]:

loader = DataFrameLoader2(data)

data = loader.load()

In [35]:
data[0]

# Show the first Document

Document(page_content='Site: SDE\nEWO WO Object Id: 2830612\nEWO WO Object Description: Sandberedning\nEWO WO Machine Class PM: A\nEWO WO Machine Class AM: null\nEWO WO AM Step: 1.0\nEWO WO OP System: null\nEWO WO PM Step: 0.0\nEWO WO RP Capacity: null\nEWO WO RP Competence: null\nEWO WO RP Cond Wear: null\nEWO WO RP Method: null\nEWO WO RP Quality: null\nEWO WO RP Spare Part: null\nEWO WO RP EEM Sum: 0\nEWO WO Repl Policy: null\nEWO WO RP Phase: null\nEWO WO RP Status: null\nEWO WO Object Manufacturer No: null\nEWO WO Object Manufacturer Name: null\nEWO WO Object Model: null\nEWO WO Object Supplier Id: null\nEWO WO Object Supplier Name: null\nEWO WO Object Manufactured Date: null\nEWO WO Object Cost Center: 1127\nEWO WO Object Belongs to Obejct Id: G1-KMEB-SB-121\n040-PRODAREA: G1-KMEB\n070-STATION: G1-KMEB-SB-121\nEWO WO No: 18913\nEWO WO Directive: Otakt sandberedning\nEWO WO Fault Desc: E-bana. Anders 76313\nEWO WO Maint Org: G-EL\nEWO WO Break Down Date: 2013-12-12 05:40:00\nEWO W

In [37]:
data[0].metadata

# Run this cell to show the metadata

{'Site': 'SDE',
 'EWO WO Object Id': '2830612',
 'EWO WO Object Description': 'Sandberedning',
 'EWO WO Machine Class PM': 'A',
 'EWO WO Machine Class AM': 'null',
 'EWO WO AM Step': 1.0,
 'EWO WO OP System': 'null',
 'EWO WO PM Step': 0.0,
 'EWO WO RP Capacity': 'null',
 'EWO WO RP Competence': 'null',
 'EWO WO RP Cond Wear': 'null',
 'EWO WO RP Method': 'null',
 'EWO WO RP Quality': 'null',
 'EWO WO RP Spare Part': 'null',
 'EWO WO RP EEM Sum': 0,
 'EWO WO Repl Policy': 'null',
 'EWO WO RP Phase': 'null',
 'EWO WO RP Status': 'null',
 'EWO WO Object Manufacturer No': 'null',
 'EWO WO Object Manufacturer Name': 'null',
 'EWO WO Object Model': 'null',
 'EWO WO Object Supplier Id': 'null',
 'EWO WO Object Supplier Name': 'null',
 'EWO WO Object Manufactured Date': 'null',
 'EWO WO Object Cost Center': 1127,
 'EWO WO Object Belongs to Obejct Id': 'G1-KMEB-SB-121',
 '040-PRODAREA': 'G1-KMEB',
 '070-STATION': 'G1-KMEB-SB-121',
 'EWO WO No': 18913,
 'EWO WO Directive': 'Otakt sandberedning'

In [38]:
# Instead of the csv you can read in the data as a json-file. Add the file to your folder and change the file path in the code.

json = pd.read_json(r"C:\Users\debbi\NLP\vol.json", orient=True)

loader = DataFrameLoader2(json)
data = loader.load()


[Document(page_content='Site: SDE\nEWO WO Object Id: 2800053\nEWO WO Object Description: Loramendi 1\nEWO WO Machine Class PM: A\nEWO WO Machine Class AM: nan\nEWO WO AM Step: 3\nEWO WO OP System: nan\nEWO WO PM Step: 0\nEWO WO RP Capacity: nan\nEWO WO RP Competence: nan\nEWO WO RP Cond Wear: nan\nEWO WO RP Method: nan\nEWO WO RP Quality: nan\nEWO WO RP Spare Part: nan\nEWO WO RP EEM Sum: 0\nEWO WO Repl Policy: nan\nEWO WO RP Phase: nan\nEWO WO RP Status: nan\nEWO WO Object Manufacturer No: nan\nEWO WO Object Manufacturer Name: nan\nEWO WO Object Model: nan\nEWO WO Object Supplier Id: nan\nEWO WO Object Supplier Name: nan\nEWO WO Object Manufactured Date: nan\nEWO WO Object Cost Center: 1127\nEWO WO Object Belongs to Obejct Id: G1-KMEB-E1-102\n040-PRODAREA: G1-KMEB\n070-STATION: G1-KMEB-E1-102\nEWO WO No: 12466\nEWO WO Directive: bord går ej köra upp eller ner\nEWO WO Maint Org: G-L4-MEK\nEWO WO Break Down Date: 1385339400000\nEWO WO OP Arrives: 1385339400000\nEWO WO Reg Date: 13853412

### Embeddings and storing data

In [39]:
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings


embeddings = OpenAIEmbeddings()


In [None]:
# Here we assign the FAISS vector index to be the chats database. 
# FAISS (Facebook AI simularity search) is Meta library for handeling similarity searches..
db = FAISS.from_documents(data, embeddings)

In [None]:
# You can query your database.

query = db.similarity_search("Våg 1")


In [40]:
# The top most simulary documents to your query
query

[Document(page_content='Site: SDE\nEWO WO Object Id: 2800053\nEWO WO Object Description: Loramendi 1\nEWO WO Machine Class PM: A\nEWO WO Machine Class AM: nan\nEWO WO AM Step: 3\nEWO WO OP System: nan\nEWO WO PM Step: 0\nEWO WO RP Capacity: nan\nEWO WO RP Competence: nan\nEWO WO RP Cond Wear: nan\nEWO WO RP Method: nan\nEWO WO RP Quality: nan\nEWO WO RP Spare Part: nan\nEWO WO RP EEM Sum: 0\nEWO WO Repl Policy: nan\nEWO WO RP Phase: nan\nEWO WO RP Status: nan\nEWO WO Object Manufacturer No: nan\nEWO WO Object Manufacturer Name: nan\nEWO WO Object Model: nan\nEWO WO Object Supplier Id: nan\nEWO WO Object Supplier Name: nan\nEWO WO Object Manufactured Date: nan\nEWO WO Object Cost Center: 1127\nEWO WO Object Belongs to Obejct Id: G1-KMEB-E1-102\n040-PRODAREA: G1-KMEB\n070-STATION: G1-KMEB-E1-102\nEWO WO No: 40855047\nEWO WO Directive: Oljeläckage\nEWO WO Maint Org: G-KM-MEK\nEWO WO Break Down Date: 1681926600000\nEWO WO OP Arrives: 1681926900000\nEWO WO Reg Date: 1681927148000\nEWO WO Re

### The model 

In [42]:
from langchain.chat_models import ChatOpenAI

chat = ChatOpenAI(
    openai_api_key=os.environ["OPENAI_API_KEY"],
    model='gpt-4'
)


In [None]:
chat

### Prompt

In [44]:
# You can change the prompt but the {question} and {context} is key values that the models needs. Removing these will cause an error.

from langchain.prompts import PromptTemplate

template = """I will provide you with historical data regarding previous work orders (EWO) for a machine. By examining the historical data, you should respond by describing similar issues in the historical data and how those issues were resolved. Use all the columns and explain how a similar problem was previously resolved.

Before you do that, ask for a brief description of the issue that we will analyze using the historical data you've received.

This description may be concise and may not exactly match any previous ones in the history. Nevertheless, you should search the history for similar problems or solutions resembling the new issue description.

Especially, check what's mentioned in EWO WO Directive, EWO WO Work Done, and EWO WO Work Details. Also, think broadly because there might be misspellings or similar words with the same meaning.

Reply with the number of work orders available. Also, mention how many of them are similar to the issue we're currently analyzing.

Then, proceed with details about all of them. This includes all analyses you can perform about what was done, the time taken, parts replaced, downtime, etc. Time analyses are crucial. Specify the time taken to resolve the issue, the duration from discovery to operation, and, most importantly, thoroughly analyze the root cause. Include all time-related information.

If you encounter any issues while reading the file, do not mention it, just continue. Do not describe all the steps you take. The information you are getting is EWO and you will analyzie it with as much information as needed. The EWO WO ID is the link between the texts. If I ask you for date or time, this is the correct format to look for: YYYY-MM-DD HH:MI:SS. Give the answer about the date if someone asks. If you don't know the answer, just say that you don't know, don't try to make up an answer. If you encounter  some technical terms may not have direct translations in English write the whole answer in english.
If you are looking for a datetime the format to look for is YYYY:MM:DD HH:MI:SS. Don't answer if the chat_history is empty.
{context}


Question: {question}

Answer in Swedish:"""


PROMPT = PromptTemplate(
    template=template, input_variables=["question", "context"]
)

### If you want the chat to remember the conversation you need a memory.

In [45]:
from langchain.memory import ConversationBufferMemory

memory = ConversationBufferMemory(memory_key="chat_history")

In [46]:
memory

ConversationBufferMemory(memory_key='chat_history')

### Time for the final step - putting all the pieces together

In [49]:
from langchain.chains import RetrievalQA

gpt_pipeline = RetrievalQA.from_chain_type(
    llm=chat, chain_type='stuff',
    chain_type_kwargs = {"prompt": PROMPT},
    memory = memory,
    retriever=db.as_retriever()
)

In [52]:
gpt_pipeline("Hej")

{'query': 'Hej',
 'chat_history': '',
 'result': 'HALLO! Hur kan jag hjälpa dig idag?'}

In [None]:

# If you print the pipeline you can see all the information about the object.
print(gpt_pipeline)

In [105]:
gpt_pipeline("Kan du hjälpa mig hitta ett datum för när vi senast hade problem med sandvagnar")

{'query': 'Kan du hjälpa mig hitta ett datum för när vi senast hade problem med sandvagnar',
 'chat_history': 'Human: Kan du hjälpa mig hitta ett datum för detta?\nAI: Självklart, skulle du kunna vara mer specifik om vilket datum du söker? Till exempel, söker du efter datumet för när underhållet startade, när det avslutades, eller något annat?\nHuman: Kan du hjälpa mig hitta ett datum för detta, när hände det först?\nAI: Den första händelsen som du nämnde inträffade den 15 januari 2019. Det framgår av värdena för "EWO WO Break Down Date", som är märkt som 1547046000000 i Unix-tid, vilket motsvarar den 15 januari 2019.\nHuman: Hej\nAI: HALLO! Hur kan jag hjälpa dig idag?\nHuman: Kan du hjälpa mig hitta ett datum för när vi senast hade problem med våg 1?\nAI: HALLO! Jag beklagar, men jag har ingen information om "våg 1" i datan du gav mig. Om du har mer specifik information eller data, vänligen dela den med mig så att jag kan hjälpa dig bättre.\nHuman: Kan du hjälpa mig hitta ett datum f

In [106]:
gpt_pipeline("Kan du hämta Kaizen åtgärd för detta?")

{'query': 'Kan du hämta Kaizen åtgärd för detta?',
 'chat_history': 'Human: Kan du hjälpa mig hitta ett datum för detta?\nAI: Självklart, skulle du kunna vara mer specifik om vilket datum du söker? Till exempel, söker du efter datumet för när underhållet startade, när det avslutades, eller något annat?\nHuman: Kan du hjälpa mig hitta ett datum för detta, när hände det först?\nAI: Den första händelsen som du nämnde inträffade den 15 januari 2019. Det framgår av värdena för "EWO WO Break Down Date", som är märkt som 1547046000000 i Unix-tid, vilket motsvarar den 15 januari 2019.\nHuman: Hej\nAI: HALLO! Hur kan jag hjälpa dig idag?\nHuman: Kan du hjälpa mig hitta ett datum för när vi senast hade problem med våg 1?\nAI: HALLO! Jag beklagar, men jag har ingen information om "våg 1" i datan du gav mig. Om du har mer specifik information eller data, vänligen dela den med mig så att jag kan hjälpa dig bättre.\nHuman: Kan du hjälpa mig hitta ett datum för när vi senast hade problem med sandvagn

In [73]:

gpt_pipeline("Har detta hänt tidigare? Hur löstes det då?")

{'query': 'Har detta hänt tidigare? Hur löstes det då?',
 'chat_history': 'Human: Vi har problem med Våg 1. Har detta hänt tidigare?\nAI: Som en AI har jag ingen direkt tillgång till tidigare specifika händelser eller datahistorik. Jag rekommenderar att du kontrollerar din egen underhålls- och servicehistorik för Våg 1 eller pratar med en tekniker i ditt team.',
 'result': 'Ja, detta har hänt tidigare. I ett tidigare fall (EWO WO No: 40044019) visade felet sig vara relä K122.8 i gasanläggningen (Gasning klar, Loramendi 1). Relät byttes 2019-01-15, samt motsvarande reläer till Loramendi 2 (K162.8) och 3 (K202.8). Dessutom byttes en givare för huvudcylinder nere. I ett annat fall (EWO WO No: 40044174) byttes relä K122.8 i gasanläggningen.'}

In [None]:
gpt_pipeline("Hur långt tid tog det att lösa problem med Våg 1 vid varje fall?")

In [None]:
gpt_pipeline("Finns det en Kaizen åtgärds gjort på fall rörande våg 1?")


In [None]:
gpt_pipeline("vilken tidpunkt löstes det?")