In [6]:
import os

import numpy as np
import pandas as pd
from openai import OpenAI
from sklearn.linear_model import Ridge
from sklearn.multioutput import MultiOutputRegressor
from tqdm import tqdm

# === 1. CONFIGURATION ===
# The API key is read from the OPENAI_API_KEY environment variable so that no
# secret is ever stored in the notebook source or its saved outputs.
# NOTE: any key that was ever pasted into this cell must be treated as leaked
# and revoked in the OpenAI dashboard.
API_KEY = os.environ.get("OPENAI_API_KEY")
if not API_KEY:
    # Fail early with an actionable message instead of on the first API call.
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in the environment before "
        "starting the notebook kernel."
    )

# Embedding model used as the default by get_embedding().
openai_model = "text-embedding-3-small"

client = OpenAI(api_key=API_KEY)

# === 2. LOAD AND PREPARE TRAINING DATA ===
df_train = pd.read_excel("test01.xlsx")

# Build the model input: the three descriptive columns, each cast to str and
# joined with " | " into a single "text" column.
systeme_str = df_train["Systeme"].astype(str)
description_str = df_train["Description"].astype(str)
equipment_str = df_train["Description de l'Ã©quipement"].astype(str)
df_train["text"] = systeme_str + " | " + description_str + " | " + equipment_str

# Keep only rows where all three target scores are present.
train_target_cols = ["FiabilitÃ© IntÃ©gritÃ©", "DisponibiltÃ©", "Process Safety"]
df_train = df_train.dropna(subset=train_target_cols)

# Inputs as a plain list of strings, targets as a 2-D array (one column per score).
X_train_text = df_train["text"].tolist()
y_train = df_train[train_target_cols].values


# === 3. EMBEDDING FUNCTION ===
def get_embedding(text: str, model: str = openai_model, fallback_dim: int = 1536) -> list:
    """Return the embedding vector of ``text`` from the OpenAI embeddings API.

    The text is sent as a single-element batch through the module-level
    ``client`` and the embedding of that one element is returned.

    Args:
        text: The string to embed.
        model: Name of the embedding model. The default is the value that
            ``openai_model`` had when this function was defined.
        fallback_dim: Length of the all-zero vector returned when the request
            fails. It must equal the output length of ``model`` so that
            vectors from successful and failed calls can be stacked into one
            rectangular array; 1536 is the value this notebook uses with its
            default model.

    Returns:
        The embedding as a list of floats, or ``[0.0] * fallback_dim`` if the
        API call raised. This is a deliberate best-effort fallback: a failure
        is only reported via ``print``, so callers that need to detect failed
        rows must check for an all-zero vector.
    """
    try:
        response = client.embeddings.create(input=[text], model=model)
        return response.data[0].embedding
    except Exception as e:
        # str() keeps the handler itself from raising when a non-string
        # (e.g. a float NaN) was passed in, which would defeat the fallback.
        print(f"Error embedding text: {str(text)[:50]}... -> {e}")
        return [0.0] * fallback_dim  # fallback in case of failure


# === 4. EMBED TRAINING TEXTS ===
# One get_embedding call per training text, in order; tqdm shows progress.
print("ðŸ”„ Embedding training data...")
train_vectors = list(map(get_embedding, tqdm(X_train_text)))
X_train_embedded = np.array(train_vectors)


# === 5. TRAIN MODEL ===
# A default-configured Ridge regressor wrapped for multi-target output.
print("ðŸ§  Training regression model...")
model = MultiOutputRegressor(estimator=Ridge())
# Left as the cell's last expression so the fitted estimator is displayed.
model.fit(X_train_embedded, y_train)




ðŸ”„ Embedding training data...


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 5998/5998 [24:32<00:00,  4.07it/s]  


ðŸ§  Training regression model...


0,1,2
,estimator,Ridge()
,n_jobs,

0,1,2
,alpha,1.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [7]:
# === 6. LOAD AND EMBED NEW ANOMALIES ===
df_new = pd.read_excel("test01wfi.xlsx")

# Same input format as the training data: three descriptive columns cast to
# str and joined with " | ".
new_systeme = df_new["Systeme"].astype(str)
new_description = df_new["Description"].astype(str)
new_equipment = df_new["Description de l'Ã©quipement"].astype(str)
df_new["text"] = new_systeme + " | " + new_description + " | " + new_equipment

X_new_text = df_new["text"].tolist()

# Only announces the step; the embedding calls themselves run in the next cell.
print("ðŸ”„ Embedding new anomaly data...")



ðŸ”„ Embedding new anomaly data...


In [8]:
# Embed every new anomaly text (one get_embedding call per row, in row order).
X_new_embedded = np.array([get_embedding(text) for text in tqdm(X_new_text)])


# === 7. PREDICT AND FORMAT RESULTS ===
print("ðŸ§® Predicting scores...")
preds = model.predict(X_new_embedded)
# Reuse df_new's index: the column-wise concat below aligns on index labels,
# so this guarantees each prediction row is paired with its own input row
# even if df_new does not carry the default 0..n-1 index.
preds_df = pd.DataFrame(
    preds,
    columns=["FiabilitÃ© IntÃ©gritÃ©", "DisponibiltÃ©", "Process Safety"],
    index=df_new.index,
)
# Round the continuous predictions to whole numbers and clamp them into 1..5.
preds_df = preds_df.round().clip(1, 5).astype(int)

# Combine with original input
df_result = pd.concat([df_new, preds_df], axis=1)

# === 8. SAVE TO FILE ===
output_file = "scored_anomalies_openai.xlsx"
df_result.to_excel(output_file, index=False)
print(f"âœ… Done! Results saved to {output_file}")
# display is not imported in the visible cells; presumably the IPython built-in.
display(df_result.head(10))

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 5/5 [00:02<00:00,  2.34it/s]

ðŸ§® Predicting scores...
âœ… Done! Results saved to scored_anomalies_openai.xlsx





Unnamed: 0,Num_equipement,Systeme,Description,Description de l'Ã©quipement,text,FiabilitÃ© IntÃ©gritÃ©,DisponibiltÃ©,Process Safety
0,3f68d058-b105-46f0-b137-6ce7695b85d1,e4f67f5f-be46-4ce0-ad14-68575d3cf0cd,apparition alarme 30bat10cf001 ( transfo princ...,TRANSFO PRINCIPAL,e4f67f5f-be46-4ce0-ad14-68575d3cf0cd | apparit...,2,4,1
1,3f68d058-b105-46f0-b137-6ce7695b85d1,e4f67f5f-be46-4ce0-ad14-68575d3cf0cd,SAFETY : Fuite par brideÂ radiateur du transfo...,TRANSFO PRINCIPAL,e4f67f5f-be46-4ce0-ad14-68575d3cf0cd | SAFETY ...,2,4,5
2,3f68d058-b105-46f0-b137-6ce7695b85d1,e4f67f5f-be46-4ce0-ad14-68575d3cf0cd,SAFETY : Fuite par brideÂ radiateur du transfo...,TRANSFO PRINCIPAL,e4f67f5f-be46-4ce0-ad14-68575d3cf0cd | SAFETY ...,2,4,5
3,c623e928-a195-4fea-9cc1-791e5e1f7988,52b20517-a1ba-4b78-8711-adebd336a6c2,SAFETY : PrÃ©sence des trace d'huile au niveau ...,Niveaustat cotÃ© Boite Ã Borne Alternateur,52b20517-a1ba-4b78-8711-adebd336a6c2 | SAFETY ...,2,5,4
4,7796ae6e-1bd3-499c-b920-7288f26e4f9a,52b20517-a1ba-4b78-8711-adebd336a6c2,SAFETY : dysfonctionnement de la rÃ©sistance de...,GROUPES ELECTROGENES U3,52b20517-a1ba-4b78-8711-adebd336a6c2 | SAFETY ...,2,3,3
