**Case Study: Multi-Modal Anomaly Detection in Oil Rig Operations Using Time 
Series and Text Logs**

Detects anomalies in sensor data (like pressure, temperature, vibration).
Correlates anomalies with operator logs using NLP.
Uses GenAI to summarize insights (e.g., root causes).
Has a simple deployment (Streamlit / Flask / CLI).

In [134]:
#Creating a Sensor Time series Data

In [135]:
import pandas as pd
import numpy as np
import random
np.random.seed(42)

In [136]:
#Setting Equipment Types

eqp_list=['Pump_1','Pump_2','Pump_3']

start_date=pd.to_datetime("2025-03-05")
end_date=pd.to_datetime("2025-07-05")

In [137]:
print(start_date)
print(end_date)

2025-03-05 00:00:00
2025-07-05 00:00:00


In [138]:
data_date_range=pd.date_range(start=start_date,end=end_date,freq='H')

  data_date_range=pd.date_range(start=start_date,end=end_date,freq='H')


In [139]:
data_date_range

DatetimeIndex(['2025-03-05 00:00:00', '2025-03-05 01:00:00',
               '2025-03-05 02:00:00', '2025-03-05 03:00:00',
               '2025-03-05 04:00:00', '2025-03-05 05:00:00',
               '2025-03-05 06:00:00', '2025-03-05 07:00:00',
               '2025-03-05 08:00:00', '2025-03-05 09:00:00',
               ...
               '2025-07-04 15:00:00', '2025-07-04 16:00:00',
               '2025-07-04 17:00:00', '2025-07-04 18:00:00',
               '2025-07-04 19:00:00', '2025-07-04 20:00:00',
               '2025-07-04 21:00:00', '2025-07-04 22:00:00',
               '2025-07-04 23:00:00', '2025-07-05 00:00:00'],
              dtype='datetime64[ns]', length=2929, freq='h')

In [140]:
len(data_date_range)

2929

In [141]:
#Generating Sensor Data 

In [142]:
def generating_sensor_data(equipment):
    df_sensor=pd.DataFrame({'timestamp':data_date_range})
    df_sensor['equipment']=equipment
    df_sensor['temperature']=np.random.normal(loc=60,scale=5,size=len(data_date_range))
    df_sensor['pressure']=np.random.normal(loc=30,scale=3,size=len(data_date_range))
    df_sensor['vibration']=np.random.normal(loc=10,scale=1,size=len(data_date_range))
    added_noise_row=np.random.choice(len(df_sensor),size=int(0.01*len(df_sensor)),replace=False)
    df_sensor.loc[added_noise_row,'pressure']+=np.random.normal(20,10,size=len(added_noise_row))
    added_anaomoly_row=np.random.choice(len(df_sensor),size=int(0.05*len(df_sensor)),replace=False)
    df_sensor.loc[added_anaomoly_row,'temperature']+=np.random.choice([20,-10],size=len(added_anaomoly_row))
    df_sensor.loc[added_anaomoly_row,'vibration']+=np.random.choice([1,-1],size=len(added_anaomoly_row))
    missing_row=np.random.choice(len(df_sensor),size=int(0.01*len(df_sensor)),replace=False)
    df_sensor.loc[missing_row,'temperature']=np.nan
    return df_sensor

sensor_data=pd.concat([generating_sensor_data(eq) for eq in eqp_list])
sensor_data.reset_index(drop=True,inplace=True)

In [143]:
sensor_data

Unnamed: 0,timestamp,equipment,temperature,pressure,vibration
0,2025-03-05 00:00:00,Pump_1,62.483571,32.956349,9.699020
1,2025-03-05 01:00:00,Pump_1,59.308678,32.607464,9.368563
2,2025-03-05 02:00:00,Pump_1,63.238443,28.633381,11.243406
3,2025-03-05 03:00:00,Pump_1,67.615149,27.332278,11.228030
4,2025-03-05 04:00:00,Pump_1,58.829233,32.865902,9.665197
...,...,...,...,...,...
8782,2025-07-04 20:00:00,Pump_3,64.269385,31.323782,10.117579
8783,2025-07-04 21:00:00,Pump_3,73.177892,32.154338,10.453580
8784,2025-07-04 22:00:00,Pump_3,67.557279,22.349323,10.527463
8785,2025-07-04 23:00:00,Pump_3,62.876377,28.701840,7.893789


In [144]:
sensor_data.isnull().sum()

timestamp       0
equipment       0
temperature    87
pressure        0
vibration       0
dtype: int64

In [145]:
#Preprocessing and Anaomoly Detection

In [146]:
sensor_data.sort_values(by='timestamp',inplace=True)

In [147]:
sensor_data

Unnamed: 0,timestamp,equipment,temperature,pressure,vibration
0,2025-03-05 00:00:00,Pump_1,62.483571,32.956349,9.699020
2929,2025-03-05 00:00:00,Pump_2,54.022097,36.464754,10.823728
5858,2025-03-05 00:00:00,Pump_3,57.611866,32.831764,11.951233
1,2025-03-05 01:00:00,Pump_1,59.308678,32.607464,9.368563
2930,2025-03-05 01:00:00,Pump_2,66.670964,31.732248,8.551223
...,...,...,...,...,...
2927,2025-07-04 23:00:00,Pump_1,61.073566,23.329277,10.225257
5856,2025-07-04 23:00:00,Pump_2,61.385056,33.010095,11.172687
5857,2025-07-05 00:00:00,Pump_2,64.040531,30.559999,9.622943
2928,2025-07-05 00:00:00,Pump_1,67.890590,28.445793,10.629361


In [148]:
sensor_data[['temperature','pressure','vibration']]=sensor_data[['temperature','pressure','vibration']].ffill().bfill()

In [149]:
sensor_data.isnull().sum()

timestamp      0
equipment      0
temperature    0
pressure       0
vibration      0
dtype: int64

In [150]:
sensor_data

Unnamed: 0,timestamp,equipment,temperature,pressure,vibration
0,2025-03-05 00:00:00,Pump_1,62.483571,32.956349,9.699020
2929,2025-03-05 00:00:00,Pump_2,54.022097,36.464754,10.823728
5858,2025-03-05 00:00:00,Pump_3,57.611866,32.831764,11.951233
1,2025-03-05 01:00:00,Pump_1,59.308678,32.607464,9.368563
2930,2025-03-05 01:00:00,Pump_2,66.670964,31.732248,8.551223
...,...,...,...,...,...
2927,2025-07-04 23:00:00,Pump_1,61.073566,23.329277,10.225257
5856,2025-07-04 23:00:00,Pump_2,61.385056,33.010095,11.172687
5857,2025-07-05 00:00:00,Pump_2,64.040531,30.559999,9.622943
2928,2025-07-05 00:00:00,Pump_1,67.890590,28.445793,10.629361


In [151]:
# Scailing of the Data

In [152]:
from sklearn.preprocessing import StandardScaler
sscaler=StandardScaler()
sensor_data[['scaled_temp','scaled_pressure','scaled_vibration']]=sscaler.fit_transform(sensor_data[['temperature','pressure','vibration']])

In [153]:
sensor_data

Unnamed: 0,timestamp,equipment,temperature,pressure,vibration,scaled_temp,scaled_pressure,scaled_vibration
0,2025-03-05 00:00:00,Pump_1,62.483571,32.956349,9.699020,0.379944,0.746796,-0.304795
2929,2025-03-05 00:00:00,Pump_2,54.022097,36.464754,10.823728,-1.031213,1.680681,0.781474
5858,2025-03-05 00:00:00,Pump_3,57.611866,32.831764,11.951233,-0.432531,0.713634,1.870445
1,2025-03-05 01:00:00,Pump_1,59.308678,32.607464,9.368563,-0.149547,0.653928,-0.623958
2930,2025-03-05 01:00:00,Pump_2,66.670964,31.732248,8.551223,1.078295,0.420959,-1.413364
...,...,...,...,...,...,...,...,...
2927,2025-07-04 23:00:00,Pump_1,61.073566,23.329277,10.225257,0.144792,-1.815786,0.203457
5856,2025-07-04 23:00:00,Pump_2,61.385056,33.010095,11.172687,0.196740,0.761103,1.118507
5857,2025-07-05 00:00:00,Pump_2,64.040531,30.559999,9.622943,0.639606,0.108924,-0.378272
2928,2025-07-05 00:00:00,Pump_1,67.890590,28.445793,10.629361,1.281697,-0.453846,0.593750


In [154]:
#Isolation Forest (to detect Anaomolies)

In [155]:
from sklearn.ensemble import IsolationForest

features=sensor_data[['scaled_temp','scaled_pressure','scaled_vibration']]

model=IsolationForest(contamination=0.01,random_state=42)
sensor_data['anamoly_score']=model.fit_predict(features)

In [156]:
sensor_data

Unnamed: 0,timestamp,equipment,temperature,pressure,vibration,scaled_temp,scaled_pressure,scaled_vibration,anamoly_score
0,2025-03-05 00:00:00,Pump_1,62.483571,32.956349,9.699020,0.379944,0.746796,-0.304795,1
2929,2025-03-05 00:00:00,Pump_2,54.022097,36.464754,10.823728,-1.031213,1.680681,0.781474,1
5858,2025-03-05 00:00:00,Pump_3,57.611866,32.831764,11.951233,-0.432531,0.713634,1.870445,1
1,2025-03-05 01:00:00,Pump_1,59.308678,32.607464,9.368563,-0.149547,0.653928,-0.623958,1
2930,2025-03-05 01:00:00,Pump_2,66.670964,31.732248,8.551223,1.078295,0.420959,-1.413364,1
...,...,...,...,...,...,...,...,...,...
2927,2025-07-04 23:00:00,Pump_1,61.073566,23.329277,10.225257,0.144792,-1.815786,0.203457,1
5856,2025-07-04 23:00:00,Pump_2,61.385056,33.010095,11.172687,0.196740,0.761103,1.118507,1
5857,2025-07-05 00:00:00,Pump_2,64.040531,30.559999,9.622943,0.639606,0.108924,-0.378272,1
2928,2025-07-05 00:00:00,Pump_1,67.890590,28.445793,10.629361,1.281697,-0.453846,0.593750,1


In [157]:
sensor_data['anamoly_score'].value_counts()

anamoly_score
 1    8699
-1      88
Name: count, dtype: int64

In [158]:
sensor_data['anamoly_labelling']=sensor_data['anamoly_score'].map({1:'Normal',-1:'Anamoly'})

In [159]:
sensor_data

Unnamed: 0,timestamp,equipment,temperature,pressure,vibration,scaled_temp,scaled_pressure,scaled_vibration,anamoly_score,anamoly_labelling
0,2025-03-05 00:00:00,Pump_1,62.483571,32.956349,9.699020,0.379944,0.746796,-0.304795,1,Normal
2929,2025-03-05 00:00:00,Pump_2,54.022097,36.464754,10.823728,-1.031213,1.680681,0.781474,1,Normal
5858,2025-03-05 00:00:00,Pump_3,57.611866,32.831764,11.951233,-0.432531,0.713634,1.870445,1,Normal
1,2025-03-05 01:00:00,Pump_1,59.308678,32.607464,9.368563,-0.149547,0.653928,-0.623958,1,Normal
2930,2025-03-05 01:00:00,Pump_2,66.670964,31.732248,8.551223,1.078295,0.420959,-1.413364,1,Normal
...,...,...,...,...,...,...,...,...,...,...
2927,2025-07-04 23:00:00,Pump_1,61.073566,23.329277,10.225257,0.144792,-1.815786,0.203457,1,Normal
5856,2025-07-04 23:00:00,Pump_2,61.385056,33.010095,11.172687,0.196740,0.761103,1.118507,1,Normal
5857,2025-07-05 00:00:00,Pump_2,64.040531,30.559999,9.622943,0.639606,0.108924,-0.378272,1,Normal
2928,2025-07-05 00:00:00,Pump_1,67.890590,28.445793,10.629361,1.281697,-0.453846,0.593750,1,Normal


In [160]:
#sensor_data.drop(['temperature','pressure','vibration'],axis=1,inplace=True)

In [161]:
sensor_data

Unnamed: 0,timestamp,equipment,temperature,pressure,vibration,scaled_temp,scaled_pressure,scaled_vibration,anamoly_score,anamoly_labelling
0,2025-03-05 00:00:00,Pump_1,62.483571,32.956349,9.699020,0.379944,0.746796,-0.304795,1,Normal
2929,2025-03-05 00:00:00,Pump_2,54.022097,36.464754,10.823728,-1.031213,1.680681,0.781474,1,Normal
5858,2025-03-05 00:00:00,Pump_3,57.611866,32.831764,11.951233,-0.432531,0.713634,1.870445,1,Normal
1,2025-03-05 01:00:00,Pump_1,59.308678,32.607464,9.368563,-0.149547,0.653928,-0.623958,1,Normal
2930,2025-03-05 01:00:00,Pump_2,66.670964,31.732248,8.551223,1.078295,0.420959,-1.413364,1,Normal
...,...,...,...,...,...,...,...,...,...,...
2927,2025-07-04 23:00:00,Pump_1,61.073566,23.329277,10.225257,0.144792,-1.815786,0.203457,1,Normal
5856,2025-07-04 23:00:00,Pump_2,61.385056,33.010095,11.172687,0.196740,0.761103,1.118507,1,Normal
5857,2025-07-05 00:00:00,Pump_2,64.040531,30.559999,9.622943,0.639606,0.108924,-0.378272,1,Normal
2928,2025-07-05 00:00:00,Pump_1,67.890590,28.445793,10.629361,1.281697,-0.453846,0.593750,1,Normal


In [162]:
sensor_data[sensor_data['anamoly_labelling']=='Anamoly']

Unnamed: 0,timestamp,equipment,temperature,pressure,vibration,scaled_temp,scaled_pressure,scaled_vibration,anamoly_score,anamoly_labelling
5870,2025-03-05 12:00:00,Pump_3,82.013841,38.537889,11.853670,3.637095,2.232519,1.776216,-1,Anamoly
35,2025-03-06 11:00:00,Pump_1,53.895782,67.009231,8.479713,-1.052279,9.811165,-1.482430,-1,Anamoly
2977,2025-03-07 00:00:00,Pump_2,61.107522,55.327855,10.280432,0.150455,6.701757,0.256746,-1,Anamoly
3004,2025-03-08 03:00:00,Pump_2,57.345288,62.551585,10.509483,-0.476990,8.624606,0.477969,-1,Anamoly
3093,2025-03-11 20:00:00,Pump_2,67.276777,60.711829,12.314839,1.179329,8.134890,2.221624,-1,Anamoly
...,...,...,...,...,...,...,...,...,...,...
8655,2025-06-29 13:00:00,Pump_3,81.315054,30.371542,12.929308,3.520555,0.058760,2.815092,-1,Anamoly
2824,2025-06-30 16:00:00,Pump_1,79.737978,25.932089,7.312721,3.257539,-1.122957,-2.609538,-1,Anamoly
8714,2025-07-02 00:00:00,Pump_3,60.936358,58.098349,9.194521,0.121909,7.439221,-0.792052,-1,Anamoly
2880,2025-07-03 00:00:00,Pump_1,48.808844,56.946013,11.478121,-1.900651,7.132486,1.413502,-1,Anamoly


In [163]:
#NLP-based Correlation between Sensor Anomalies and Operator Logs

In [164]:
#Getting Sensor Anomolies

In [165]:
sensor_data[sensor_data['anamoly_labelling']=='Anamoly'][['timestamp','equipment','temperature','pressure','vibration']]

Unnamed: 0,timestamp,equipment,temperature,pressure,vibration
5870,2025-03-05 12:00:00,Pump_3,82.013841,38.537889,11.853670
35,2025-03-06 11:00:00,Pump_1,53.895782,67.009231,8.479713
2977,2025-03-07 00:00:00,Pump_2,61.107522,55.327855,10.280432
3004,2025-03-08 03:00:00,Pump_2,57.345288,62.551585,10.509483
3093,2025-03-11 20:00:00,Pump_2,67.276777,60.711829,12.314839
...,...,...,...,...,...
8655,2025-06-29 13:00:00,Pump_3,81.315054,30.371542,12.929308
2824,2025-06-30 16:00:00,Pump_1,79.737978,25.932089,7.312721
8714,2025-07-02 00:00:00,Pump_3,60.936358,58.098349,9.194521
2880,2025-07-03 00:00:00,Pump_1,48.808844,56.946013,11.478121


In [166]:
anamolies=sensor_data[sensor_data['anamoly_labelling']=='Anamoly'][['timestamp','equipment','temperature','pressure','vibration']]

In [167]:
anamolies

Unnamed: 0,timestamp,equipment,temperature,pressure,vibration
5870,2025-03-05 12:00:00,Pump_3,82.013841,38.537889,11.853670
35,2025-03-06 11:00:00,Pump_1,53.895782,67.009231,8.479713
2977,2025-03-07 00:00:00,Pump_2,61.107522,55.327855,10.280432
3004,2025-03-08 03:00:00,Pump_2,57.345288,62.551585,10.509483
3093,2025-03-11 20:00:00,Pump_2,67.276777,60.711829,12.314839
...,...,...,...,...,...
8655,2025-06-29 13:00:00,Pump_3,81.315054,30.371542,12.929308
2824,2025-06-30 16:00:00,Pump_1,79.737978,25.932089,7.312721
8714,2025-07-02 00:00:00,Pump_3,60.936358,58.098349,9.194521
2880,2025-07-03 00:00:00,Pump_1,48.808844,56.946013,11.478121


In [168]:
anamolies.reset_index(drop=True,inplace=True)

In [169]:
anamolies

Unnamed: 0,timestamp,equipment,temperature,pressure,vibration
0,2025-03-05 12:00:00,Pump_3,82.013841,38.537889,11.853670
1,2025-03-06 11:00:00,Pump_1,53.895782,67.009231,8.479713
2,2025-03-07 00:00:00,Pump_2,61.107522,55.327855,10.280432
3,2025-03-08 03:00:00,Pump_2,57.345288,62.551585,10.509483
4,2025-03-11 20:00:00,Pump_2,67.276777,60.711829,12.314839
...,...,...,...,...,...
83,2025-06-29 13:00:00,Pump_3,81.315054,30.371542,12.929308
84,2025-06-30 16:00:00,Pump_1,79.737978,25.932089,7.312721
85,2025-07-02 00:00:00,Pump_3,60.936358,58.098349,9.194521
86,2025-07-03 00:00:00,Pump_1,48.808844,56.946013,11.478121


In [170]:
# Extract timestamps of anomaly rows to help generate meaningful logs
anomaly_times = sensor_data[sensor_data['anamoly_labelling'] == 'Anamoly']['timestamp'].tolist()
log_time = np.random.choice(sensor_data['timestamp'])


In [171]:
anomaly_times

[Timestamp('2025-03-05 12:00:00'),
 Timestamp('2025-03-06 11:00:00'),
 Timestamp('2025-03-07 00:00:00'),
 Timestamp('2025-03-08 03:00:00'),
 Timestamp('2025-03-11 20:00:00'),
 Timestamp('2025-03-12 20:00:00'),
 Timestamp('2025-03-15 08:00:00'),
 Timestamp('2025-03-16 11:00:00'),
 Timestamp('2025-03-18 14:00:00'),
 Timestamp('2025-03-19 14:00:00'),
 Timestamp('2025-03-20 17:00:00'),
 Timestamp('2025-03-23 22:00:00'),
 Timestamp('2025-03-25 12:00:00'),
 Timestamp('2025-03-25 21:00:00'),
 Timestamp('2025-03-26 02:00:00'),
 Timestamp('2025-03-27 07:00:00'),
 Timestamp('2025-03-28 05:00:00'),
 Timestamp('2025-03-28 14:00:00'),
 Timestamp('2025-03-30 14:00:00'),
 Timestamp('2025-04-01 23:00:00'),
 Timestamp('2025-04-02 17:00:00'),
 Timestamp('2025-04-04 17:00:00'),
 Timestamp('2025-04-05 19:00:00'),
 Timestamp('2025-04-07 20:00:00'),
 Timestamp('2025-04-07 22:00:00'),
 Timestamp('2025-04-09 16:00:00'),
 Timestamp('2025-04-09 16:00:00'),
 Timestamp('2025-04-10 04:00:00'),
 Timestamp('2025-04-

In [172]:
len(anomaly_times)

88

In [173]:
#Creating Operator Logs

log_msgs = [
    "High temperature observed on {}",
    "Temperature spike detected in {}",
    "Unusual temperature seen on {}",
    "High pressure alert for {}",
    "Pressure exceeds threshold in {}",
    "Worrying Vibration in {}",
    "Vibration spike detected in {}",
    "Unusual vibration noted in {}",
    "Low pressure in {}",
    "Pressure fluctuation in {}",
    "Low temperature in {}",
    "Sudden drop in temperature at {}",
    "Maintenance required for {}",
    "Operator noticed issue with {}"
]


In [174]:
logdata = []
for anomaly in anamolies.to_dict('records'):
    for _ in range(3):  # Generate 3 logs per anomaly
        log_time = anomaly['timestamp'] + pd.Timedelta(hours=random.randint(-6, 6))
        eqp = anomaly['equipment']
        operator_msg = random.choice(log_msgs).format(eqp)
        logdata.append({'timestamp': log_time, 'equipment': eqp, 'log_msg': operator_msg})

log_df = pd.DataFrame(logdata)


In [175]:
log_df.head()

Unnamed: 0,timestamp,equipment,log_msg
0,2025-03-05 17:00:00,Pump_3,Sudden drop in temperature at Pump_3
1,2025-03-05 10:00:00,Pump_3,Temperature spike detected in Pump_3
2,2025-03-05 09:00:00,Pump_3,Operator noticed issue with Pump_3
3,2025-03-06 12:00:00,Pump_1,Low temperature in Pump_1
4,2025-03-06 10:00:00,Pump_1,Temperature spike detected in Pump_1


In [176]:
log_df.shape

(264, 3)

In [177]:
#!pip install -q sentence-transformers
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load a lightweight transformer
model = SentenceTransformer('all-MiniLM-L6-v2')


In [178]:
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [179]:
# Create embeddings for operator log messag
log_df['embedding'] = log_df['log_msg'].apply(lambda x: model.encode(x))

In [180]:
log_df

Unnamed: 0,timestamp,equipment,log_msg,embedding
0,2025-03-05 17:00:00,Pump_3,Sudden drop in temperature at Pump_3,"[-0.051135138, -0.04821866, 0.030568942, 0.035..."
1,2025-03-05 10:00:00,Pump_3,Temperature spike detected in Pump_3,"[-0.054569207, -0.073973976, -0.016031677, 0.0..."
2,2025-03-05 09:00:00,Pump_3,Operator noticed issue with Pump_3,"[-0.029597133, -0.01674145, 0.007828624, -0.03..."
3,2025-03-06 12:00:00,Pump_1,Low temperature in Pump_1,"[-0.035165083, -0.02050417, -0.021830127, 0.01..."
4,2025-03-06 10:00:00,Pump_1,Temperature spike detected in Pump_1,"[-0.06608388, -0.062881485, 0.008416745, 0.063..."
...,...,...,...,...
259,2025-07-03 02:00:00,Pump_1,Unusual temperature seen on Pump_1,"[-0.08110145, -0.048349418, 0.005528392, 0.056..."
260,2025-07-02 19:00:00,Pump_1,Pressure fluctuation in Pump_1,"[-0.06503875, -0.038051374, 0.016781477, -0.01..."
261,2025-07-04 11:00:00,Pump_3,High temperature observed on Pump_3,"[-0.059865147, -0.06306548, -0.054698203, 0.00..."
262,2025-07-04 08:00:00,Pump_3,Unusual temperature seen on Pump_3,"[-0.067855746, -0.066485904, -0.027610615, 0.0..."


In [181]:
def match_log(row):
    # Convert anomaly sensor readings into a short descriptive sentence
    anomaly_text = f"Temperature {row.temperature}, Pressure {row.pressure}, Vibration {row.vibration}"
    anomaly_vec = model.encode(anomaly_text)

    # Filter logs with same equipment and within ±6 hours
    nearby_logs = log_df[
        (log_df['equipment'] == row.equipment) &
        (abs(log_df['timestamp'] - row.timestamp) <= pd.Timedelta(hours=6))
    ].copy()

    if not nearby_logs.empty:
        # Calculate cosine similarity between anomaly and each log message
        nearby_logs['similarity'] = cosine_similarity([anomaly_vec], list(nearby_logs['embedding']))[0]
        top = nearby_logs.sort_values(by='similarity', ascending=False).head(1)
    else:
        # Fallback: if no nearby logs found, take the most recent one for same equipment
        fallback = log_df[log_df['equipment'] == row.equipment].sort_values(by='timestamp', ascending=False).head(1)
        if fallback.empty:
            return None  # No match at all
        fallback['similarity'] = 0.0  # fallback match is arbitrary
        top = fallback

    return {
        'anomaly_time': row.timestamp,
        'equipment': row.equipment,
        'log_time': top.iloc[0]['timestamp'],
        'log_message': top.iloc[0]['log_msg'],
        'similarity': top.iloc[0]['similarity']
    }


In [182]:
# Apply the matching function to all anomalies
matched = anamolies.apply(match_log, axis=1).dropna()

# Convert list of dictionaries to DataFrame
matched_df = pd.DataFrame(matched.tolist())
matched_df[['anomaly_time', 'equipment', 'log_time', 'log_message', 'similarity']].head(20)



Unnamed: 0,anomaly_time,equipment,log_time,log_message,similarity
0,2025-03-05 12:00:00,Pump_3,2025-03-05 17:00:00,Sudden drop in temperature at Pump_3,0.290366
1,2025-03-06 11:00:00,Pump_1,2025-03-06 16:00:00,Unusual vibration noted in Pump_1,0.379861
2,2025-03-07 00:00:00,Pump_2,2025-03-07 03:00:00,Unusual temperature seen on Pump_2,0.363343
3,2025-03-08 03:00:00,Pump_2,2025-03-07 21:00:00,Vibration spike detected in Pump_2,0.252364
4,2025-03-11 20:00:00,Pump_2,2025-03-11 17:00:00,Vibration spike detected in Pump_2,0.220833
5,2025-03-12 20:00:00,Pump_3,2025-03-13 00:00:00,Low temperature in Pump_3,0.351828
6,2025-03-15 08:00:00,Pump_2,2025-03-15 05:00:00,Low temperature in Pump_2,0.309209
7,2025-03-16 11:00:00,Pump_3,2025-03-16 13:00:00,Unusual temperature seen on Pump_3,0.372462
8,2025-03-18 14:00:00,Pump_1,2025-03-18 18:00:00,Vibration spike detected in Pump_1,0.27759
9,2025-03-19 14:00:00,Pump_1,2025-03-19 16:00:00,Sudden drop in temperature at Pump_1,0.252895


In [183]:
print("Anomalies count:", len(anamolies))
print("Logs count:", len(log_df))

Anomalies count: 88
Logs count: 264


In [185]:
#Initialize Open Ai
from dotenv import load_dotenv

In [186]:
from dotenv import load_dotenv
load_dotenv()


True

In [None]:
# Required Imports

import pandas as pd
import os
from langchain_core.messages import SystemMessage, HumanMessage, ToolMessage


# Instantiate the Chat Model
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.5)

# ✅ Define LLM summarization function
def gpt_summary(msg):
    try:
        prompt = f"Summarize the following operator log in one short sentence:\n'{msg}'"
        response = llm.invoke([HumanMessage(content=prompt)])
        return response.content.strip()
    except Exception as e:
        print("Error:", e)
        return "Summary failed"

# ✅ Apply summarization on log messages
matched_df['summary'] = matched_df['log_message'].apply(gpt_summary)

# ✅ Show summarized results
matched_df[['anomaly_time', 'equipment', 'log_message', 'summary']].head(10)


Unnamed: 0,anomaly_time,equipment,log_message,summary
0,2025-03-05 12:00:00,Pump_3,Sudden drop in temperature at Pump_3,Pump_3 experienced a sudden decrease in temper...
1,2025-03-06 11:00:00,Pump_1,Unusual vibration noted in Pump_1,Pump_1 is experiencing unusual vibrations.
2,2025-03-07 00:00:00,Pump_2,Unusual temperature seen on Pump_2,Pump_2 is experiencing an abnormal temperature...
3,2025-03-08 03:00:00,Pump_2,Vibration spike detected in Pump_2,An abnormal vibration spike was detected in Pu...
4,2025-03-11 20:00:00,Pump_2,Vibration spike detected in Pump_2,An abnormal increase in vibration was detected...
5,2025-03-12 20:00:00,Pump_3,Low temperature in Pump_3,Pump_3 is experiencing low temperature.
6,2025-03-15 08:00:00,Pump_2,Low temperature in Pump_2,Pump_2 is experiencing low temperature.
7,2025-03-16 11:00:00,Pump_3,Unusual temperature seen on Pump_3,Anomalous temperature detected on Pump_3.
8,2025-03-18 14:00:00,Pump_1,Vibration spike detected in Pump_1,An abnormal increase in vibration was detected...
9,2025-03-19 14:00:00,Pump_1,Sudden drop in temperature at Pump_1,There was a sudden decrease in temperature at ...


In [188]:
matched_df.to_csv("matched_anomalies_with_logs.csv", index=False)