In [1]:
# library imports and loading dataset

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Read the incident RCA export (adjust the path if the CSV lives elsewhere)
# and turn the report timestamps into real datetimes so that later cells can
# compare them chronologically.
df = pd.read_csv("production_grade_incident_rcas.csv").assign(
    date_reported=lambda frame: pd.to_datetime(frame['date_reported'])
)

In [2]:
# Fit a TF-IDF model on the RCA summaries (English stop words are dropped).
# `X` ends up with one row per row of `df`, in the same order; summaries are
# cast to str before being handed to the vectorizer.

vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(raw_documents=df["rca_summary"].astype(str))

In [3]:
# Check if Similar Incident Happened Before

def check_prior_occurrence(summary, date_reported, top_n=3):
    """Rank earlier incidents by how closely their RCA summary matches ``summary``.

    The text is transformed with the notebook-level ``vectorizer`` and compared
    against every row of ``X`` using cosine similarity. ``X`` is assumed to be
    row-aligned with ``df`` (one score per incident).

    Parameters
    ----------
    summary : str
        RCA summary text to look up.
    date_reported : datetime-like
        Cut-off date; anything ``pd.to_datetime`` accepts. Only incidents
        reported strictly before it are considered.
    top_n : int, default 3
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df`` plus a ``similarity`` column, ordered from
        most to least similar. ``df`` itself is not modified.
    """
    query_vector = vectorizer.transform([summary])
    scores = cosine_similarity(query_vector, X).flatten()

    # `assign` returns a new frame, so the shared `df` stays untouched.
    scored = df.assign(similarity=scores)

    # Strict "<" keeps the queried incident itself (same timestamp) out.
    cutoff = pd.to_datetime(date_reported)
    earlier = scored.loc[scored['date_reported'] < cutoff]

    ranked = earlier.sort_values(by='similarity', ascending=False)
    return ranked.head(top_n)

In [6]:
# Usage: look for earlier incidents that resemble the first row of the dataset

sample_summary = df['rca_summary'].iloc[0]
sample_data = df['date_reported'].iloc[0]
check_prior_occurrence(summary=sample_summary, date_reported=sample_data)

Unnamed: 0,incident_id,date_reported,app_name,client_name,impacted_clients,reported_by,ttd,ttk,ttm,ttc,rca_summary,rca_tags,team_owning,severity,category,incident_type,similarity
10,9f0aeab4-4d94-4361-a072-08a74010bb78,2025-01-25 05:05:00,App16,Client2,"Client3, Client8",client,21,4,9,174,Kafka consumer in the pricing engine stalled d...,"Kafka, Schema, Consumer",Team5,1,Application,Stale Data Pipeline,0.835969
38,37ecc1c8-34ab-4adf-ae8a-a3cf29d83eba,2025-03-11 21:02:00,App8,Client9,"Client8, Client2, Client6, Client10",client,24,10,30,180,Kafka consumer in the pricing engine stalled d...,"Kafka, Schema, Consumer",Team6,1,Application,Stale Data Pipeline,0.835969
28,db708472-df69-445a-bafc-fe40832adba9,2025-03-01 03:39:00,App9,Client2,Client7,client,22,3,10,59,Kafka consumer in the pricing engine stalled d...,"Kafka, Schema, Consumer",Team12,1,Application,Stale Data Pipeline,0.835969


In [7]:
# Search RCAs by Keyword (e.g., schema, certificate, OAuth)

def find_rcas_by_keyword(keyword, regex=True):
    """Return the incidents that mention ``keyword`` in any searchable text column.

    The search is case-insensitive and covers the ``rca_summary``,
    ``rca_tags`` and ``incident_type`` columns of the notebook-level ``df``.
    A row is returned when at least one of those columns contains the keyword.

    Parameters
    ----------
    keyword : str
        Text to look for. It is lower-cased before matching.
    regex : bool, default True
        When True (the historical behaviour) ``keyword`` is interpreted as a
        regular expression. Pass ``regex=False`` to match it literally, which
        is what you want for keywords containing characters such as ``+``,
        ``.`` or ``(``.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df``, in their original order.
    """
    needle = keyword.lower()

    # Start from "nothing matches" and OR in one column at a time.
    mask = pd.Series(False, index=df.index)
    for column in ('rca_summary', 'rca_tags', 'incident_type'):
        # na=False: a missing value in a column counts as "no match" instead
        # of leaking NaN into the boolean mask.
        mask |= df[column].str.lower().str.contains(needle, regex=regex, na=False)

    return df[mask]

In [14]:
# Usage: the search lower-cases both sides, so 'Kafka' also matches 'kafka'
find_rcas_by_keyword(keyword='Kafka')

Unnamed: 0,incident_id,date_reported,app_name,client_name,impacted_clients,reported_by,ttd,ttk,ttm,ttc,rca_summary,rca_tags,team_owning,severity,category,incident_type
0,ad83d6d9-eae0-481c-99de-e13ca24e1594,2025-03-27 18:50:00,App29,Client1,"Client1, Client3, Client8",client,9,3,7,92,Kafka consumer in the pricing engine stalled d...,"Kafka, Schema, Consumer",Team7,1,Application,Stale Data Pipeline
10,9f0aeab4-4d94-4361-a072-08a74010bb78,2025-01-25 05:05:00,App16,Client2,"Client3, Client8",client,21,4,9,174,Kafka consumer in the pricing engine stalled d...,"Kafka, Schema, Consumer",Team5,1,Application,Stale Data Pipeline
17,82a15bc0-64d1-452c-bcdc-27248d5b4a77,2025-05-03 10:28:00,App25,Client8,"Client6, Client5, Client3",client,35,13,48,36,Kafka consumer in the pricing engine stalled d...,"Kafka, Schema, Consumer",Team13,1,Application,Stale Data Pipeline
21,c3636fbd-5282-4d7d-b22a-4d8c9ff26b2b,2025-01-10 10:11:00,App20,Client10,Client1,client,3,11,69,158,Kafka consumer in the pricing engine stalled d...,"Kafka, Schema, Consumer",Team4,1,Application,Stale Data Pipeline
28,db708472-df69-445a-bafc-fe40832adba9,2025-03-01 03:39:00,App9,Client2,Client7,client,22,3,10,59,Kafka consumer in the pricing engine stalled d...,"Kafka, Schema, Consumer",Team12,1,Application,Stale Data Pipeline
31,779793e2-94cd-4ec8-b1ca-a69ba9d1e596,2025-05-18 13:14:00,App26,Client4,"Client5, Client8",client,37,19,27,38,Kafka consumer in the pricing engine stalled d...,"Kafka, Schema, Consumer",Team10,1,Application,Stale Data Pipeline
33,2b80b764-d62b-4a04-88ab-e4c896254c4e,2025-05-10 11:01:00,App2,Client5,"Client3, Client9",client,6,19,33,80,Kafka consumer in the pricing engine stalled d...,"Kafka, Schema, Consumer",Team3,1,Application,Stale Data Pipeline
38,37ecc1c8-34ab-4adf-ae8a-a3cf29d83eba,2025-03-11 21:02:00,App8,Client9,"Client8, Client2, Client6, Client10",client,24,10,30,180,Kafka consumer in the pricing engine stalled d...,"Kafka, Schema, Consumer",Team6,1,Application,Stale Data Pipeline
44,352d8341-c853-41b9-a1a9-76731a9c421b,2025-02-05 12:58:00,App7,Client10,"Client6, Client9, Client7, Client1",client,21,10,47,29,Kafka consumer in the pricing engine stalled d...,"Kafka, Schema, Consumer",Team1,1,Application,Stale Data Pipeline
49,82d63060-e0f5-4ab7-a41b-5cfd8e5f8730,2025-05-24 12:44:00,App14,Client10,"Client1, Client4",client,33,9,56,176,Kafka consumer in the pricing engine stalled d...,"Kafka, Schema, Consumer",Team11,1,Application,Stale Data Pipeline


In [11]:
# Frequency of a Keyword by Team

def frequency_by_team_for_keyword(keyword):
    """Count, per owning team, the incidents that mention ``keyword``.

    Matching follows ``find_rcas_by_keyword``: a case-insensitive search over
    the ``rca_summary``, ``rca_tags`` and ``incident_type`` columns.

    Parameters
    ----------
    keyword : str
        Text to look for.

    Returns
    -------
    pandas.Series
        Number of matching incidents indexed by ``team_owning``, as produced
        by ``value_counts`` (most frequent team first).
    """
    # Delegate the filtering to the shared search helper so both functions
    # always agree on what counts as a match.
    matched = find_rcas_by_keyword(keyword)
    return matched['team_owning'].value_counts()

In [12]:
# Usage: number of certificate-related incidents per owning team

frequency_by_team_for_keyword(keyword='certificate')

team_owning
Team13    3
Team5     2
Team4     1
Team7     1
Team8     1
Team12    1
Team2     1
Team6     1
Team3     1
Team10    1
Name: count, dtype: int64