# 🧠 Quiet Team Pipeline
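A complete AI pipeline to transcribe, diarize, analyze, and graph multi-speaker audio.

The cells below expect `MP3_FILE`, `WAV_FILE`, `HUGGINGFACE_TOKEN`, `NEO4J_URI`, `NEO4J_USER`, and `NEO4J_PASSWORD` to be defined before they are run.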

## 🎧 Step 1: Convert MP3 to WAV

In [None]:
from pydub import AudioSegment

# Load the MP3, downmix it to a single channel, resample it to 16 kHz,
# and write the result out as a WAV file.
source_audio = AudioSegment.from_mp3(MP3_FILE)
mono_audio = source_audio.set_channels(1)
audio = mono_audio.set_frame_rate(16000)
audio.export(WAV_FILE, format="wav")

## 🗣️ Step 2: Speaker diarization with pyannote.audio

In [None]:
from pyannote.audio import Pipeline
from collections import defaultdict

import os

# Run the pretrained pyannote diarization pipeline over the WAV file.
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization@2.1",
    use_auth_token=HUGGINGFACE_TOKEN,
)
diarization = pipeline(WAV_FILE)

full_audio = AudioSegment.from_wav(WAV_FILE)

# Collect the (start, end) times of every turn under its speaker label.
speaker_segments = defaultdict(list)
for turn, _, speaker in diarization.itertracks(yield_label=True):
    time_span = (turn.start, turn.end)
    speaker_segments[speaker].append(time_span)

os.makedirs("diarized_speakers", exist_ok=True)

# Export one sample file per speaker, built from that speaker's first
# three turns only.
for speaker, segments in speaker_segments.items():
    out_path = f"diarized_speakers/{speaker}.wav"
    combined = AudioSegment.empty()
    for start, end in segments[:3]:
        # Turn times are multiplied by 1000 to index the audio in milliseconds.
        clip = full_audio[start * 1000: end * 1000]
        combined += clip
    combined.export(out_path, format="wav")
    print(f"Exported {out_path}")

## 🧍 Step 3: Match diarized voices with known samples

In [None]:
from resemblyzer import VoiceEncoder, preprocess_wav
import numpy as np

encoder = VoiceEncoder()

# Embed each known reference voice, keyed by the speaker's name.
reference_embeddings = {}
for name in ("Dylan", "Sarah"):
    reference_wav = preprocess_wav(f"{name}.wav")
    reference_embeddings[name] = encoder.embed_utterance(reference_wav)

# Embed the sample that was exported for the SPEAKER_00 label.
diarized_wav = preprocess_wav("diarized_speakers/SPEAKER_00.wav")
diarized_embedding = encoder.embed_utterance(diarized_wav)

# Keep the reference voice with the highest cosine similarity.
best_match, best_score = None, -1
for name, ref_emb in reference_embeddings.items():
    norm_product = np.linalg.norm(diarized_embedding) * np.linalg.norm(ref_emb)
    score = np.dot(diarized_embedding, ref_emb) / norm_product
    print(f"Similarity with {name}: {score:.3f}")
    if score > best_score:
        best_match, best_score = name, score

print(f"🧍 Most likely match: {best_match} (score: {best_score:.3f})")

## ✍️ Step 4: Transcribe with Whisper

In [None]:
import json
import os

import whisper

model = whisper.load_model("base")

transcription_data = []
# Maps a diarization label to "Speaker 1", "Speaker 2", ... numbered in the
# order labels are first looked up (i.e. first turn with non-empty text).
speaker_map = defaultdict(lambda: f"Speaker {len(speaker_map) + 1}")

for i, (turn, _, speaker_label) in enumerate(diarization.itertracks(yield_label=True)):
    # Turn times are multiplied by 1000 to index the audio in milliseconds.
    segment = full_audio[turn.start * 1000: turn.end * 1000]
    segment_path = f"temp_segment_{i}.wav"
    # export() hands back the output file handle; close it right away so the
    # temporary file can be removed below.
    segment.export(segment_path, format="wav").close()
    try:
        result = model.transcribe(segment_path, language="en")
    finally:
        # The per-turn WAV is only needed for this call; do not leave one
        # file per turn behind in the working directory.
        os.remove(segment_path)

    text = result["text"].strip()
    if not text:
        # Skip turns for which Whisper produced no text.
        continue

    speaker_name = speaker_map[speaker_label]
    transcription_data.append({
        "segment": i,
        "start": round(turn.start, 2),
        "end": round(turn.end, 2),
        "speaker": speaker_name,
        "text": text
    })

with open("transcription_with_speakers.json", "w", encoding="utf-8") as f:
    json.dump(transcription_data, f, ensure_ascii=False, indent=2)

print("✅ Transcription saved.")

## 🧠 Step 5: Build a semantic graph from transcription


In [None]:
# === STEP 5: Build a semantic graph from transcription ===
import json
from itertools import combinations

# `speaker_data` is not defined by any earlier cell shown in this notebook;
# load the segments that Step 4 saved so this cell can run on its own.
with open("transcription_with_speakers.json", encoding="utf-8") as f:
    speaker_data = json.load(f)

model = SentenceTransformer('all-MiniLM-L6-v2')
G = nx.Graph()
nodes = []

# One graph node per transcribed segment, carrying its speaker and text.
for i, segment in enumerate(speaker_data):
    node_id = f"{segment['speaker']}_msg{i}"
    text = segment['text']
    G.add_node(node_id, speaker=segment['speaker'], text=text)
    nodes.append((node_id, text))

# Encode every text in one batched call; skip the similarity computation
# entirely when there are no segments, since it cannot run on empty input.
embeddings = model.encode([text for _, text in nodes]) if nodes else []
sim_matrix = cosine_similarity(embeddings) if nodes else []
threshold = 0.6

# Connect every pair of segments whose cosine similarity exceeds the threshold.
for i, j in combinations(range(len(nodes)), 2):
    sim = sim_matrix[i][j]
    if sim > threshold:
        # Store a built-in float rather than a NumPy scalar so the weight
        # can be serialized by downstream consumers.
        G.add_edge(nodes[i][0], nodes[j][0], weight=float(sim))

## 🔗 Step 6: Push graph to Neo4j

In [None]:
# Create the Neo4j driver from the configured URI and credentials.
driver = GraphDatabase.driver(
    NEO4J_URI,
    auth=(NEO4J_USER, NEO4J_PASSWORD),
)

def create_graph_in_neo4j(tx, G):
    """Replace the database contents with the nodes and edges of ``G``.

    WARNING: the first statement deletes every node and relationship that
    the transaction can see before anything is written.

    Args:
        tx: Transaction-like object exposing ``run(query, **parameters)``.
        G: Graph-like object exposing ``nodes(data=True)`` and
            ``edges(data=True)``. Each node's data must have ``"text"`` and
            ``"speaker"`` keys; each edge's data must have a ``"weight"`` key.

    Raises:
        KeyError: If a node or edge is missing one of the keys above.
    """
    tx.run("MATCH (n) DETACH DELETE n")

    for node_id, data in G.nodes(data=True):
        tx.run("""
            CREATE (:Phrase {id: $id, text: $text, speaker: $speaker})
        """, id=node_id, text=data["text"], speaker=data["speaker"])

    for source, target, data in G.edges(data=True):
        # Weights taken from a similarity matrix are typically NumPy scalars;
        # send a built-in float so the parameter is a plain, supported type.
        weight = float(data["weight"])
        tx.run("""
            MATCH (a:Phrase {id: $id1})
            MATCH (b:Phrase {id: $id2})
            CREATE (a)-[:SIMILAR_TO {weight: $weight}]->(b)
        """, id1=source, id2=target, weight=weight)

# Push the graph to Neo4j through a managed write transaction.
with driver.session() as neo4j_session:
    neo4j_session.write_transaction(create_graph_in_neo4j, G)
    print("✅ Graph successfully written to Neo4j!")

## 💬 Step 7: Query the transcript with LangChain (FAISS retrieval over the graph's node texts)

In [None]:
# Wrap each graph node's text in a Document tagged with its speaker.
docs = []
for _, node_attrs in G.nodes(data=True):
    docs.append(
        Document(
            page_content=node_attrs["text"],
            metadata={"speaker": node_attrs["speaker"]},
        )
    )

# Index the documents in FAISS and expose the index as a retriever.
embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
vectorstore = FAISS.from_documents(docs, embedding_model)
retriever = vectorstore.as_retriever()

# Build a retrieval QA chain that also returns the source documents.
llm = OpenAI(temperature=0)
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
)

query = "How could national economic interests have been protected without triggering global inflation and collateral damage?"
result = qa_chain(query)
print("📣 Answer:", result["result"])