<a href="https://colab.research.google.com/github/mmtaha/Projeto_Python_Transcri-o_de_Conversa_Whatssapp/blob/main/projeto_whatsapp_p1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os, re, zipfile
import pandas as pd
from datetime import datetime
from docx import Document
from pydub import AudioSegment
import speech_recognition as sr

# --- Step 1: Read the DOCX document that holds the conversation timeline ---
# NOTE: `Document` is imported from `docx`, which is provided by the
# third-party `python-docx` package (`pip install python-docx`).
docx_path = "/mnt/data/Historico_Conversas_Remilson_Com_Audios.docx"
doc = Document(docx_path)
# Keep only paragraphs that contain something other than whitespace.
text_content = [para.text for para in doc.paragraphs if para.text.strip()]

# --- Step 2: Extract the messages and organize them in a DataFrame ---
# Compiled once, outside the loop. Both patterns are anchored at the start of
# the line because they are used with `.match`.
date_pattern = re.compile(r"\d{2}/\d{2}/\d{4}")
# Message lines start with the time wrapped in literal `**` markers
# (e.g. "**08:53** - Remilson Marceneiro: ...").
# NOTE(review): this presumes the DOCX text contains literal asterisks rather
# than real bold formatting -- confirm against the input document.
time_pattern = re.compile(r"\*\*\d{2}:\d{2}\*\*")

messages = []
current_date = None
for line in text_content:
    # A line starting with a date (e.g. "02/02/2024 - Outros assuntos") opens
    # a new day. Only the matched date is kept, so the header does not need
    # to use " - " as its separator.
    date_match = date_pattern.match(line)
    if date_match:
        current_date = date_match.group(0)
        continue

    if not time_pattern.match(line):
        continue

    # Split "<time> - <rest>" on the first " - " only; later occurrences
    # belong to the message text.
    time_part, separator, remainder = line.partition(" - ")
    if not separator:
        continue

    if current_date is None:
        # A message that appears before any date header cannot be timestamped.
        print(f"Mensagem ignorada (sem data anterior): {line}")
        continue

    time_str = time_part.replace("**", "").strip()
    # When a ": " separator exists, it splits the sender from the message.
    if ": " in remainder:
        sender, message = remainder.split(": ", 1)
    else:
        sender, message = "Desconhecido", remainder

    timestamp_str = f"{current_date} {time_str}"
    try:
        timestamp = datetime.strptime(timestamp_str, "%d/%m/%Y %H:%M")
    except ValueError:
        # Report the dropped line instead of discarding it silently.
        print(f"Mensagem ignorada (data/hora inválida '{timestamp_str}'): {line}")
        continue
    messages.append({"timestamp": timestamp, "sender": sender.strip(), "message": message.strip()})

# Explicit columns keep the frame usable (and sortable) when nothing was parsed.
df_messages = pd.DataFrame(messages, columns=["timestamp", "sender", "message"])
# Timestamps only have minute resolution, so a stable sort is required to keep
# messages sent within the same minute in their original document order.
df_messages = df_messages.sort_values(by="timestamp", kind="mergesort")

# --- Step 3: Identify the audio files (.opus) mentioned in the conversation ---
# `extract` yields NaN for messages without a voice-note file name, and
# `dropna` removes those rows, so no separate `str.contains` pre-filter is
# needed (the previous one also treated the "." in ".opus" as a regex wildcard).
audio_files_mentioned = (
    df_messages["message"]
    .str.extract(r"(PTT-\d{8}-WA\d+\.opus)")[0]
    .dropna()
    .unique()
)

# --- Step 4: Check which .opus files are available in the extracted folder ---
# Folder where the ZIP with the audio files was extracted (adjust if needed).
extract_folder = "/mnt/data/chat_extracted"
available_opus_files = [f for f in os.listdir(extract_folder) if f.endswith(".opus")]
# Sorted so the processing order (and the log below) is reproducible between
# runs; a bare set intersection has no defined order.
audio_files_to_process = sorted(set(audio_files_mentioned) & set(available_opus_files))
print("Áudios a processar:", audio_files_to_process)

# --- Step 5: Convert OPUS files to WAV with a fallback (tries 'opus', then 'ogg') ---
# Converted WAV files are written to a sub-folder of the extraction folder.
wav_folder = os.path.join(extract_folder, "wav_files")
os.makedirs(wav_folder, exist_ok=True)

def convert_opus_to_wav(opus_path, wav_path):
    """Convert the audio file at *opus_path* into a WAV file at *wav_path*.

    The input is first decoded with ``format="opus"``. If that raises, the
    error is printed and decoding is retried with ``format="ogg"``.

    Returns:
        *wav_path* after the WAV export, or ``None`` when both decoding
        attempts raised (the second error is printed as well).

    Errors raised by the WAV export itself are not caught here.
    """
    needs_fallback = False
    try:
        # First attempt: read the file as "opus".
        segment = AudioSegment.from_file(opus_path, format="opus")
    except Exception as opus_error:
        print(f"Erro convertendo {opus_path} com format 'opus': {opus_error}. Tentando 'ogg'...")
        needs_fallback = True

    if needs_fallback:
        try:
            # Second attempt: read the same file as "ogg".
            segment = AudioSegment.from_file(opus_path, format="ogg")
        except Exception as ogg_error:
            print(f"Erro convertendo {opus_path} com format 'ogg': {ogg_error}")
            return None

    segment.export(wav_path, format="wav")
    return wav_path

# Maps each processed .opus file name to its transcription, or to an error
# marker when the transcription step raised. Files whose conversion failed
# get no entry at all.
audio_transcriptions = {}
for audio_name in audio_files_to_process:
    source_file = os.path.join(extract_folder, audio_name)
    target_file = os.path.join(wav_folder, audio_name.replace(".opus", ".wav"))
    wav_file = convert_opus_to_wav(source_file, target_file)
    if not wav_file:
        # Conversion failed (already reported by the converter); skip the file.
        continue

    # --- Step 6: Transcribe the audio with speech_recognition (Google API, pt-BR) ---
    recognizer = sr.Recognizer()
    try:
        with sr.AudioFile(wav_file) as audio_source:
            recorded = recognizer.record(audio_source)
            text = recognizer.recognize_google(recorded, language="pt-BR")
            audio_transcriptions[audio_name] = text
            print(f"Transcrito {audio_name}: {text}")
    except Exception as error:
        # Best effort: report the failure and store a placeholder so the
        # remaining files are still processed.
        print(f"Erro na transcrição de {audio_name}: {error}")
        audio_transcriptions[audio_name] = "[Erro na transcrição]"

# --- Etapa 7: Incorporar as transcrições nas mensagens ---
def append_transcription(message):
    """Return *message* with the transcription of its voice note appended.

    The first ``PTT-<8 digits>-WA<digits>.opus`` file name found in *message*
    is looked up in the module-level ``audio_transcriptions`` mapping. When
    the file name has no entry there, a "not available" placeholder is
    appended instead. Messages without such a file name are returned unchanged.
    """
    found = re.search(r"(PTT-\d{8}-WA\d+\.opus)", message)
    if found is None:
        return message

    text = audio_transcriptions.get(found.group(1), "[Áudio não disponível]")
    return f"{message} - Transcrição: {text}"

# Replace every message by its transcription-annotated version.
annotated_messages = df_messages["message"].apply(append_transcription)
df_messages["message"] = annotated_messages

# --- Step 8: Save the complete conversation to a CSV file ---
# Written as "utf-8-sig", i.e. UTF-8 with a leading byte-order mark.
output_csv = "/mnt/data/conversa_transcrita.csv"
df_messages.to_csv(output_csv, encoding="utf-8-sig", index=False)
print(f"Conversas transcritas e salvas em: {output_csv}")


ModuleNotFoundError: No module named 'docx'