# Función final para identificación del Liker

In [78]:
# Instalación de requerimientos en caso de ser necesario
# pip install -r requirements.txt 

In [1]:
# Carga de librerias necesarias
# Importacion de librerias
import os
import spacy
import pandas as pd
import numpy as np
from spacy.matcher import Matcher

# Load the spaCy pipeline. `nlp` is read as a module-level global by
# Speaker_Asignation (to tokenize each line and to supply the Matcher vocab).
# require_cpu() is called before spacy.load so the pipeline is loaded on CPU.
spacy.require_cpu() # Temporary, until the problem with GPU usage is fixed
nlp = spacy.load("es_core_news_lg")

In [83]:
# La función ya trae por defecto un conjunto de patrones para identificar el Liker pero solo es válido para la campaña de SURA
def Speaker_Asignation(file_path, additional_patterns = None, write_txt = True, keep_strategies = True, Word_Count = False, Num_Matches = False, install_requirements = False):
    """
    Label every line of a diarized transcript as "Liker" or "Cliente".

    Three strategies each pick the speaker tag they believe is the Liker:
      1. The speaker of the first line of the transcript.
      2. The speaker with the highest total word count.
      3. The speaker whose lines produce the most pattern matches (built-in
         SURA campaign patterns plus any `additional_patterns`).
    Without a tie in strategy 3, a line is labelled Liker when at least two of
    the three strategies agree. When strategy 3 ties (or the file contains a
    single speaker), strategy 2 alone decides.

    Parameters
    ----------
    file_path : str
        Path of the transcribed .txt file without speaker roles (the original
        notes say each line carries either SPEAKER_00 or SPEAKER_01). Every
        line is sliced at fixed columns: [0:17] time range, [19:29] speaker
        tag, [31:] text. Blank lines are skipped.
    additional_patterns : str, list, tuple, set, dict or None
        Extra patterns for the third strategy (for phrases not covered by the
        built-in list or belonging to another campaign).
        * A string or a list/tuple/set of strings: each phrase is lower-cased,
          split on whitespace and matched token by token (case-insensitive).
        * A dict: assumed to map a rule name to a list of spaCy token
          patterns; each entry is registered with matcher.add(name, patterns).
    write_txt : bool
        Write "Identified_Speakers/Asigned_Speaker_<original file name>" with
        one "<time> <role> <text>" line per transcript line. The folder is
        created when missing.
    keep_strategies : bool
        Keep the per-strategy columns (Speaker_Asignado_Opt_1/2/3) in the
        returned DataFrame. On a tie, Speaker_Asignado_Opt_3 ("Empate") is
        kept even when this is False.
    Word_Count : bool
        Keep the per-line word count column.
    Num_Matches : bool
        Keep the per-line number-of-matches column.
    install_requirements : bool
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    dict
        "Transcripted_Df": DataFrame with the transcript and assigned roles,
        "Word_Count": total words per speaker,
        "Num_Matches": total pattern matches per speaker (descending).

    Raises
    ------
    ValueError
        If the file contains no non-blank lines.
    TypeError
        If `additional_patterns` is of an unsupported type.
    """
    # Labels used in the final column and in the written file (padded so both have the same width)
    liker_label = "[Liker  ]:"
    client_label = "[Cliente]:"

    # Read the transcript; blank lines are skipped so they do not create an empty-speaker group
    with open(file_path, "r", encoding = "utf-8") as archivo:
        rows = [[linea[:17].strip(), linea[19:29].strip(), linea[31:].strip()]
                for linea in archivo if linea.strip()]

    if not rows:
        raise ValueError(f"No transcript lines found in {file_path!r}")

    transcript_df = pd.DataFrame(rows, columns = ["Tiempo", "Speaker", "Texto"])

    # Strategy 1: Liker = first speaker
    first_speaker = transcript_df["Speaker"].iloc[0]
    transcript_df["Speaker_Asignado_Opt_1"] = np.where(transcript_df["Speaker"] == first_speaker, "Liker", "Cliente")

    # Strategy 2: Liker = speaker who talks the most
    transcript_df["Word_Count"] = transcript_df["Texto"].apply(lambda text: len(text.split()))
    words_per_speaker = transcript_df[["Speaker", "Word_Count"]].groupby(by = "Speaker").sum().reset_index()

    # idxmax returns a single row even when totals are tied (the first speaker in
    # groupby order wins), so the comparison below always yields one boolean per line
    top_talker = words_per_speaker.loc[words_per_speaker["Word_Count"].idxmax(), "Speaker"]
    transcript_df["Speaker_Asignado_Opt_2"] = np.where(transcript_df["Speaker"] == top_talker, "Liker", "Cliente")

    if not Word_Count:
        transcript_df = transcript_df.drop(columns = "Word_Count")

    # Strategy 3: Liker = speaker with the most pattern matches
    matcher = Matcher(nlp.vocab)
    sura_patterns = [[{"LOWER": "sudamericana"}], [{"LOWER": "de"}, {"LOWER": "sudamericana"}],
                    [{"LOWER": {"REGEX": r'sura'}}], [{"LOWER": "de"}, {"LOWER": {"REGEX": r'sura'}}],
                    [{"LOWER": {"REGEX": r'seguro'}}], [{"LOWER": {"REGEX": r'seguro'}}, {"LOWER": {"REGEX": r'sura'}}],
                    [{"LOWER": "póliza"}], [{"LOWER": "poliza"}], [{"LOWER": {"REGEX": "cotización"}}],
                    [{"LOWER": {"REGEX": r'asegura'}}],
                    [{"LOWER": "grabada"}], [{"LOWER": "monitoreada"}], [{"LOWER": "siendo"}, {"LOWER": "grabada"}],
                    [{"LOWER": "siendo"}, {"LOWER": "grabada"}, {"LOWER": "y"}, {"LOWER": "monitoreada"}],
                    [{"LOWER": "validar"}, {"LOWER": "datos"}]]
    matcher.add("sura_patterns", sura_patterns)

    # User-supplied patterns
    if additional_patterns:
        if isinstance(additional_patterns, dict):
            # Matcher.add needs a key and a list of patterns, so register each entry separately
            for rule_name, rule_patterns in additional_patterns.items():
                matcher.add(rule_name, rule_patterns)
        elif isinstance(additional_patterns, (str, list, tuple, set)):
            phrases = [additional_patterns] if isinstance(additional_patterns, str) else additional_patterns
            # Blank phrases are dropped: they would produce an empty token pattern
            patterns = [[{"LOWER": token} for token in phrase.lower().split()]
                        for phrase in phrases if phrase.strip()]
            if patterns:
                matcher.add("additional_patterns", patterns)
        else:
            raise TypeError("additional_patterns must be a str, list, tuple, set or dict, "
                            f"not {type(additional_patterns).__name__}")

    # Number of matches found in each line (overlapping patterns are each counted)
    transcript_df["Num_Matches"] = transcript_df["Texto"].apply(lambda text: len(matcher(nlp(text))))

    # Total matches per speaker, highest first
    num_matches = transcript_df[["Speaker", "Num_Matches"]].groupby(by = "Speaker").sum().reset_index().sort_values("Num_Matches", ascending = False)

    # Tie: every speaker has the same total (this includes single-speaker files),
    # or more than one speaker shares the highest total
    leaders = num_matches.loc[num_matches["Num_Matches"] == num_matches["Num_Matches"].max(), "Speaker"]
    tie = num_matches["Num_Matches"].nunique() == 1 or len(leaders) > 1

    if not tie:
        transcript_df["Speaker_Asignado_Opt_3"] = np.where(transcript_df["Speaker"] == leaders.iloc[0], "Liker", "Cliente")
    else:
        transcript_df["Speaker_Asignado_Opt_3"] = "Empate"

    if not Num_Matches:
        transcript_df = transcript_df.drop(columns = "Num_Matches")

    # Final assignment. On a tie the second strategy (who talks the most) decides;
    # otherwise a line is Liker when at least two of the three strategies say so.
    if not tie:
        liker_votes = transcript_df[["Speaker_Asignado_Opt_1", "Speaker_Asignado_Opt_2", "Speaker_Asignado_Opt_3"]].eq("Liker").sum(axis = 1)
        is_liker = liker_votes >= 2
    else:
        is_liker = transcript_df["Speaker_Asignado_Opt_2"] == "Liker"
    # Same bracketed labels in both branches so the written file has a single format
    transcript_df["Speaker_Asignado"] = np.where(is_liker, liker_label, client_label)

    # Drop the per-strategy columns when not requested (Opt_3 is kept on a tie as a marker)
    if not keep_strategies:
        if not tie:
            transcript_df = transcript_df.drop(columns = ["Speaker_Asignado_Opt_1", "Speaker_Asignado_Opt_2", "Speaker_Asignado_Opt_3"])
        else:
            transcript_df = transcript_df.drop(columns = ["Speaker_Asignado_Opt_1", "Speaker_Asignado_Opt_2"])

    # Write the labelled transcript to a txt file
    if write_txt:
        output_dir = "Identified_Speakers"
        os.makedirs(output_dir, exist_ok = True)
        # os.path.join keeps the path valid on every operating system
        output_path = os.path.join(output_dir, "Asigned_Speaker_" + os.path.basename(file_path))
        with open(output_path, "w", encoding = "utf-8") as archivo:
            for tiempo, rol, texto in zip(transcript_df["Tiempo"], transcript_df["Speaker_Asignado"], transcript_df["Texto"]):
                archivo.write(f"{tiempo} {rol} {texto}\n")

    return {"Transcripted_Df": transcript_df, "Word_Count": words_per_speaker, "Num_Matches": num_matches}

In [4]:
# List the entries of the raw transcriptions folder (relative to the working directory)
os.listdir("Raw_Transcriptions/")

['1001764369_1_transcription.txt',
 '1014249230_3_transcription.txt',
 '1129574339_6_transcription.txt',
 '14838701_2_transcription.txt',
 'force-44441940-972-20240401-112258-1711988578.150759.txt',
 'out-3003958667-994-20240405-163905-1712353145.162469.txt',
 'out-3054535186-936-20240401-115437-1711990477.150948.txt',
 'out-3103166473-973-20240404-180947-1712272187.160933.txt',
 'out-3103465155-956-20240401-132257-1711995777.151309.txt',
 'out-3103675012-936-20240404-095452-1712242492.158691.txt',
 'out-3103723286-956-20240405-135755-1712343475.161940.txt',
 'out-3103814989-913-20240404-161134-1712265094.160513.txt',
 'out-3103814989-913-20240405-175919-1712357959.162644.txt',
 'out-3103894985-900-20240405-151650-1712348210.162208.txt',
 'out-3104294636-980-20240403-162248-1712179368.157746.txt',
 'out-3104493260-900-20240402-181030-1712099430.155354.txt',
 'out-3104560865-913-20240405-084247-1712324567.161074.txt',
 'out-3104875598-900-20240402-141516-1712085316.154263.txt',
 'out-31

# Asignación de roles en las transcripciones crudas

In [70]:
# List the entries of the raw transcriptions folder before assigning roles
os.listdir("Raw_Transcriptions/")

['1001764369_1_transcription.txt',
 '1014249230_3_transcription.txt',
 '1129574339_6_transcription.txt',
 '14838701_2_transcription.txt',
 'force-44441940-972-20240401-112258-1711988578.150759.txt',
 'out-3003958667-994-20240405-163905-1712353145.162469.txt',
 'out-3054535186-936-20240401-115437-1711990477.150948.txt',
 'out-3103166473-973-20240404-180947-1712272187.160933.txt',
 'out-3103465155-956-20240401-132257-1711995777.151309.txt',
 'out-3103675012-936-20240404-095452-1712242492.158691.txt',
 'out-3103723286-956-20240405-135755-1712343475.161940.txt',
 'out-3103814989-913-20240404-161134-1712265094.160513.txt',
 'out-3103814989-913-20240405-175919-1712357959.162644.txt',
 'out-3103894985-900-20240405-151650-1712348210.162208.txt',
 'out-3104294636-980-20240403-162248-1712179368.157746.txt',
 'out-3104493260-900-20240402-181030-1712099430.155354.txt',
 'out-3104560865-913-20240405-084247-1712324567.161074.txt',
 'out-3104875598-900-20240402-141516-1712085316.154263.txt',
 'out-31

In [82]:
# Run the role assignment on a single transcription with the default options
# (the returned dict is displayed as the cell output)
Speaker_Asignation("Raw_Transcriptions/out-3103465155-956-20240401-132257-1711995777.151309.txt")

{'Transcripted_Df':                Tiempo     Speaker  \
 0   0:00:12 - 0:00:12  SPEAKER_01   
 1   0:00:12 - 0:00:16  SPEAKER_00   
 2   0:00:16 - 0:00:17  SPEAKER_01   
 3   0:00:17 - 0:00:20  SPEAKER_00   
 4   0:00:21 - 0:00:22  SPEAKER_01   
 5   0:00:22 - 0:00:33  SPEAKER_00   
 6   0:00:33 - 0:00:34  SPEAKER_01   
 7   0:00:34 - 0:00:38  SPEAKER_00   
 8   0:00:39 - 0:00:50  SPEAKER_01   
 9   0:00:50 - 0:00:58  SPEAKER_00   
 10  0:00:59 - 0:01:10  SPEAKER_01   
 11  0:01:11 - 0:01:25  SPEAKER_01   
 12  0:01:25 - 0:01:36  SPEAKER_00   
 13  0:01:38 - 0:01:46  SPEAKER_00   
 14  0:01:46 - 0:01:47  SPEAKER_01   
 15  0:01:49 - 0:01:54  SPEAKER_00   
 16  0:01:55 - 0:02:01  SPEAKER_01   
 17  0:02:01 - 0:02:11  SPEAKER_00   
 18  0:02:11 - 0:02:12  SPEAKER_01   
 19  0:02:12 - 0:02:16  SPEAKER_00   
 20  0:02:16 - 0:02:17  SPEAKER_01   
 21  0:02:18 - 0:02:21  SPEAKER_00   
 22  0:02:22 - 0:02:23  SPEAKER_01   
 23  0:02:23 - 0:02:25  SPEAKER_00   
 24  0:02:33 - 0:02:36  SPEAKER

In [85]:
# Assign speaker roles for every raw transcription in the folder.
# os.listdir also returns sub-folders and hidden entries (for example a
# ".ipynb_checkpoints" folder Jupyter may create), which cannot be parsed as
# transcripts, so only .txt files are processed.
for file in os.listdir("Raw_Transcriptions/"):
    if file.endswith(".txt"):
        Speaker_Asignation(f"Raw_Transcriptions/{file}")