In [1]:
# Import necessary libraries

import pandas as pd
import numpy as np
import json 
import csv
import re
import string
import os

## Objective:
#### - To analyze the vocal features of the speakers in an earnings call, we need to automate the extraction of a 1-minute audio clip for each speaker in the call.
#### - We have the audio clips and the transcripts of all the earnings calls.
#### - We will use a forced-alignment package, Gentle, to align each transcript with its audio file and obtain a timestamp for every word, which tells us where to crop the audio.

In [6]:
# Preview of one transcript file; the text spoken in each row is in 'Words spoken'
sample = pd.read_csv(
    'D://forced Alignment/Seoung/BWLD_20111019_w timestamp - sample.csv'
)

sample.head()

Unnamed: 0,ticker,Call title,Quarter,Year,actual call date,company name,Raw speaker info,Speaker name,additional speaker info,Words spoken,truncated,em dash
0,BWLD,Q3 2011 Buffalo Wild Wings Inc Earnings Confer...,Q3,2011,19-Oct-11,Buffalo Wild Wings Inc,OPERATOR,OPERATOR,,"Good afternoon, ladies and gentlemen. Welcome ...",,
1,BWLD,Q3 2011 Buffalo Wild Wings Inc Earnings Confer...,Q3,2011,19-Oct-11,Buffalo Wild Wings Inc,"MARY TWINEM, EVP, CFO, BUFFALO WILD WINGS, INC.",MARY TWINEM,"EVP, CFO, BUFFALO WILD WINGS, INC.","Good afternoon, and thank you for joining us a...",,
2,BWLD,Q3 2011 Buffalo Wild Wings Inc Earnings Confer...,Q3,2011,19-Oct-11,Buffalo Wild Wings Inc,"SALLY SMITH, PRESIDENT AND CEO, BUFFALO WILD W...",SALLY SMITH,"PRESIDENT AND CEO, BUFFALO WILD WINGS, INC.","Good afternoon, everyone. Demand for the Buffa...",,
3,BWLD,Q3 2011 Buffalo Wild Wings Inc Earnings Confer...,Q3,2011,19-Oct-11,Buffalo Wild Wings Inc,MARY TWINEM,MARY TWINEM,"EVP, CFO, BUFFALO WILD WINGS, INC.","Thank you, Sally. Our revenue in the third qua...",,
4,BWLD,Q3 2011 Buffalo Wild Wings Inc Earnings Confer...,Q3,2011,19-Oct-11,Buffalo Wild Wings Inc,SALLY SMITH,SALLY SMITH,"PRESIDENT AND CEO, BUFFALO WILD WINGS, INC.","Thank you, Mary. We are very pleased with our ...",,


#### Screenshots from a sample Gentle JSON file

##### When Gentle was successfully able to align a word    

<img src="Success.PNG" width="150" align = "left" />

<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>

<br>
<br>


##### When Gentle was unable to align a word

<img src="Failure.PNG" width="200" align = "left" />

In [2]:
# Folder with the JSON files generated by Gentle (one timestamp entry per word).
# NOTE: file paths below are built as folder + filename, so every folder
# string must keep its trailing '/'.
json_folder = 'D://forced Alignment/Gandhar/'

# Folder with the transcript csv files
transcript_folder = 'D://forced Alignment/Seoung/'

# Folder where the updated csv files (with start/end timestamps) will be saved
output_folder = 'D://forced Alignment/Updated/'

In [3]:
# Maps the first '_'-separated part of each JSON file name (presumably the
# company ticker - verify against the file names) to that file's 'words' list.
data = {}

In [4]:
# Strip from transcript text every symbol that Gentle cannot interpret
def remove_punct(text):
    """Return `text` with each run of characters other than ASCII letters,
    digits, apostrophes and spaces replaced by a single space."""
    unsupported_run = re.compile("[^a-zA-Z0-9' ]+")
    return unsupported_run.sub(' ', text)

In [7]:
# Load every Gentle alignment file and store its word list under the first
# '_'-separated part of the file name.
# NOTE(review): two files sharing that prefix would overwrite each other here.

for filename in os.listdir(json_folder):
    key = filename.split('_')[0]
    with open(json_folder+filename, "r") as read_file:
        data[key] = json.load(read_file)['words']

In [8]:
# Functions to get the ending timestamp of a speaker's words

def _format_min_sec(sec):
    """Format a time in seconds as the string 'M.SS': whole minutes, a dot,
    then the whole seconds zero-padded to two digits (e.g. 66.4 -> '1.06')."""
    minutes = sec // 60
    rem = int(sec - (minutes * 60))
    return '{}.{:02d}'.format(int(minutes), rem)


def get_end_time(tup, data):
    """Return the end timestamp ('M.SS' string) for one transcript row.

    Parameters
    ----------
    tup : mapping with the keys 'Count start' and 'Count end'
        Word indices delimiting the row's words inside `data`. It is only
        read, never modified.
    data : list of dict
        Word entries; an entry counts as aligned when its 'case' is 'success',
        in which case its 'end' value (seconds) is used.

    Returns
    -------
    str
        The 'end' time of the word at 'Count end' if it was aligned, otherwise
        of the closest aligned word before it that is not before
        'Count start'. Returns 'not Found' when no such word exists.

    Notes
    -----
    * An index at or beyond the end of `data` is clamped to the last word
      instead of raising IndexError or recursing on a mutated row.
    * The backward search never goes below 'Count start' (or below index 0),
      so it cannot wrap around to the end of the list.
    * NOTE(review): if 'Count end' is a cumulative word count, then as a
      0-based index it points at the word *after* the row's last word -
      confirm whether that one-word shift is intended.
    """
    if not len(data):
        return 'not Found'

    last_index = len(data) - 1
    end = min(int(tup['Count end']), last_index)
    # The word at `end` itself is always a candidate, even if start > end.
    start = max(min(int(tup['Count start']), end), 0)

    # Walk backwards from the end word until an aligned word is found.
    for idx in range(end, start - 1, -1):
        word = data[idx]
        if word.get('case') == 'success' and 'end' in word:
            return _format_min_sec(word['end'])

    return 'not Found'

In [189]:
# Functions to get the starting timestamp of a speaker's words
# NOTE(review): get_start_time is defined again in a later cell of this
# notebook; on a top-to-bottom run the later definition replaces this one.
# Keep only one of the two cells.

def _format_min_sec(sec):
    """Format a time in seconds as the string 'M.SS': whole minutes, a dot,
    then the whole seconds zero-padded to two digits (e.g. 66.4 -> '1.06')."""
    minutes = sec // 60
    rem = int(sec - (minutes * 60))
    return '{}.{:02d}'.format(int(minutes), rem)


def get_start_time(tup, data):
    """Return the start timestamp ('M.SS' string) for one transcript row.

    Parameters
    ----------
    tup : mapping with the keys 'Count start' and 'Count end'
        Word indices delimiting the row's words inside `data`. It is only
        read, never modified.
    data : list of dict
        Word entries; an entry counts as aligned when its 'case' is 'success',
        in which case its 'start' value (seconds) is used.

    Returns
    -------
    str
        The 'start' time of the word at 'Count start' if it was aligned,
        otherwise of the first aligned word after it that is not past
        'Count end'. Returns 'Not Found' when no such word exists.

    Notes
    -----
    * The word's 'start' time is used; previously its 'end' time was returned,
      which places the start of a speaker at the end of their first word.
    * Indices outside `data` are clamped to the valid range instead of
      raising IndexError or recursing on a mutated row, and the forward
      search never looks past 'Count end'.
    """
    if not len(data):
        return 'Not Found'

    last_index = len(data) - 1
    start = max(min(int(tup['Count start']), last_index), 0)
    # The word at `start` itself is always a candidate, even if start > end.
    end = min(max(int(tup['Count end']), start), last_index)

    # Walk forwards from the start word until an aligned word is found.
    for idx in range(start, end + 1):
        word = data[idx]
        if word.get('case') == 'success' and 'start' in word:
            return _format_min_sec(word['start'])

    return 'Not Found'


In [9]:
# Functions to get the starting timestamp of a speaker's words
# NOTE(review): an earlier cell of this notebook also defines get_start_time;
# on a top-to-bottom run this later definition is the one in effect.

def _format_min_sec(sec):
    """Format a time in seconds as the string 'M.SS': whole minutes, a dot,
    then the whole seconds zero-padded to two digits (e.g. 66.4 -> '1.06')."""
    minutes = sec // 60
    rem = int(sec - (minutes * 60))
    return '{}.{:02d}'.format(int(minutes), rem)


def get_start_time(tup, data):
    """Return the start timestamp ('M.SS' string) for one transcript row.

    Parameters
    ----------
    tup : mapping with the keys 'Count start' and 'Count end'
        Word indices delimiting the row's words inside `data`. It is only
        read, never modified.
    data : list of dict
        Word entries; an entry counts as aligned when its 'case' is 'success',
        in which case its 'start' value (seconds) is used.

    Returns
    -------
    str
        The 'start' time of the word at 'Count start' if it was aligned,
        otherwise of the first aligned word after it that is not past
        'Count end'. Returns 'Not Found' when no such word exists.

    Notes
    -----
    * The word's 'start' time is used; previously its 'end' time was returned,
      which places the start of a speaker at the end of their first word.
    * Indices outside `data` are clamped to the valid range instead of
      raising IndexError, and the forward search never looks past
      'Count end'.
    """
    if not len(data):
        return 'Not Found'

    last_index = len(data) - 1
    start = max(min(int(tup['Count start']), last_index), 0)
    # The word at `start` itself is always a candidate, even if start > end.
    end = min(max(int(tup['Count end']), start), last_index)

    # Walk forwards from the start word until an aligned word is found.
    for idx in range(start, end + 1):
        word = data[idx]
        if word.get('case') == 'success' and 'start' in word:
            return _format_min_sec(word['start'])

    return 'Not Found'



In [14]:
# Add start/end timestamps to every transcript and save the result as a new csv
for filename in os.listdir(transcript_folder):

    print(filename)

    # Get the company ticker and the date from the filename ('<ticker>_<date>_...')
    name_parts = filename.split('_')
    ticker = name_parts[0]
    date = name_parts[1]

    # Read transcript csv from the folder
    df = pd.read_csv(os.path.join(transcript_folder, filename))

    # Remove the symbols not recognized by Gentle. Empty cells are read by
    # pandas as NaN (a float), which remove_punct cannot process, so they are
    # turned into empty strings first.
    df['Words spoken'] = df['Words spoken'].fillna('').astype(str).apply(remove_punct)

    # Get number of words spoken for each row
    df['Count'] = df['Words spoken'].apply(lambda x: len(x.split()))

    # Cumulatively sum the counts to get the index for last spoken word for each row
    df['Count end'] = df['Count'].cumsum()

    # Get starting word from the previous row's end word; the first row starts at 0.
    # Built as one expression and assigned back: an in-place fillna on a
    # selected column does not reliably update the frame, and astype() has no
    # `inplace` argument.
    # NOTE(review): 'Count end' is a cumulative count, so used as a 0-based
    # index it points one word past the row's last word, and 'Count start' of
    # every row after the first is shifted by one word as well - confirm that
    # this is intended.
    df['Count start'] = (df['Count end'].shift(1) + 1).fillna(0).astype('int64')

    # Get the starting and ending time
    words = data[ticker]
    index_columns = df[['Count end', 'Count start']]
    df['start_time'] = index_columns.apply(lambda x: get_start_time(x, words), axis = 1)
    df['end_time'] = index_columns.apply(lambda x: get_end_time(x, words), axis = 1)

    output_name = ticker+'_'+date+'_'+'gentle_time_sec.csv'
    df.to_csv(os.path.join(output_folder, output_name), index=False, float_format='%.2f')

BWLD_20111019_w timestamp.csv
EGL_20170309_w timestamp.csv
NCI_20130214_w timestamp.csv
WSM_20110315_w timestamp.csv
XRX_20121023_w timestamp.csv


In [15]:
# The frame of the last processed transcript now carries start_time and end_time

df.head()

Unnamed: 0,ticker,Call title,Quarter,Year,actual call date,company name,Raw speaker info,Speaker name,additional speaker info,Words spoken,truncated,em dash,Count,Count end,Count start,start_time,end_time
0,XRX,Q3 2012 Xerox Corporation Earnings Conference ...,Q3,2012,23-Oct-12,Xerox Corporation,OPERATOR,OPERATOR,,Good morning and welcome to the Xerox Corporat...,,,164,164,0,0.0,1.06
1,XRX,Q3 2012 Xerox Corporation Earnings Conference ...,Q3,2012,23-Oct-12,Xerox Corporation,"URSULA BURNS, CHAIRMAN AND CEO, XEROX CORPORATION",URSULA BURNS,"CHAIRMAN AND CEO, XEROX CORPORATION",Good morning and thanks for joining us today ...,,,1405,1569,165,1.07,9.44
2,XRX,Q3 2012 Xerox Corporation Earnings Conference ...,Q3,2012,23-Oct-12,Xerox Corporation,"LUCA MAESTRI, EVP AND CFO, XEROX CORPORATION",LUCA MAESTRI,"EVP AND CFO, XEROX CORPORATION",Thank you Ursula and good morning everyone ...,,,1755,3324,1570,9.44,21.27
3,XRX,Q3 2012 Xerox Corporation Earnings Conference ...,Q3,2012,23-Oct-12,Xerox Corporation,URSULA BURNS,URSULA BURNS,"CHAIRMAN AND CEO, XEROX CORPORATION",Thanks Luca Let me quickly wrap up so that w...,,,259,3583,3325,21.28,22.53
4,XRX,Q3 2012 Xerox Corporation Earnings Conference ...,Q3,2012,23-Oct-12,Xerox Corporation,OPERATOR,OPERATOR,,Ananda Baruah Brean Capital,,,4,3587,3584,22.53,23.01
