# Import Dependencies

In [1]:
import csv
import json
import os
from pprint import pprint
from typing import List, Optional

import nltk
import pandas as pd
from elasticsearch import Elasticsearch, helpers
# Fetch the NLTK resources used by this notebook. Resources that are already
# present locally are not downloaded again, so re-running the cell is cheap.
nltk.download('stopwords',quiet=True)
nltk.download('wordnet',quiet=True)
nltk.download('punkt',quiet=True)
# Newer NLTK releases load the 'punkt_tab' resource (instead of the pickled
# 'punkt' models) in sent_tokenize; fetch it too so the notebook runs on both.
nltk.download('punkt_tab',quiet=True)
nltk.download('omw-1.4',quiet=True)

True

In [2]:
# Client for the Elasticsearch node running on this machine
es = Elasticsearch("http://localhost:9200")

# Root directory of the file structure that holds the podcast transcripts
path = 'spotify-podcasts-2020-summarization-testset/spotify-podcasts-2020/'
directory = path + 'podcasts-transcripts-summarization-testset'



In [3]:
def get_episode_paths(dir: str, file_paths: Optional[List[str]] = None) -> List[str]:
    """Recursively collect the path of every file found under a directory.

    No filtering is applied: every non-directory entry is returned, whatever
    its name or extension.

    Args:
        dir (str): Directory to traverse.
        file_paths (List[str], optional): Existing list to append to; it is
            extended in place and returned. Defaults to None, in which case a
            fresh list is created for this call.

    Returns:
        List[str]: Paths of all files found under ``dir``.
    """
    # A mutable default ([]) is created once and shared by every call, so
    # calling the function again (e.g. re-running the cell) would keep
    # appending to the previous result. Create the list per call instead.
    if file_paths is None:
        file_paths = []

    # os.listdir returns entries in arbitrary order; sort them so the result
    # is the same on every run and platform.
    for item in sorted(os.listdir(dir)):
        item_path = os.path.join(dir, item)

        if os.path.isdir(item_path):
            # Descend into the sub-directory, accumulating into the same list
            get_episode_paths(item_path, file_paths)
        else:
            file_paths.append(item_path)

    return file_paths

# Gather the path of every transcript file below the dataset directory
file_paths = get_episode_paths(dir=directory)

In [4]:
def _uris_from_path(item_path: str):
    """Derive the (show URI, episode URI) pair from a transcript file path.

    The show id comes from the name of the file's parent directory
    (``show_<id>``); the episode id is the file name up to ``.json``.
    """
    show_dir = os.path.basename(os.path.dirname(item_path))
    if not show_dir.startswith('show_'):
        raise ValueError(
            f"Expected a parent directory named 'show_<id>' for {item_path!r}"
        )

    # Use only the directory name: splitting the whole path on 'show_' would
    # leave the path separator and the episode file name in the show URI.
    show_uri = 'spotify:show:' + show_dir[len('show_'):]
    episode_uri = 'spotify:episode:' + os.path.basename(item_path).split('.json')[0]
    return show_uri, episode_uri


def _transcript_pieces(raw_transcript: dict) -> List[str]:
    """Collect the transcript text of the first alternative of each usable result."""
    pieces = []
    for result in raw_transcript.get('results', []):
        alternatives = result.get('alternatives')
        if not alternatives:
            # Result without any alternative: nothing to extract
            continue

        alt = alternatives[0]
        # Only keep alternatives that carry all three expected fields
        if 'transcript' in alt and 'confidence' in alt and 'words' in alt:
            pieces.append(alt['transcript'])
    return pieces


def open_transcript_file(item_path: str) -> list:
    """
    Reads a transcript file and builds one Elasticsearch document per sentence.

    The transcript pieces of the file are joined into a single text, split
    into sentences with ``nltk.sent_tokenize`` and wrapped into dictionaries
    with ``_index``, ``_id`` and ``_source`` keys.

    Args:
        item_path (str): The path to the transcript file, located in a
            directory named ``show_<id>``

    Returns:
        list: One dictionary per sentence. ``_id`` is the episode URI followed
            by ``_<sentence position>``; ``_source`` holds ``show_uri``,
            ``episode_uri`` and ``sentence``. Empty if the file contains no
            usable transcript.

    Raises:
        ValueError: If the parent directory name does not start with ``show_``
    """
    show_uri, episode_uri = _uris_from_path(item_path)

    # Open the file once, with an explicit encoding, and close it right after
    # parsing (the platform default encoding is not necessarily UTF-8).
    with open(item_path, 'r', encoding='utf-8') as transcript_file:
        raw_transcript = json.load(transcript_file)

    # Join the pieces into a single string, then split it into sentences
    joined_transcript = ' '.join(_transcript_pieces(raw_transcript))
    sent_tokens = nltk.sent_tokenize(joined_transcript)

    return [
        {
            '_index': "transcripts_sentences",
            '_id': episode_uri + '_' + str(index),
            '_source': {
                'show_uri': show_uri,
                'episode_uri': episode_uri,
                'sentence': sentence
            }
        }
        for index, sentence in enumerate(sent_tokens)
    ]


In [9]:
# Accumulator for the sentence documents of every transcript
list__podcast_transcripts = []

# Convert each transcript file and add its sentence documents to the accumulator
for file_path in file_paths:
    list__podcast_transcripts += open_transcript_file(file_path)


spotify-podcasts-2020-summarization-testset/spotify-podcasts-2020/podcasts-transcripts-summarization-testset\0\1\show_015DbLwcXu2fK7e9jIfbFo\74t5WREXUbhEKNI89CNSkL.json
spotify-podcasts-2020-summarization-testset/spotify-podcasts-2020/podcasts-transcripts-summarization-testset\0\1\show_01DbRiALDPdvZdoiY8yQL6\5fG4VlWnWwzAt6mSs0H7lY.json
spotify-podcasts-2020-summarization-testset/spotify-podcasts-2020/podcasts-transcripts-summarization-testset\0\1\show_01eumErJvBdxCW4YJivbwc\2WQ1GcC6J0k7qsO8Vvf2be.json
spotify-podcasts-2020-summarization-testset/spotify-podcasts-2020/podcasts-transcripts-summarization-testset\0\1\show_01Txd706SjsgvM0cm0UXuM\5hvOWPoB0j6HMrSVAMtJLV.json
spotify-podcasts-2020-summarization-testset/spotify-podcasts-2020/podcasts-transcripts-summarization-testset\0\1\show_01Txd706SjsgvM0cm0UXuM\7JG3lLnRoDdOxuqjf14ZkM.json
spotify-podcasts-2020-summarization-testset/spotify-podcasts-2020/podcasts-transcripts-summarization-testset\0\2\show_02Yjg2GfjecTzu0NbTV1HD\3kkhUQJ9DXYs6a

In [11]:
# Preview the number of sentence documents prepared for indexing
len(list__podcast_transcripts)

359330

In [12]:
# Index all documents (transcript sentences) through the bulk API.
# Each entry already has the '_index' / '_id' / '_source' action layout that
# helpers.bulk consumes, so the documents are sent in batched requests instead
# of one HTTP round trip per sentence. With its default settings helpers.bulk
# raises if any document fails to index.
helpers.bulk(es, list__podcast_transcripts)

print("Indexing completed.")

Indexing completed.


## Sanity Test:

Execute the following commands from a command window/terminal:

**To verify the index got created:**
- curl -XGET "http://localhost:9200/_cat/indices?v"


**To dump the documents in the index to stdout:**
- curl -XGET "http://localhost:9200/transcripts_sentences/_search?pretty=true"

**To dump the mapping of the index to stdout:**
- curl -XGET "http://localhost:9200/transcripts_sentences/_mapping?pretty"


## How to delete the index?
**To delete the index you created:**
- curl -XDELETE "http://localhost:9200/transcripts_sentences?pretty"

