In [1]:
project_id = 'test-281700'
!gcloud config set project {project_id}
!gsutil ls

Updated property [core/project].
gs://spotify_asr_dataset/


In [2]:
bucket_name = 'spotify_asr_dataset'
!gsutil -m cp -r gs://{bucket_name}/dataset.csv /home/jupyter
!gsutil -m cp -r gs://{bucket_name}/filtered-episode-ids.txt /home/jupyter
!gsutil -m cp -r gs://{bucket_name}/metadata.tsv /home/jupyter

Copying gs://spotify_asr_dataset/dataset.csv...
==> NOTE: You are downloading one or more large file(s), which would            
run significantly faster if you enabled sliced object downloads. This
feature is enabled by default but requires that compiled crcmod be
installed (see "gsutil help crcmod").

/ [1/1 files][  2.9 GiB/  2.9 GiB] 100% Done     0.0 B/s                        
Operation completed over 1 objects/2.9 GiB.                                      
Copying gs://spotify_asr_dataset/filtered-episode-ids.txt...
/ [1/1 files][  2.5 MiB/  2.5 MiB] 100% Done                                    
Operation completed over 1 objects/2.5 MiB.                                      
Copying gs://spotify_asr_dataset/metadata.tsv...
| [1/1 files][112.2 MiB/112.2 MiB] 100% Done                                    
Operation completed over 1 objects/112.2 MiB.                                    


In [12]:
import pandas as pd
dataset = pd.read_csv('dataset.csv')
dataset.head(1)

Unnamed: 0,episode_id,transcript
0,spotify:episode:399kdfMnjw0KYANZU7CQJ0,It's the mother back a podcast. Well that was...


In [13]:
filter = pd.read_csv('filtered-episode-ids.txt', sep=" ", header=None, names=["episode_id"])
filter.head(1)

Unnamed: 0,episode_id
0,spotify:episode:000A9sRBYdVh66csG2qEdj


In [14]:
metadata = pd.read_csv('metadata.tsv', sep='\t')
metadata.head(1)

Unnamed: 0,show_uri,show_name,show_description,publisher,language,rss_link,episode_uri,episode_name,episode_description,duration,show_filename_prefix,episode_filename_prefix
0,spotify:show:2NYtxEZyYelR6RMKmjfPLB,Kream in your Koffee,A 20-something blunt female takes on the world...,Katie Houle,['en'],https://anchor.fm/s/11b84b68/podcast/rss,spotify:episode:000A9sRBYdVh66csG2qEdj,1: It’s Christmas Time!,On the first ever episode of Kream in your Kof...,12.700133,show_2NYtxEZyYelR6RMKmjfPLB,000A9sRBYdVh66csG2qEdj


In [15]:
dataset = dataset.merge(metadata, how='inner', right_on='episode_uri', left_on='episode_id')
del dataset['episode_uri']
dataset.head(1)

Unnamed: 0,episode_id,transcript,show_uri,show_name,show_description,publisher,language,rss_link,episode_name,episode_description,duration,show_filename_prefix,episode_filename_prefix
0,spotify:episode:399kdfMnjw0KYANZU7CQJ0,It's the mother back a podcast. Well that was...,spotify:show:002B8PbILr169CdsS9ySTH,The Mother Bakker Podcast,The Mother Bakker Podcast is hosted by none ot...,Mim Bakker,['en-AU'],https://anchor.fm/s/102fcbb8/podcast/rss,"Ep #1 | Mim, Mems & Mates",This is the official start to The Mother Bakke...,57.1772,show_002B8PbILr169CdsS9ySTH,399kdfMnjw0KYANZU7CQJ0


In [16]:
dataset = dataset.loc[dataset["episode_id"].isin(filter['episode_id'])]
dataset = dataset.dropna()

In [17]:
#clean the episode descriptions
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import string

def clean_up(text):
    head, _ , _ = text.partition(' ---')
    head = head.strip()
    if len(head) > 0:
        first_letter = head[0].capitalize()
        head = first_letter + head[1:]
        if head[-1] in string.punctuation:
            head += '.'
    return head

[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [18]:
#keyword extraction class
# https://towardsdatascience.com/textrank-for-keyword-extraction-by-python-c0bae21bcec0
from collections import OrderedDict
import numpy as np
import spacy
!python -m spacy download en_core_web_sm
!python -m spacy link en_core_web_sm en
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load('en')

class TextRank4Keyword():
    """Extract keywords from text"""
    
    def __init__(self):
        self.d = 0.85 # damping coefficient, usually is .85
        self.min_diff = 1e-5 # convergence threshold
        self.steps = 10 # iteration steps
        self.node_weight = None # save keywords and its weight

    
    def set_stopwords(self, stopwords):  
        """Set stop words"""
        for word in STOP_WORDS.union(set(stopwords)):
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True
    
    # def normalize(self, token):
    #     if (token.is_alpha):
    #         return token.lemma_.lower().strip() if token.lemma_ != "-PRON-" else token.lower_
    
    def sentence_segment(self, doc, candidate_pos, lower):
        """Store those words only in cadidate_pos"""
        sentences = []
        for sent in doc.sents:
            selected_words = []
            for token in sent:
                # Store words only with cadidate POS tag
                if token.pos_ in candidate_pos and token.is_stop is False and token.is_alpha:
                    # token = self.normalize(token)
                    # selected_words.append(token)
                    if lower is True:
                        selected_words.append(token.lemma_.lower().strip() if token.lemma_ != "-PRON-" else token.lower_)
                    else:
                        selected_words.append(token.lemma_.lower().strip() if token.lemma_ != "-PRON-" else token.lower_)
            sentences.append(selected_words)
        return sentences
    
        
    def get_vocab(self, sentences):
        """Get all tokens"""
        vocab = OrderedDict()
        i = 0
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = i
                    i += 1
        return vocab
    
    def get_token_pairs(self, window_size, sentences):
        """Build token_pairs from windows in sentences"""
        token_pairs = list()
        for sentence in sentences:
            for i, word in enumerate(sentence):
                for j in range(i+1, i+window_size):
                    if j >= len(sentence):
                        break
                    pair = (word, sentence[j])
                    if pair not in token_pairs:
                        token_pairs.append(pair)
        return token_pairs
        
    def symmetrize(self, a):
        return a + a.T - np.diag(a.diagonal())
    
    def get_matrix(self, vocab, token_pairs):
        """Get normalized matrix"""
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            i, j = vocab[word1], vocab[word2]
            g[i][j] = 1
            
        # Get Symmeric matrix
        g = self.symmetrize(g)
        
        # Normalize matrix by column
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, where=norm!=0) # this is ignore the 0 element in norm
        
        return g_norm

    
    def get_keywords(self, number=100000, limit=0.0):
        """Print top number keywords"""
        node_weight = OrderedDict(sorted(self.node_weight.items(), key=lambda t: t[1], reverse=True))
        keywords = []
        for i, (key, value) in enumerate(node_weight.items()):
            if value >= limit:
                keywords.append(key)
            # print(key + ' - ' + str(value))
            # if i > number:
            #     break
        return keywords
        
        
    def analyze(self, text, 
                candidate_pos=['NOUN', 'PROPN'], 
                window_size=4, lower=False, stopwords=list()):
        """Main function to analyze text"""
        
        # Set stop words
        self.set_stopwords(stopwords)
        
        # Pare text by spaCy
        doc = nlp(text)
        
        # Filter sentences
        sentences = self.sentence_segment(doc, candidate_pos, lower) # list of list of words
        
        # Build vocabulary
        vocab = self.get_vocab(sentences)
        
        # Get token_pairs from windows
        token_pairs = self.get_token_pairs(window_size, sentences)
        
        # Get normalized matrix
        g = self.get_matrix(vocab, token_pairs)
        
        # Initionlization for weight(pagerank value)
        pr = np.array([1] * len(vocab))
        
        # Iteration
        previous_pr = 0
        for epoch in range(self.steps):
            pr = (1-self.d) + self.d * np.dot(g, pr)
            if abs(previous_pr - sum(pr))  < self.min_diff:
                break
            else:
                previous_pr = sum(pr)

        # Get weight for each node
        node_weight = dict()
        for word, index in vocab.items():
            node_weight[word] = pr[index]
        
        self.node_weight = node_weight

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')

[38;5;1m✘ Link 'en' already exists[0m
To overwrite an existing link, use the --force flag



In [19]:
#get scores
def scores(transcript_keywords, summary_keywords):
    total_keywords = len(transcript_keywords)
    total_summary_keywords = len(summary_keywords)
#     print(total_keywords, total_summary_keywords)
    correct_keywords = 0

    for keyword in summary_keywords:
        if keyword in transcript_keywords:
            correct_keywords += 1

    
    precision = 0
    recall = 0
    f1 = 0
    if total_keywords == 0:
        return {'precision':precision, 'recall':recall, 'f1':f1}
    
    if total_summary_keywords > 0:
        precision = correct_keywords / total_summary_keywords
        recall = correct_keywords / total_keywords
    
    if precision > 0 and recall > 0:
        f1 = (2 *(recall * precision)) / (recall + precision)
    
    return {'precision':precision, 'recall':recall, 'f1':f1}

In [20]:
tr4w = TextRank4Keyword()
def compute_scores(transcript, summary):
    tr4w.analyze(transcript, candidate_pos = ['NOUN','PROPN'], window_size=4, lower=True)
    transcript_keywords = tr4w.get_keywords()
    tr4w.analyze(summary, candidate_pos = ['NOUN','PROPN'], window_size=4, lower=True)
    summary_keywords = tr4w.get_keywords()
    results = scores(transcript_keywords, summary_keywords)
    print(results)
    return results

In [21]:
dataset['episode_description'] = dataset.apply(lambda row: clean_up(row['episode_description']), axis=1)

In [22]:
dataset = dataset[60000:]
dataset['scores'] =  dataset.apply(lambda row: compute_scores(row['transcript'] ,row['episode_description']), axis=1)
dataset.to_csv('train_data_with_precision-6.csv')
!gsutil -m cp -r /home/jupyter/train_data_with_precision-6.csv gs://{bucket_name}/ 

{'precision': 0.6, 'recall': 0.0058823529411764705, 'f1': 0.011650485436893204}
{'precision': 0.8235294117647058, 'recall': 0.03111111111111111, 'f1': 0.05995717344753747}
{'precision': 0.5714285714285714, 'recall': 0.019834710743801654, 'f1': 0.03833865814696486}
{'precision': 0.7222222222222222, 'recall': 0.025896414342629483, 'f1': 0.05000000000000001}
{'precision': 0.5238095238095238, 'recall': 0.019400352733686066, 'f1': 0.037414965986394565}
{'precision': 0.7857142857142857, 'recall': 0.02018348623853211, 'f1': 0.03935599284436494}
{'precision': 0.8125, 'recall': 0.021311475409836064, 'f1': 0.04153354632587859}
{'precision': 0.875, 'recall': 0.023769100169779286, 'f1': 0.04628099173553719}
{'precision': 0.6666666666666666, 'recall': 0.024205748865355523, 'f1': 0.04671532846715328}
{'precision': 0.7, 'recall': 0.0166270783847981, 'f1': 0.03248259860788863}
{'precision': 0.8, 'recall': 0.02385685884691849, 'f1': 0.04633204633204634}
{'precision': 0.7692307692307693, 'recall': 0.018



{'precision': 0, 'recall': 0, 'f1': 0}
{'precision': 0.5, 'recall': 0.024282560706401765, 'f1': 0.046315789473684206}
{'precision': 0.4117647058823529, 'recall': 0.014084507042253521, 'f1': 0.027237354085603113}
{'precision': 0.6666666666666666, 'recall': 0.02058319039451115, 'f1': 0.03993344425956739}
{'precision': 0.8888888888888888, 'recall': 0.026845637583892617, 'f1': 0.05211726384364821}
{'precision': 0.8571428571428571, 'recall': 0.019801980198019802, 'f1': 0.03870967741935484}
{'precision': 0.6363636363636364, 'recall': 0.010638297872340425, 'f1': 0.020926756352765325}
{'precision': 1.0, 'recall': 0.02066115702479339, 'f1': 0.04048582995951417}
{'precision': 0.45454545454545453, 'recall': 0.007112375533428165, 'f1': 0.014005602240896359}
{'precision': 0.5483870967741935, 'recall': 0.028145695364238412, 'f1': 0.05354330708661418}
{'precision': 0.5714285714285714, 'recall': 0.022770398481973434, 'f1': 0.0437956204379562}
{'precision': 0.8235294117647058, 'recall': 0.0238500851788

In [23]:
!gsutil -m cp -r /home/jupyter/train_data_with_precision-6.csv gs://spotify_asr_dataset/ 

Copying file:///home/jupyter/train_data_with_precision-6.csv [Content-Type=text/csv]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

| [1/1 files][186.7 MiB/186.7 MiB] 100% Done                                    
Operation completed over 1 objects/186.7 MiB.                                    
