In [None]:
import pandas as pd
import numpy as np
import string
import csv

In [None]:
import os

# Modify the line below: replace PLACEHOLDER_APPLICATION_CREDENTIALS with the path to your
# Google credentials JSON file. The path is exported through the GOOGLE_APPLICATION_CREDENTIALS
# environment variable for the Google Cloud clients created later in this notebook.
# How to generate such a JSON file: https://cloud.google.com/docs/authentication/getting-started#command-line
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = PLACEHOLDER_APPLICATION_CREDENTIALS

In [None]:
from google.cloud import storage

# in this notebook we need two Google Cloud Storage buckets: https://cloud.google.com/storage/docs/creating-buckets
# in the first bucket we have the video subtitles that we extracted from the videos
# in the second bucket we store the named entities extracted from the video subtitles

# Replace all PLACEHOLDER_ variables with the variables you have created.
def list_blobs_with_prefix(bucket_name, prefix, delimiter):
    """Return the ``gs://`` URIs of the objects in a bucket that match a prefix.

    Parameters
    ----------
    bucket_name : name of the Google Cloud Storage bucket to list.
    prefix : object-name prefix forwarded to ``Bucket.list_blobs``.
    delimiter : delimiter forwarded to ``Bucket.list_blobs``.

    Returns
    -------
    list of str
        One ``gs://<bucket_name>/<object name>`` entry per listed object whose
        name does not end with "/".
    """
    gcs_bucket = storage.Client().get_bucket(bucket_name)
    uri_root = "gs://" + bucket_name + "/"

    # Names ending in "/" are skipped, so only real objects are returned.
    return [
        uri_root + listed.name
        for listed in gcs_bucket.list_blobs(prefix=prefix, delimiter=delimiter)
        if not listed.name.endswith("/")
    ]
            
# gs:// URIs of the subtitle files in the subtitles bucket that match the prefix;
# the entity-extraction cell below iterates over this list.
videos = list_blobs_with_prefix(PLACEHOLDER_BUCKET_NAME_VIDEO_SUBTITLES, PLACEHOLDER_PREFIX, PLACEHOLDER_DELIMITER)
            

In [None]:
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types
from google.cloud import storage

import sys
import six

# Replace every PLACEHOLDER_ name with the value you created for your own project.
storage_client = storage.Client()

# Bucket the subtitle CSV files are read from.
gcloud_bucket_name_read = PLACEHOLDER_BUCKET_NAME_VIDEO_SUBTITLES
bucket_read = storage_client.get_bucket(gcloud_bucket_name_read)

# Bucket the extracted named entities are written to.
gcloud_bucket_name_write = PLACEHOLDER_BUCKET_NAME_VIDEO_SUBTITLES_ENTITIES
bucket_write = storage_client.get_bucket(gcloud_bucket_name_write)

# Natural Language API client used for the entity-sentiment analysis.
client = language.LanguageServiceClient()

# Header row of every named-entity CSV file produced below.
names = [
    "storageLink",
    "entity",
    "entity_type",
    "mid",
    "wikipedia_url",
    "entity_mention",
    "begin_offset",
    "mention_sentiment_magnitude",
    "mention_sentiment_score",
    "mention_type",
    "entity_salience",
    "entity_sentiment_magnitude",
    "entity_sentiment_score",
]


# Local folder that receives one named-entity CSV per subtitle file before it is uploaded.
ENTITY_OUTPUT_DIR = "../data/named_entities_subtitles/"

# Detect and send native Python encoding to receive correct word offsets.
# The choice does not depend on the video, so it is made once.
encoding = enums.EncodingType.UTF32
if sys.maxunicode == 65535:
    encoding = enums.EncodingType.UTF16

for video in videos:

    # video has the form gs://<bucket>/<object name>. The object name may itself
    # contain "/" (folder prefixes), so everything after the bucket is kept for
    # the bucket operations while only the last component names the local file.
    videoComp = video.split("/")
    file_name = videoComp[-1]
    blob_name = "/".join(videoComp[3:])
    local_path = ENTITY_OUTPUT_DIR + file_name

    # PLACEHOLDER_WORKING_FILE refers to local file that is only used for processing purposes
    bucket_read.blob(blob_name).download_to_filename(PLACEHOLDER_WORKING_FILE)
    subtitles = pd.read_csv(PLACEHOLDER_WORKING_FILE)

    # Keep one row per (storageLink, transcript, transcript_part) and rebuild
    # the full text by joining the transcripts in transcript_part order.
    df = subtitles.groupby(by=['storageLink', 'transcript', 'transcript_part'], as_index=False).first()
    df = df[["storageLink", "transcript", "transcript_part", "start_time", "end_time"]]
    df = df.sort_values(["transcript_part"])

    text = " ".join(list(df["transcript"]))
    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    document = types.Document(content=text.encode('utf-8'), type=enums.Document.Type.PLAIN_TEXT)
    result = client.analyze_entity_sentiment(document, encoding)

    # The output file is opened once per video (not once per mention).
    # newline="" is what the csv module requires to avoid doubled line endings,
    # and UTF-8 matches what pandas expects when the file is read back later.
    with open(local_path, "w", newline="", encoding="utf-8") as outSentences:
        writer = csv.writer(outSentences)
        writer.writerow(names)

        for entity in result.entities:
            entity_type = enums.Entity.Type(entity.type)
            for mention in entity.mentions:
                mention_type = enums.EntityMention.Type(mention.type)
                writer.writerow([
                    # storageLink: the file name without its last 8 characters
                    # (presumably a fixed file-name suffix -- confirm).
                    file_name[0:-8],
                    entity.name,
                    entity_type.name,
                    entity.metadata.get('mid', ''),
                    entity.metadata.get('wikipedia_url', ''),
                    mention.text.content,
                    mention.text.begin_offset,
                    mention.sentiment.magnitude,
                    mention.sentiment.score,
                    mention_type.name,
                    entity.salience,
                    entity.sentiment.magnitude,
                    entity.sentiment.score,
                ])

    # The file is closed at this point, so the complete content is uploaded.
    bucket_write.blob(blob_name).upload_from_filename(local_path)



In [None]:
import nltk

# Download the NLTK resources this notebook relies on (tokenizer, POS tagger,
# WordNet and the stop-word list).
for nltk_resource in ('stopwords', 'punkt', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(nltk_resource)

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

def nltk2wn_tag(nltk_tag):
    """Translate an NLTK part-of-speech tag into the matching WordNet POS constant.

    The decision is made on the first character of ``nltk_tag``:
    ``J`` -> adjective, ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb.
    Any other tag (including an empty one) yields ``None``.
    """
    pos_by_initial = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return pos_by_initial.get(nltk_tag[:1])

def lemmatize_keywords(keyword):
    """Lemmatize every token of ``keyword`` and return the lemmas joined by spaces.

    ``keyword`` is converted to ``str``, tokenized and POS-tagged with NLTK.
    Each token is looked up with ``wordnet._morphy`` using the WordNet POS
    derived from its tag; tokens whose tag has no WordNet equivalent are
    looked up as nouns. When the lookup returns nothing the token is kept
    unchanged, when it returns one candidate that candidate is used, and when
    it returns several candidates the second one is used.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(str(keyword)))

    lemmas = []
    for raw_token, treebank_tag in tagged_tokens:
        token = str(raw_token)

        # Fall back to the noun reading when the tag maps to no WordNet POS.
        wn_pos = nltk2wn_tag(treebank_tag)
        if wn_pos is None:
            wn_pos = wordnet.NOUN

        candidates = wordnet._morphy(token, wn_pos)
        if candidates == []:
            lemmas.append(token)
        elif len(candidates) == 1:
            lemmas.append(str(candidates[0]))
        else:
            lemmas.append(str(candidates[1]))

    return " ".join(lemmas)

In [None]:
# This section of the notebook enriches the type of each named entity by looking it up in several knowledge bases:
#  - Wikidata type (looked up through the Google Knowledge Graph Search API)
#  - DBpedia type
#  - WordNet type
#
# The files are also updated with the lemma of each named entity and the lowercase lemma of each named entity.


from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types
from google.cloud import storage

import sys
import six


# Python client calling the Knowledge Graph Search API.
import json
import urllib.parse
# urllib.request is used for the Knowledge Graph HTTP call below, so it is
# imported explicitly; "import urllib.parse" alone does not guarantee that the
# urllib.request submodule is loaded.
import urllib.request

# DBpedia SPARQL endpoint, queried below for the rdf:type values of each entity

from SPARQLWrapper import SPARQLWrapper, JSON

sparql = SPARQLWrapper("http://dbpedia.org/sparql")


# Replace all PLACEHOLDER_ variables with the variables you have created.
api_key = PLACEHOLDER_API_KEY_KGSEARCH
service_url = 'https://kgsearch.googleapis.com/v1/entities:search'
storage_client = storage.Client()

# The entity files are read from and written back to the same bucket.
gcloud_bucket_name_read = PLACEHOLDER_BUCKET_NAME_VIDEO_SUBTITLES_ENTITIES
bucket_read = storage_client.get_bucket(gcloud_bucket_name_read)

gcloud_bucket_name_write = PLACEHOLDER_BUCKET_NAME_VIDEO_SUBTITLES_ENTITIES
bucket_write = storage_client.get_bucket(gcloud_bucket_name_write)

# List the files of the bucket that is read below. Reusing the variable avoids
# a second, differently spelled placeholder name.
video_entities = list_blobs_with_prefix(gcloud_bucket_name_read, PLACEHOLDER_PREFIX, PLACEHOLDER_DELIMITER)

client = language.LanguageServiceClient()

# WordNet lexicographer file names mapped to each coarse entity type.
person = ["noun.person"]
organization = ["noun.group"]
location = ["noun.location"]
event = ["noun.event"]

# Columns added to every entity file, in the order they are appended.
ENRICHMENT_COLUMNS = [
    "entity_lemma",
    "entity_lemma_lower",
    "dbpedia_type",
    "wikidata_type",
    "all_dbpedia_types",
    "all_wikidata_types",
    "WordNet Type",
]

# (marker, coarse label) pairs used to collapse knowledge-base types.
KG_TYPE_LABELS = [
    ("Event", "EVENT"),
    ("Organization", "ORGANIZATION"),
    ("Place", "LOCATION"),
    ("Person", "PERSON"),
]
DBPEDIA_TYPE_LABELS = KG_TYPE_LABELS + [("Organisation", "ORGANIZATION")]


def _fetch_kg_types(entity):
    """Return the '@type' values of the Knowledge Graph hits whose name equals ``entity`` (case-insensitive)."""
    params = {
        'query': entity,
        'limit': 5,
        'indent': True,
        'key': api_key,
    }
    url = service_url + '?' + urllib.parse.urlencode(params)
    # The context manager closes the HTTP response once it has been read.
    with urllib.request.urlopen(url) as http_response:
        response = json.loads(http_response.read())

    kg_types = []
    for element in response.get('itemListElement', []):
        result = element['result']
        if 'name' in result and result['name'].lower() == entity.lower():
            kg_types.extend(result.get('@type', []))
    return kg_types


def _kg_labels(kg_types):
    """Collapse Knowledge Graph types to sorted coarse labels (exact type-name match); ["OTHER"] if none apply."""
    labels = {label for marker, label in KG_TYPE_LABELS if marker in kg_types}
    return sorted(labels) if labels else ["OTHER"]


def _fetch_dbpedia_types(entity):
    """Return the rdf:type values of the entity's DBpedia resource whose URI contains 'ontology'."""
    # NOTE(review): str.capitalize() lower-cases everything after the first
    # character, so "New York" is looked up as .../New_york -- confirm this is
    # intended. The original behaviour is kept here.
    page = "http://dbpedia.org/resource/" + entity.capitalize().replace(" ", "_")

    sparql.setQuery("""
        PREFIX dbo: <http://dbpedia.org/ontology/>
        PREFIX dbp: <http://dbpedia.org/resource/>
        PREFIX foaf: <http://xmlns.com/foaf/0.1/>

        SELECT ?types where {
            <""" + page + """> rdf:type ?types .
        }
    """)
    results = sparql.query().convert()

    return [
        binding["types"]["value"]
        for binding in results["results"]["bindings"]
        if 'ontology' in binding["types"]["value"]
    ]


def _dbpedia_labels(ontology_uris):
    """Collapse DBpedia ontology URIs to sorted coarse labels (substring match); ["OTHER"] if none apply."""
    labels = {
        label
        for marker, label in DBPEDIA_TYPE_LABELS
        if any(marker in uri for uri in ontology_uris)
    }
    return sorted(labels) if labels else ["OTHER"]


def _wordnet_labels(entity):
    """Return the coarse labels of the entity's WordNet synsets, in order of first appearance."""
    lexname_groups = (
        (person, "PERSON"),
        (organization, "ORGANIZATION"),
        (location, "LOCATION"),
        (event, "EVENT"),
    )
    labels = []
    for synset in wordnet.synsets(entity.replace(" ", "_")):
        lexname = synset.lexname()
        for lexnames, label in lexname_groups:
            if lexname in lexnames and label not in labels:
                labels.append(label)
    return labels


# The return format never changes, so it is configured once instead of per entity.
sparql.setReturnFormat(JSON)

for video in video_entities:

    # video has the form gs://<bucket>/<object name>; keep the whole object
    # name so that files stored under a folder prefix can be read and updated.
    videoComp = video.split("/")
    blob_name = "/".join(videoComp[3:])

    bucket_read.blob(blob_name).download_to_filename(PLACEHOLDER_WORKING_FILE)

    file = pd.read_csv(PLACEHOLDER_WORKING_FILE)

    # Values are collected per column and assigned in one step at the end.
    # Chained assignment (file[col].iloc[i] = ...) triggers
    # SettingWithCopyWarning and does not modify the frame when pandas
    # Copy-on-Write is active.
    enriched = {column: [] for column in ENRICHMENT_COLUMNS}

    for entity in file["entity"]:
        enriched["entity_lemma"].append(lemmatize_keywords(entity))
        enriched["entity_lemma_lower"].append(lemmatize_keywords(entity.lower()))

        # Knowledge Graph lookup. Sorting makes the column content
        # reproducible, because set iteration order varies between runs.
        kg_types = _fetch_kg_types(entity)
        enriched["wikidata_type"].append(", ".join(_kg_labels(kg_types)))
        enriched["all_wikidata_types"].append(
            ", ".join(sorted(set(kg_types))) if kg_types else "OTHER"
        )

        # DBpedia lookup.
        ontology_uris = _fetch_dbpedia_types(entity)
        enriched["all_dbpedia_types"].append(", ".join(sorted(set(ontology_uris))))
        enriched["dbpedia_type"].append(", ".join(_dbpedia_labels(ontology_uris)))

        # WordNet lookup; the value is an empty string when no synset matches.
        enriched["WordNet Type"].append(", ".join(_wordnet_labels(entity)))

    for column in ENRICHMENT_COLUMNS:
        file[column] = enriched[column]

    file.to_csv(PLACEHOLDER_WORKING_FILE, index=False)

    bucket_write.blob(blob_name).upload_from_filename(PLACEHOLDER_WORKING_FILE)
    