In [None]:
import os

# API key passed as `developerKey` when building the Natural Language discovery
# client. It is read from the GOOGLE_API_KEY environment variable so the real key
# never has to be saved inside the notebook; the original "####" placeholder is
# kept as the fallback when the variable is not set.
APIKEY = os.environ.get("GOOGLE_API_KEY", "####")

In [None]:
# The class in this notebook uses the `speech.types` / `speech.enums` and
# `language.types` / `language.enums` submodules, which were removed in the 2.0
# releases of these client libraries, so upgrade only within the 1.x series.
!pip install --upgrade google-api-python-client
!pip install --upgrade "google-cloud-speech<2.0"
!pip install --upgrade "google-cloud-language<2.0"

In [None]:
import io
import os
import sys
# Imports the Google Cloud client library
from google.cloud import speech

from googleapiclient.discovery import build
from google.cloud import language

import six
import logging
import os
import re
import pandas as pd
from google.datalab import storage
from datalab.context import Context


In [None]:
class Audio_transformation():
    """Transcribe audio objects from a Cloud Storage bucket and analyse the text.

    Typical flow: collect object keys with `get_files_by_prefix`, transcribe them
    with `get_transcript`, then write CSV reports with one of the
    `generate_analysis_files_*` methods.

    Attributes:
        file_keys: object keys selected so far, in selection order.
        transcripts: one list of transcript strings per entry of `file_keys`
            (same order), filled by `get_transcript`.
        dir: the `gs://<bucket_name>` URI prefix used to address the objects.
    """

    # A quote is analysed only when it has MORE than this many
    # whitespace-separated words.
    MIN_WORDS = 20

    # Names of the integer Entity.Type values of the Natural Language API.
    # Value 8 is not defined by the API; unmapped values fall back to 'UNKNOWN'.
    _ENTITY_TYPE_NAMES = {
        0: 'UNKNOWN',
        1: 'PERSON',
        2: 'LOCATION',
        3: 'ORGANIZATION',
        4: 'EVENT',
        5: 'WORK_OF_ART',
        6: 'CONSUMER_GOOD',
        7: 'OTHER',
        9: 'PHONE_NUMBER',
        10: 'ADDRESS',
        11: 'DATE',
        12: 'NUMBER',
        13: 'PRICE',
    }

    def __init__(self, bucket_name):
        """Create the API clients and bind to the bucket called `bucket_name`."""
        # Client-library clients for speech recognition and text analysis.
        self.speech_client = speech.SpeechClient()
        self.language_client = language.LanguageServiceClient()
        # Handle on the bucket whose objects will be listed.
        self.bucket = storage.Bucket(bucket_name)
        # Discovery-based client, used for the sentiment (magnitude/polarity) calls.
        self.lservice = build('language', 'v1beta1', developerKey=APIKEY)
        self.file_keys = []
        self.transcripts = []
        # Cloud Storage URI prefix of the bucket.
        self.dir = "gs://" + bucket_name

    # ------------------------------------------------------------------ helpers

    def _long_quotes(self, quotes):
        """Return the quotes that have more than `MIN_WORDS` words."""
        return [q for q in quotes if len(q.split()) > self.MIN_WORDS]

    def _apply_to_long_quotes(self, quotes, func):
        """Apply `func` to every long quote; return a Series indexed by the quote text."""
        texts = self._long_quotes(quotes)
        return pd.Series(texts, index=texts).apply(func)

    def _document_sentiment(self, quote):
        """Call analyzeSentiment for `quote` and return its 'documentSentiment' dict."""
        response = self.lservice.documents().analyzeSentiment(
            body={
                'document': {
                    'type': 'PLAIN_TEXT',
                    'content': quote
                }
            }).execute()
        return response['documentSentiment']

    @staticmethod
    def _plain_text_document(text):
        """Build a PLAIN_TEXT `Document` from `text` (str or UTF-8 encoded bytes)."""
        if isinstance(text, six.binary_type):
            text = text.decode('utf-8')
        return language.types.Document(
            content=text.encode('utf-8'),
            type=language.enums.Document.Type.PLAIN_TEXT)

    @classmethod
    def _entity_type_name(cls, type_id):
        """Map an integer entity type to its name; unmapped values give 'UNKNOWN'."""
        return cls._ENTITY_TYPE_NAMES.get(int(type_id), 'UNKNOWN')

    @staticmethod
    def _file_tag(key):
        """Return the part of an object key that is embedded in report file names."""
        # NOTE(review): assumes every key has a fixed layout in which characters
        # 24..29 identify the audio file - confirm against the bucket's naming.
        return key[24:30]

    # --------------------------------------------------------------- collection

    def get_files_by_prefix(self, prefix):
        """Append to `file_keys` every bucket object key matching `prefix`.

        `prefix` is compiled as a regular expression and matched at the start of
        each key (`re.match`), so regex metacharacters in it are significant.
        Repeated calls keep extending `file_keys`.
        """
        pattern = re.compile(prefix)
        for obj in self.bucket.objects():
            if pattern.match(obj.key):
                self.file_keys.append(obj.key)

    def get_transcript(self, sample_rate_hertz=8000, language_code='en-US', timeout=90):
        """Transcribe the collected audio files into `transcripts`.

        Only keys that do not have a transcript yet are processed, so that
        `transcripts[i]` always belongs to `file_keys[i]` even when this method
        is called again after more keys were collected.

        Args:
            sample_rate_hertz: sample rate passed to the recognition config.
            language_code: language code passed to the recognition config.
            timeout: seconds to wait for each long-running recognition result.
        """
        config = speech.types.RecognitionConfig(
            encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=sample_rate_hertz,
            language_code=language_code)

        for key in self.file_keys[len(self.transcripts):]:
            audio = speech.types.RecognitionAudio(uri=self.dir + '/' + key)
            operation = self.speech_client.long_running_recognize(config, audio)
            response = operation.result(timeout=timeout)
            # Keep the first alternative of every result.
            quotes = [result.alternatives[0].transcript for result in response.results]
            self.transcripts.append(quotes)

    # ----------------------------------------------------------------- analysis

    def get_cat(self, text):
        """Return the list of category names the API assigns to `text`."""
        document = self._plain_text_document(text)
        categories = self.language_client.classify_text(document).categories
        return [category.name for category in categories]

    def get_Category(self, quotes):
        """Return a Series (indexed by quote) of category lists for the long quotes."""
        return self._apply_to_long_quotes(quotes, self.get_cat)

    def get_mag(self, quote):
        """Return the document sentiment 'magnitude' of `quote`."""
        return self._document_sentiment(quote)['magnitude']

    def get_Magnitude(self, quotes):
        """Return a Series (indexed by quote) of magnitudes for the long quotes."""
        return self._apply_to_long_quotes(quotes, self.get_mag)

    def get_pol(self, quote):
        """Return the document sentiment 'polarity' of `quote`."""
        return self._document_sentiment(quote)['polarity']

    def get_Polarity(self, quotes):
        """Return a Series (indexed by quote) of polarities for the long quotes."""
        return self._apply_to_long_quotes(quotes, self.get_pol)

    def get_ent(self, text):
        """Return the set of `(entity name, entity type name)` pairs found in `text`."""
        document = self._plain_text_document(text)

        # Send the interpreter's native encoding so returned word offsets are correct.
        encoding = language.enums.EncodingType.UTF32
        if sys.maxunicode == 65535:
            encoding = language.enums.EncodingType.UTF16

        result = self.language_client.analyze_entity_sentiment(document, encoding)
        return set((entity.name, self._entity_type_name(entity.type))
                   for entity in result.entities)

    def get_Entity(self, quotes):
        """Return a Series (indexed by quote) of entity sets for the long quotes."""
        return self._apply_to_long_quotes(quotes, self.get_ent)

    # ------------------------------------------------------------------ reports

    def generate_analysis_files_per_corpus(self, Category=True, Magnitude=True, Polarity=True, Entity=True):
        """Write one CSV per transcript with one row per quote.

        Each enabled flag adds a column. Quotes that are too short to be analysed
        stay in the index and get empty cells, because the analysis Series only
        carry the long quotes. Files are named
        'Per_corpus_analysis<tag>.csv' in the working directory.

        Returns:
            0 when every flag is off (nothing is written), otherwise None.
        """
        if not (Category or Magnitude or Polarity or Entity):
            return 0

        for i, quotes in enumerate(self.transcripts):
            df = pd.DataFrame(index=quotes)
            if Category:
                df["Categories"] = self.get_Category(quotes)
            if Magnitude:
                df['Magnitude'] = self.get_Magnitude(quotes)
            if Polarity:
                df['Polarity'] = self.get_Polarity(quotes)
            if Entity:
                df['Entity'] = self.get_Entity(quotes)

            name_of_file = 'Per_corpus_analysis' + self._file_tag(self.file_keys[i]) + '.csv'
            df.to_csv(name_of_file)

    def generate_analysis_files_per_Audiofile(self, Category=True, Magnitude=True, Polarity=True, Entity=True):
        """Write one single-row CSV per transcript, analysing its long quotes joined together.

        Each enabled flag adds a column. Files are named
        'full_Audio_analysis<tag>.csv' in the working directory.

        Returns:
            0 when every flag is off (nothing is written), otherwise None.
        """
        if not (Category or Magnitude or Polarity or Entity):
            return 0

        for i, quotes in enumerate(self.transcripts):
            text = ' '.join(self._long_quotes(quotes))
            df = pd.DataFrame(index=[text])
            if Category:
                # Store the whole category list in the single row; assigning the
                # bare list would fail unless it had exactly one element.
                df["Categories"] = pd.Series([self.get_cat(text)], index=df.index)
            if Magnitude:
                df['Magnitude'] = self.get_mag(text)
            if Polarity:
                df['Polarity'] = self.get_pol(text)
            if Entity:
                # Wrapped in a list so the whole set lands in the single row.
                df['Entity'] = [self.get_ent(text)]

            name_of_file = 'full_Audio_analysis' + self._file_tag(self.file_keys[i]) + '.csv'
            df.to_csv(name_of_file)
      
      
      
      
      
    
      
      