---

<p><img src="https://upload.wikimedia.org/wikipedia/commons/thumb/5/50/Oracle_logo.svg/2560px-Oracle_logo.svg.png" width="200" align = "left"></p>

# **<h1 align ="middle"><b> Oracle CloudWorld - Las Vegas</b></h1>**

### **<h1 align ="middle"><b> Use case 1. Person Detection in Video</b></h1>**
### **<h1 align ="middle"><b> Use case 2. Offensive Language Detection in Video</b></h1>**

---

#### This notebook lists every individual .py file that makes up the job, one section per file

# **1. Video_only.py**

## **1.1 Script video_only.py**

In [3]:
%%writefile /home/datascience/ocw_las_vegas/job_artifacts/sub_packages/video_only.py
import os
import numpy as np
import pandas as pd 
from deepface import DeepFace
import uuid
import glob      
import ocifs
import base64
import io
import matplotlib.pyplot as plt
import cv2
import fsspec
from PIL import Image
from io import BytesIO
from pytube import YouTube
import numpy as np
import pandas as pd 
import oci
import json
import ocifs
import time
from oci.object_storage import ObjectStorageClient
from oci.ai_language import AIServiceLanguageClient
from oci.ai_language.models import DetectLanguageKeyPhrasesDetails
from oci.ai_language.models import DetectLanguageSentimentsDetails
from oci.ai_speech import AIServiceSpeechClient
from oci.ai_speech.models import TranscriptionModelDetails
from oci.ai_speech.models import ObjectLocation
from oci.ai_speech.models import ObjectListInlineInputLocation
from oci.ai_speech.models import OutputLocation
from oci.ai_speech.models import CreateTranscriptionJobDetails


##########################################################################################################################################
##########################################################################################################################################
######################################################## Use case 1          #############################################################
##########################################################################################################################################
##########################################################################################################################################

##########################################################################################################################################
######################################################## Function 1          #############################################################
##########################################################################################################################################

def input_youtube_video(YOUTUBE_URL):
    """Download a YouTube video as an .mp4 file into a freshly emptied local folder.

    Args:
        YOUTUBE_URL: URL of the YouTube video to download.

    Returns:
        The local path of the downloaded file, as returned by ``stream.download``.

    Raises:
        RuntimeError: if YouTube offers no progressive stream for the video.
    """
    import shutil  # local import so this block does not depend on the file header

    # local directory that stores the video
    path_input_locally = "/home/datascience/youtube_videos/"

    # delete videos of previous runs; a missing folder is not an error
    shutil.rmtree(path_input_locally, ignore_errors=True)

    try:
        os.makedirs(path_input_locally, exist_ok=True)
    except OSError:
        print('Error: Creating directory for youtube video locally')

    # look up the video on YouTube
    yt = YouTube(YOUTUBE_URL)

    # itag 22 is the preferred stream, but get_by_itag returns None when the
    # video does not offer it; fall back to the best progressive stream
    # instead of crashing on None.download(...)
    stream = yt.streams.get_by_itag(22)
    if stream is None:
        stream = yt.streams.get_highest_resolution()
    if stream is None:
        raise RuntimeError("No downloadable video stream found for " + str(YOUTUBE_URL))

    # store in local folder under a random name so runs never collide
    file_name_random = str(uuid.uuid4())
    file_location_local = stream.download(output_path=path_input_locally, filename=file_name_random + ".mp4")

    print("Youtube download completed and stored in " + str(file_location_local))

    return file_location_local

##########################################################################################################################################
######################################################## Function 2          #############################################################
##########################################################################################################################################

def input_profile_image(full_bucket_name):
    """Fetch the profile image (.jpg) from Object Storage and remove it from the bucket.

    Args:
        full_bucket_name: ``oci://bucket@namespace/folder/`` URI of the folder
            holding the profile image. A missing trailing slash is added,
            because the glob pattern ``*.jpg`` is appended to this value.

    Returns:
        Tuple ``(profile_image_name, profile_image_loc)``: the file name and
        the full local path of the downloaded image.

    Raises:
        FileNotFoundError: if no .jpg file was downloaded from the bucket.
    """
    # local directory that stores the image
    path_input_locally_image = "/home/datascience/profile_image/"

    try:
        os.makedirs(path_input_locally_image, exist_ok=True)
    except OSError:
        print('Error: Creating directory for profile image locally')

    # Remove images left behind by earlier runs; otherwise an old image could
    # be picked below and the wrong object deleted from the bucket.
    for stale_image in glob.glob(path_input_locally_image + '*.jpg'):
        os.remove(stale_image)

    print("Full bucket name is " + full_bucket_name)

    # "folder" + "*.jpg" would match "folder*.jpg" instead of "folder/*.jpg"
    bucket_prefix = full_bucket_name if full_bucket_name.endswith("/") else full_bucket_name + "/"

    # get the image from the bucket and store locally
    fs = ocifs.OCIFileSystem()
    fs.invalidate_cache(full_bucket_name)
    fs.get((bucket_prefix + "*.jpg"), path_input_locally_image, recursive=True, refresh=True)

    # get file name; sorted so the choice is deterministic when several exist
    image_names = sorted(os.path.basename(x) for x in glob.glob(path_input_locally_image + '*.jpg'))
    if not image_names:
        raise FileNotFoundError("No .jpg profile image found in " + full_bucket_name)

    profile_image_name = image_names[0]
    profile_image_loc = path_input_locally_image + profile_image_name
    print("Profile image is stored locally at " + profile_image_loc)

    # delete input image from bucket to clear the bucket
    delete_object = os.path.join(bucket_prefix, profile_image_name)
    fs.rm(delete_object, recursive=True)

    print("Image stored locally and removed from bucket")

    return profile_image_name, profile_image_loc


##########################################################################################################################################
######################################################## Predict             #############################################################
##########################################################################################################################################

def _reset_split_images_folder(path_split_images):
    """Make sure the frame folder exists and holds nothing from an earlier run."""
    try:
        os.makedirs(path_split_images, exist_ok=True)
    except OSError:
        print('Error: Creating directory of data for split images')

    # *.pkl is removed too: DeepFace.find may keep a cache of face
    # representations inside db_path (TODO confirm for the installed version),
    # and a cache built from a previous video must not be matched again.
    for pattern in ('*.jpg', '*.pkl'):
        for stale_file in glob.glob(os.path.join(path_split_images, pattern)):
            os.remove(stale_file)


def _split_video_into_frames(video_path, path_split_images, max_samples=300):
    """Save one frame per second of video as frameNNNN.jpg; return (duration_seconds, frames_saved)."""
    cam = cv2.VideoCapture(video_path)
    try:
        # get fps of original video
        fps = cam.get(cv2.CAP_PROP_FPS)
        number_of_frames = cam.get(cv2.CAP_PROP_FRAME_COUNT)
        print("**************************************************************** Original fps in video is " + str(fps))

        if fps and fps > 0:
            # duration of the video in whole seconds
            total_duration_video = round(number_of_frames / fps)
            # stepping by fps keeps "one saved frame == one second" true for
            # 25/60 fps videos as well, not only for 30 fps
            step = max(1, int(round(fps)))
        else:
            # fps is unavailable (e.g. the file could not be opened)
            total_duration_video = 0
            step = 30

        # at most max_samples seconds (300 == 5 minutes), never past the last frame
        stop = step * max_samples + 1
        if number_of_frames > 0:
            stop = min(stop, int(number_of_frames))

        frames_saved = 0
        for frame_number in range(1, stop, step):
            cam.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
            print("Analyze frame number " + str(frame_number))

            # reading from frame
            ret, image = cam.read()
            if not ret:
                continue

            # zero-padded to four digits: frame0000.jpg, frame0001.jpg, ...
            name = path_split_images + '/frame' + f"{frames_saved:04d}" + '.jpg'
            print('Creating...' + name)

            # writing the extracted image
            cv2.imwrite(name, image)
            frames_saved += 1
    finally:
        cam.release()

    return total_duration_video, frames_saved


def predict_video(full_bucket_name, YOUTUBE_URL, SCHEMA_NAME):
    """Check whether the person on the profile image appears in a YouTube video.

    The video is downloaded, cut into one image per second and every image is
    compared with the profile image through ``DeepFace.find``.

    Args:
        full_bucket_name: ``oci://`` URI of the bucket folder with the profile image.
        YOUTUBE_URL: URL of the video to analyze.
        SCHEMA_NAME: not used by this function; kept for the callers.

    Returns:
        Tuple ``(output_in_screen, seconds_in_screen, total_seconds_video_analyzed)``:
        a text verdict, the number of sampled frames in which the person was
        found, and the video duration in seconds.
    """
    path_split_images = "/home/datascience/split_images"
    _reset_split_images_folder(path_split_images)

    ######
    ###### Function 1
    file_location_local = input_youtube_video(YOUTUBE_URL)

    ######
    ###### Function 2
    profile_image_name, profile_image_loc = input_profile_image(full_bucket_name)

    print("Fetching video from " + file_location_local)
    print("Fetching profile image from " + profile_image_loc)

    #####
    ##### Main function video_only
    total_duration_video, frames_saved = _split_video_into_frames(file_location_local, path_split_images)

    # apply DeepFace to the images: first input is the profile image, second
    # is the folder containing the split images
    seconds_in_screen = 0
    try:
        dfs = DeepFace.find(img_path=profile_image_loc, db_path=path_split_images, enforce_detection=False)
    except Exception as error:
        # e.g. no frame could be extracted; counts as "person not found"
        # instead of failing later on an unbound result variable
        print("DeepFace could not compare the images: " + str(error))
    else:
        # the result is indexed with [0] as before; a bare DataFrame is accepted too
        if isinstance(dfs, (list, tuple)):
            output_df = dfs[0] if len(dfs) > 0 else None
        else:
            output_df = dfs
        if output_df is not None:
            # one row per matching image == seconds in screen (one image per second)
            seconds_in_screen = output_df.shape[0]

    ########## calculations
    total_seconds_video_analyzed = total_duration_video

    # in screen calc
    if seconds_in_screen > 0:
        output_in_screen = 'Person was in video'
    else:
        output_in_screen = 'Person was not in video'
    print(output_in_screen)

    print("**************************************************************** Person was or was not in screen is :" + str(output_in_screen))
    print("**************************************************************** Total seconds analyzed of entire video " + str(frames_saved) + " seconds")
    print("**************************************************************** This person was " + str(seconds_in_screen) + " seconds in screen")

    return output_in_screen, seconds_in_screen, total_seconds_video_analyzed


Overwriting /home/datascience/job_artifacts/sub_packages/video_only.py


## **1.2 Test Script video_only.py**

In [None]:
# Test values; they mirror the defaults that main.py reads from the environment
main_bucket_name = "West_BP"
namespace = "frqap2zhtzbe"
YOUTUBE_URL = "https://www.youtube.com/shorts/ugwUcdtygok"
SCHEMA_NAME = "ocw"

# Folder URI including the trailing slash: input_profile_image appends "*.jpg" to it
full_bucket_name = "oci://" + main_bucket_name + "@" + namespace + "/" + SCHEMA_NAME + "/"

# predict_video takes exactly (full_bucket_name, YOUTUBE_URL, SCHEMA_NAME)
output_in_screen, seconds_in_screen, total_seconds_video_analyzed = predict_video(full_bucket_name, YOUTUBE_URL, SCHEMA_NAME)

---

# **2. Audio_only.py**

## **2.1 Script Audio_only.py**

In [5]:
%%writefile /home/datascience/ocw_las_vegas/job_artifacts/sub_packages/audio_only.py
import os
import numpy as np
import pandas as pd 
import uuid
import glob      
import ocifs
import base64
import io
import matplotlib.pyplot as plt
import cv2
import fsspec
from PIL import Image
from io import BytesIO
from ads.model.framework.tensorflow_model import TensorFlowModel
from ads.common.model_metadata import UseCaseType
from ads.common.model_artifact import ModelArtifact
from ads.common.model_export_util import prepare_generic_model
from pytube import YouTube
import oci
import json
import time
from oci.object_storage import ObjectStorageClient
from oci.ai_language import AIServiceLanguageClient
from oci.ai_language.models import DetectLanguageKeyPhrasesDetails
from oci.ai_language.models import DetectLanguageSentimentsDetails
from oci.ai_speech import AIServiceSpeechClient
from oci.ai_speech.models import TranscriptionModelDetails
from oci.ai_speech.models import ObjectLocation
from oci.ai_speech.models import ObjectListInlineInputLocation
from oci.ai_speech.models import OutputLocation
from oci.ai_speech.models import CreateTranscriptionJobDetails

##########################################################################################################################################
######################################################## Function 1          #############################################################
##########################################################################################################################################

def input_youtube_video_audio(YOUTUBE_URL):
    """Download the audio track of a YouTube video into a freshly emptied local folder.

    Args:
        YOUTUBE_URL: URL of the YouTube video whose audio is downloaded.

    Returns:
        The local path of the downloaded file, as returned by ``stream.download``.

    Raises:
        RuntimeError: if YouTube offers no audio stream for the video.
    """
    import shutil  # local import so this block does not depend on the file header

    # local directory that stores the audio
    path_input_locally = "/home/datascience/youtube_videos_audio/"

    # delete downloads of previous runs; a missing folder is not an error
    shutil.rmtree(path_input_locally, ignore_errors=True)

    try:
        os.makedirs(path_input_locally, exist_ok=True)
    except OSError:
        print('Error: Creating directory for youtube audio locally')

    # look up the video on YouTube
    yt = YouTube(YOUTUBE_URL)

    # 139 is audio only; get_by_itag returns None when the video does not
    # offer it, so fall back to the best audio-only stream instead of
    # crashing on None.download(...)
    stream = yt.streams.get_by_itag(139)
    if stream is None:
        stream = yt.streams.get_audio_only()
    if stream is None:
        raise RuntimeError("No downloadable audio stream found for " + str(YOUTUBE_URL))

    # store in local folder under a random name so runs never collide
    file_name_random = str(uuid.uuid4())
    file_location_local_audio = stream.download(output_path=path_input_locally, filename=file_name_random + ".mp4")

    print("Youtube download for audio only completed and stored in " + str(file_location_local_audio))

    return file_location_local_audio

##########################################################################################################################################
######################################################## Function 2          #############################################################
##########################################################################################################################################

def audio_to_object_storage(bucket_name_input, namespace_input, config, name, file_location_local):
    """Upload a local audio file to an Object Storage bucket.

    Args:
        bucket_name_input: name of the target bucket.
        namespace_input: Object Storage namespace of the bucket.
        config: OCI configuration passed to ``ObjectStorageClient``.
        name: object name under which the file is stored in the bucket.
        file_location_local: path of the local file to upload.
    """
    # Object Storage Client
    client_object_storage = ObjectStorageClient(config)

    # Audio to Bucket; the context manager closes the file handle once the
    # upload returns (it was left open before)
    with open(file_location_local, 'rb') as audio_file:
        client_object_storage.put_object(namespace_input, bucket_name_input, name, audio_file, content_type='audio/wav')

##########################################################################################################################################
######################################################## Function 3          #############################################################
##########################################################################################################################################

def run_speech_model(bucket_name_input, namespace_input, compartment_id_input, config, name):
    """Transcribe an audio object with OCI Speech and return the result object name.

    Creates a transcription job for object ``name`` in the given bucket, waits
    until its first task has finished and returns the name of the result file
    that the service wrote back to the same bucket (prefix ``speech_out_``).

    Args:
        bucket_name_input: bucket that holds the audio and receives the output.
        namespace_input: Object Storage namespace of the bucket.
        compartment_id_input: OCID of the compartment in which the job runs.
        config: OCI configuration passed to ``AIServiceSpeechClient``.
        name: object name of the audio file inside the bucket.

    Returns:
        Object name of the transcription result file.

    Raises:
        RuntimeError: if the transcription task ends as FAILED or CANCELED.
        Exception: errors raised by the OCI SDK are printed and re-raised.
    """
    # Instantiate Speech Client
    ai_speech_client = AIServiceSpeechClient(config)

    # Define Parameters for Transcription Jobs
    job_display_name = "Offensive_Language_Detection"
    job_description = "Offensive_Language_Detection"
    output_prefix = "speech_out_"

    # Define Transcription Job - Model, Data, Input, Outputs
    job_model_details = TranscriptionModelDetails(domain="GENERIC", language_code="en-GB")
    job_object_location = ObjectLocation(namespace_name=namespace_input, bucket_name=bucket_name_input, object_names=[name])
    job_input_location = ObjectListInlineInputLocation(location_type="OBJECT_LIST_INLINE_INPUT_LOCATION", object_locations=[job_object_location])
    job_output_location = OutputLocation(namespace_name=namespace_input, bucket_name=bucket_name_input, prefix=output_prefix)

    # Create Transcription Job with details provided above
    transcription_job_details = CreateTranscriptionJobDetails(display_name=job_display_name,
                                                              compartment_id=compartment_id_input,
                                                              description=job_description,
                                                              model_details=job_model_details,
                                                              input_location=job_input_location,
                                                              output_location=job_output_location)

    # Call the AI Speech Service to Create Transcription Job. Without a job
    # there is nothing to wait for, so the error is re-raised instead of
    # being swallowed and surfacing later as an unrelated NameError.
    try:
        transcription_job = ai_speech_client.create_transcription_job(create_transcription_job_details=transcription_job_details)
    except Exception as e:
        print(e)
        raise
    print(transcription_job.data.lifecycle_state)

    # Pause for 3 Seconds to Allow Job to be Accepted
    time.sleep(3)

    job_id = transcription_job.data.id
    failed_states = ('FAILED', 'CANCELED')

    try:
        # Keep checking the first task of the job until it reaches a final state
        while True:
            transcription_tasks = ai_speech_client.list_transcription_tasks(job_id, limit=1)
            items = transcription_tasks.data.items
            # the task list can still be empty right after the job was accepted
            state = items[0].lifecycle_state if items else None

            if state == 'SUCCEEDED':
                break
            if state in failed_states:
                # without this check the loop would wait for SUCCEEDED forever
                raise RuntimeError("Transcription task ended in state " + str(state))

            print('Transcribing in Progress...')
            time.sleep(5)

        # Once Task is Succeeded Extract Task Info
        transcription_task = ai_speech_client.get_transcription_task(job_id, items[0].id)
    except Exception as e:
        print(e)
        raise

    # Extract Results File Name from Task Info Response
    object_name = transcription_task.data.output_location.object_names[0]
    print(state)
    print(object_name)

    return object_name


##########################################################################################################################################
######################################################## Function 4          #############################################################
##########################################################################################################################################

def parse_results(bucket_name_input, namespace_input, config, object_name):
    """Read a Speech result file from Object Storage and return its transcription.

    Args:
        bucket_name_input: bucket that holds the result file.
        namespace_input: Object Storage namespace of the bucket.
        config: OCI configuration passed to ``ObjectStorageClient``.
        object_name: object name of the JSON result file.

    Returns:
        The ``transcription`` value of the first entry under ``transcriptions``.
    """
    storage_client = ObjectStorageClient(config)

    # Download the result file and parse its JSON body
    result_file = storage_client.get_object(namespace_input, bucket_name_input, object_name)
    result_json = json.loads(result_file.data.content.decode())

    # Only the first transcription entry is used
    first_entry = result_json['transcriptions'][0]
    transcription_out = first_entry['transcription']

    print(transcription_out)
    return transcription_out

##########################################################################################################################################
######################################################## Function 5          #############################################################
##########################################################################################################################################

def run_language_models(config, transcription_out):
    """Extract key phrases from a transcription and run sentiment detection on them.

    Args:
        config: OCI configuration passed to ``AIServiceLanguageClient``.
        transcription_out: transcription text to analyze.

    Returns:
        Tuple ``(key_phrases_string, sentiment_results)``: the key phrases as
        one string that starts with ``'input: '`` (cut to 498 characters), and
        a list of ``(aspect text, aspect sentiment)`` tuples.
    """
    language_client = AIServiceLanguageClient(config)

    # Step 1: key phrases of the full transcription
    phrase_request = DetectLanguageKeyPhrasesDetails(text=transcription_out)
    phrase_response = language_client.detect_language_key_phrases(
        detect_language_key_phrases_details=phrase_request)
    phrase_texts = [phrase.text for phrase in phrase_response.data.key_phrases]

    # Join the phrases into one string, each followed by ', ', and keep the
    # first 498 characters for the sentiment call and the database column
    joined_phrases = 'input: ' + ''.join(text + ', ' for text in phrase_texts)
    key_phrases_string = joined_phrases[0:498]

    # Step 2: aspect sentiment of the key phrase string
    sentiment_request = DetectLanguageSentimentsDetails(text=key_phrases_string)
    sentiment_response = language_client.detect_language_sentiments(
        detect_language_sentiments_details=sentiment_request)
    sentiment_results = [(aspect.text, aspect.sentiment) for aspect in sentiment_response.data.aspects]

    return key_phrases_string, sentiment_results


##########################################################################################################################################
######################################################## Predict             #############################################################
##########################################################################################################################################


def predict_audio(YOUTUBE_URL, MAIN_BUCKET_NAME, NAMESPACE_NAME, compartment_id_input):
    """Transcribe the audio of a YouTube video and analyze the spoken text.

    Steps: download the audio, upload it to Object Storage, run the Speech
    transcription, read the result file and run the Language models on it.

    Args:
        YOUTUBE_URL: URL of the video whose audio is analyzed.
        MAIN_BUCKET_NAME: bucket used for the audio upload and the Speech output.
        NAMESPACE_NAME: Object Storage namespace of the bucket.
        compartment_id_input: OCID of the compartment for the transcription job.

    Returns:
        Tuple ``(transcription, key_phrases_string, sentiment_result_string,
        neg_aspects)``; the three strings are cut to at most 498 characters and
        ``neg_aspects`` counts the aspects with sentiment ``'Negative'``.
    """
    # Authenticate against OCI. The relative 'config' file is used in a Job;
    # in a notebook use '/home/datascience/.oci/config' instead.
    config = oci.config.from_file('config', 'DEFAULT')

    # Function 1. Download YouTube video as recording
    audio_path = input_youtube_video_audio(YOUTUBE_URL)

    # Send the audio file to Object Storage under a fixed object name
    name = 'offensive_language.wav'
    audio_to_object_storage(MAIN_BUCKET_NAME, NAMESPACE_NAME, config, name, audio_path)

    # Run the Speech model; returns the object name of the result file
    object_name = run_speech_model(MAIN_BUCKET_NAME, NAMESPACE_NAME, compartment_id_input, config, name)

    # Read the result file and keep the first 498 characters for the output
    transcription_out = parse_results(MAIN_BUCKET_NAME, NAMESPACE_NAME, config, object_name)
    transcription = transcription_out[0:498]

    # Key phrases and sentiment are computed on the untruncated transcription
    key_phrases_string, sentiment_results = run_language_models(config, transcription_out)

    # Aspect texts as one string, each followed by ', ', cut to 498 characters
    aspect_texts = 'input: ' + ''.join(entry[0] + ', ' for entry in sentiment_results)
    sentiment_result_string = aspect_texts[0:498]

    # Count Negative Aspects
    neg_aspects = sum(1 for entry in sentiment_results if entry[1] == 'Negative')

    return transcription, key_phrases_string, sentiment_result_string, neg_aspects

Overwriting /home/datascience/ocw_las_vegas/job_artifacts/sub_packages/audio_only.py


## **2.2 Test Script Audio_only.py**

In [None]:
# The three values below are also set in main.py
main_bucket_name = "West_BP"
namespace = "frqap2zhtzbe"
compartment_id_input = "ocid1.compartment.oc1..aaaaaaaae3n6r6hrjipbap2hojicrsvkzatrtlwvsyrpyjd7wjnw4za3m75q"

# Video used for the test run
# other example: https://www.youtube.com/shorts/Y-PBRyEz4xY
YOUTUBE_URL = "https://www.youtube.com/shorts/05ldl6tfJ78"

transcription, key_phrases_string, sentiment_result_string, neg_aspects = predict_audio(
    YOUTUBE_URL=YOUTUBE_URL,
    MAIN_BUCKET_NAME=main_bucket_name,
    NAMESPACE_NAME=namespace,
    compartment_id_input=compartment_id_input,
)

---

# **3. Main.py**

## **3.1 Script main.py**

In [6]:
%%writefile /home/datascience/ocw_las_vegas/job_artifacts/main.py

### input variables for start of workshop
####################################
#################################### CHANGE THE BELOW PARAMETERS
####################################
####################################

password = ""                            # Database password for the user name
wallet_name = ""                         # The name of the wallet or database, like "DB202112101358", excluding "Wallet_" and excluding ".zip"
compartment_id_input = ""                # OCID of the compartment













####################################
#################################### DO NOT CHANGE THE BELOW.
####################################
####################################

# Directory handed to create_uri as the wallet storage location
wallet_storage_directory = './wallet'           

#imports
import os
import numpy as np
import pandas as pd 
from deepface import DeepFace
import uuid
import glob      
import ocifs
import base64
import io
import matplotlib.pyplot as plt
import cv2
import fsspec
from PIL import Image
from io import BytesIO
from pytube import YouTube
import numpy as np
import pandas as pd 
import oci
import json
import ocifs
import time
from oci.object_storage import ObjectStorageClient
from oci.ai_language import AIServiceLanguageClient
from oci.ai_language.models import DetectLanguageKeyPhrasesDetails
from oci.ai_language.models import DetectLanguageSentimentsDetails
from oci.ai_speech import AIServiceSpeechClient
from oci.ai_speech.models import TranscriptionModelDetails
from oci.ai_speech.models import ObjectLocation
from oci.ai_speech.models import ObjectListInlineInputLocation
from oci.ai_speech.models import OutputLocation
from oci.ai_speech.models import CreateTranscriptionJobDetails
import sqlalchemy
from sqlalchemy import create_engine
import cx_Oracle
import ads

#import custom py files
from sub_packages.video_only import input_youtube_video
from sub_packages.video_only import input_profile_image
from sub_packages.video_only import predict_video
from sub_packages.audio_only import input_youtube_video_audio
from sub_packages.audio_only import audio_to_object_storage
from sub_packages.audio_only import run_speech_model
from sub_packages.audio_only import parse_results
from sub_packages.audio_only import run_language_models
from sub_packages.audio_only import predict_audio
from sub_packages.credentials import create_uri
from sub_packages.roberta import preprocess
from sub_packages.roberta import roberta_base
from sub_packages.check_table import check_table

print("Main imports done")

# Job inputs arrive as environment variables (set from the APEX input); every
# lookup falls back to the default given as second argument.
_from_env = os.environ.get

TYPE_OF_ANALYSIS = _from_env("TYPE_OF_ANALYSIS", "both")
YOUTUBE_URL = _from_env("YOUTUBE_URL", "https://www.youtube.com/shorts/ugwUcdtygok")
MAIN_BUCKET_NAME = _from_env("MAIN_BUCKET_NAME", "West_BP")
NAMESPACE_NAME = _from_env("NAMESPACE_NAME", "frqap2zhtzbe")
# disabled: SUB_BUCKET_NAME = os.environ.get("SUB_BUCKET_NAME", "las_vegas")
SCHEMA_NAME = _from_env("SCHEMA_NAME", "test")

# OCID of the current job run; "UNDEFINED" when the variable is not set
JOB_RUN_OCID_KEY = "JOB_RUN_OCID"
job_run_ocid = _from_env(JOB_RUN_OCID_KEY, "UNDEFINED")

print(f"Type of analysis is {TYPE_OF_ANALYSIS}")
print(f"Youtube URL is {YOUTUBE_URL}")
print(f"Bucket name is {MAIN_BUCKET_NAME}")
print(f"Schema name {SCHEMA_NAME}")

# Folder URI of the form oci://<bucket>@<namespace>/<schema>/
full_bucket_name = f"oci://{MAIN_BUCKET_NAME}@{NAMESPACE_NAME}/{SCHEMA_NAME}/"

# Route on the type of analysis: 'video_only', 'audio_only' or 'both'.
# Any other value runs nothing and only fills in placeholder results.
run_video = TYPE_OF_ANALYSIS in ('video_only', 'both')
run_audio = TYPE_OF_ANALYSIS in ('audio_only', 'both')

# --- video part ---
if run_video:
    output_in_screen, seconds_in_screen, total_seconds_video_analyzed = predict_video(full_bucket_name, YOUTUBE_URL, SCHEMA_NAME)
else:
    # "no_video" when only audio was requested, "unclear" for an unknown type
    output_in_screen = "no_video" if run_audio else "unclear"
    seconds_in_screen = 0
    total_seconds_video_analyzed = 0

# --- audio part ---
if run_audio:
    transcription, key_phrases_string, sentiment_result_string, neg_aspects = predict_audio(YOUTUBE_URL, MAIN_BUCKET_NAME, NAMESPACE_NAME, compartment_id_input)
    new_text = preprocess(transcription)   # cleans text for input roberta model
    non_offensive, offensive, non_hate, hate = roberta_base(new_text)   # roberta model
else:
    transcription = 'no_transcription'
    key_phrases_string = 'no_result'
    sentiment_result_string = 'no_result'
    neg_aspects = 0
    # placeholder scores: 0 for a video-only run, 1 for an unknown type
    placeholder_score = 0 if run_video else 1
    non_offensive = placeholder_score
    offensive = placeholder_score
    non_hate = placeholder_score
    hate = placeholder_score

# Scale the four scores by 100 and round them to whole integers
non_offensive_int, offensive_int, non_hate_int, hate_int = (
    int(round(score * 100, 0)) for score in (non_offensive, offensive, non_hate, hate)
)

# All results of this run as a single-row pandas DataFrame
result_columns = [
    'job_run_ocid', 'type_of_analysis', 'output_in_screen', 'seconds_in_screen', 'total_seconds_video_analyzed',
    'transcription', 'key_phrases_string', 'sentiment_result_string', 'neg_aspects',
    'non_offensive_int', 'offensive_int', 'non_hate_int', 'hate_int',
]
data = [[
    job_run_ocid, TYPE_OF_ANALYSIS, output_in_screen, seconds_in_screen, total_seconds_video_analyzed,
    transcription, key_phrases_string, sentiment_result_string, neg_aspects,
    non_offensive_int, offensive_int, non_hate_int, hate_int,
]]
df_resultsx = pd.DataFrame(data, columns=result_columns)

print("End of audio and video. Results are printed below")
print(
    seconds_in_screen,
    total_seconds_video_analyzed,
    transcription,
    key_phrases_string,
    sentiment_result_string,
    sep="\n",
)
    
# Build the database engine through credentials.py.
# In a Job the user is SCHEMA_NAME; in a notebook pass user_name instead:
#   engine = create_uri(user_name, password, wallet_name, wallet_storage_directory)
engine, wallet_filename = create_uri(SCHEMA_NAME, password, wallet_name, wallet_storage_directory)
print(engine)

# check_table reports whether the table exists already; its result is used
# as the if_exists mode ('replace' or 'append')
table_status = check_table(SCHEMA_NAME, password, wallet_name, wallet_filename)

# Column types of the results table, grouped by kind
text_columns = (
    'job_run_ocid', 'type_of_analysis', 'output_in_screen',               # from video
    'transcription', 'key_phrases_string', 'sentiment_result_string',     # from audio
)
integer_columns = (
    'seconds_in_screen', 'total_seconds_video_analyzed',                  # from video
    'neg_aspects',                                                        # from audio
    'non_offensive_int', 'offensive_int', 'non_hate_int', 'hate_int',     # roberta
)
column_types = {column: sqlalchemy.types.NVARCHAR(length=500) for column in text_columns}
column_types.update({column: sqlalchemy.types.INTEGER() for column in integer_columns})

# push results to database
df_resultsx.to_sql('ocw_run_results', con=engine, index=False, if_exists=table_status, dtype=column_types)

print("Table replaced and updated with results")
print("----------- The end -----------------")

Overwriting /home/datascience/ocw_las_vegas/job_artifacts/main.py


## **Testing**

## **3.2 Test Script main.py**

In [None]:
!python main.py

---

# **4. roberta.py**

## **4.1 Script roberta.py**

In [7]:
%%writefile /home/datascience/ocw_las_vegas/job_artifacts/sub_packages/roberta.py
import tensorflow as tf
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request
import os

def preprocess(transcription):
    """Mask user mentions and links in *transcription*.

    The text is split on single spaces. Every token that starts with ``@``
    and is longer than one character becomes ``@user``; every token that
    starts with ``http`` becomes ``http``. All other tokens are kept as-is
    and the tokens are re-joined with single spaces.
    """
    def _mask(token):
        # A lone "@" is not a mention, hence the length check.
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(map(_mask, transcription.split(" ")))
    
    
# Longest token sequence fed to the RoBERTa-base classifiers; longer inputs
# are truncated instead of failing inside the model's forward pass.
_ROBERTA_MAX_TOKENS = 512


def _class_probabilities(model_name, text):
    """Return the softmax class probabilities of *text* under *model_name*.

    Args:
        model_name: Hugging Face hub id of a sequence-classification model.
        text: Already pre-processed input text.

    Returns:
        A 1-D numpy array with one probability per class.
    """
    import shutil

    # save_pretrained() below writes into a local directory named like the
    # hub id. Remove any leftover first (presumably so from_pretrained does
    # not resolve to a stale local copy - each Job run should start clean).
    shutil.rmtree("./" + model_name, ignore_errors=True)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    model.save_pretrained(model_name)

    # Long transcriptions would otherwise exceed the model's sequence limit.
    encoded_input = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=_ROBERTA_MAX_TOKENS,
    )
    output = model(**encoded_input)

    # output[0] holds the logits; [0] selects the single sequence in the batch.
    return softmax(output[0][0].detach().numpy())


def roberta_base(new_text):
    """Score *new_text* with the cardiffnlp offensive and hate classifiers.

    Args:
        new_text: Text to classify, normally the output of ``preprocess``.

    Returns:
        Tuple ``(non_offensive, offensive, non_hate, hate)`` of softmax
        probabilities. Index 0 of each model's output is taken as the
        negative class and index 1 as the positive class.
    """
    scores_offensive = _class_probabilities(
        "cardiffnlp/twitter-roberta-base-offensive", new_text)
    scores_hate = _class_probabilities(
        "cardiffnlp/twitter-roberta-base-hate", new_text)

    non_offensive = scores_offensive[0]
    offensive = scores_offensive[1]

    non_hate = scores_hate[0]
    hate = scores_hate[1]

    print("non_offensive score = " + str(non_offensive))
    print("offensive score = " + str(offensive))
    print("non_hate score = " + str(non_hate))
    print("hate score = " + str(hate))

    return non_offensive, offensive, non_hate, hate

Overwriting /home/datascience/ocw_las_vegas/job_artifacts/sub_packages/roberta.py


## **4.2 Test script roberta.py**

In [26]:
# Smoke test: run one sample sentence through the full text pipeline.
transcription = "Please leave NOWWW"

# Mask @mentions and links before classification.
new_text = preprocess(transcription)

# roberta_base prints the four scores and returns them in this order.
non_offensive, offensive, non_hate, hate = roberta_base(new_text)

non_offensive score = 0.76836103
offensive score = 0.23163892
non_hate score = 0.597862
hate score = 0.40213794


# **5. Credentials.py**

## **5.1 Script Credentials.py**

In [108]:
%%writefile /home/datascience/ocw_las_vegas/job_artifacts/sub_packages/credentials.py
import ads
import os
import configparser
import shutil
from zipfile import ZipFile
from tempfile import NamedTemporaryFile
import urllib
import re
import sqlalchemy
from sqlalchemy import create_engine
import cx_Oracle

def create_uri(user_name, password, wallet_name, wallet_storage_directory):
    """Prepare the database wallet and return a SQLAlchemy engine.

    Steps, in order: store the login settings in ``<dir>/.credentials``,
    extract ``Wallet_<wallet_name>.zip`` into ``<dir>/<wallet_name>``, point
    the wallet's ``sqlnet.ora`` at that directory, export ``TNS_ADMIN``,
    run a ``sqlplus`` connection test and build the engine.

    Args:
        user_name: Database user / schema to connect as.
        password: Password of that user.
        wallet_name: Database name; also names the wallet zip and directory.
        wallet_storage_directory: Directory holding the wallet zip; created
            if missing.

    Returns:
        Tuple ``(engine, wallet_filename)`` where ``wallet_filename`` is the
        expected path of the wallet zip.

    Raises:
        FileNotFoundError: If no ``sqlnet.ora`` is available, i.e. the wallet
            zip is missing and was never extracted before.
    """
    import shlex
    import urllib.parse

    database_name = wallet_name

    ads.set_documentation_mode(False)

    # Create the wallet directory if missing.
    os.makedirs(wallet_storage_directory, mode=0o700, exist_ok=True)
    wallet_path = os.path.join(wallet_storage_directory, database_name)

    # Write a configuration file for the login credentials.
    # interpolation=None keeps a '%' in the password from being rejected as
    # invalid interpolation syntax.
    adb_config = os.path.join(wallet_storage_directory, '.credentials')
    config = configparser.ConfigParser(interpolation=None)
    config.read(adb_config)
    config[database_name] = {'tns_admin': wallet_path,
                             'sid': '{}_medium'.format(database_name.lower()),
                             'user': user_name,
                             'password': password}
    with open(adb_config, 'w') as configfile:
        config.write(configfile)

    # Read the file back to confirm the settings were persisted.
    my_config = configparser.ConfigParser(interpolation=None)
    my_config.read(adb_config)
    print(my_config[database_name].get('user'))
    my_creds = my_config[database_name]
    print(my_creds.get('user'))

    # Extract the wallet.
    wallet_file = 'Wallet_{}.zip'.format(database_name)
    wallet_filename = os.path.join(wallet_storage_directory, wallet_file)
    if os.path.exists(wallet_filename):
        os.makedirs(wallet_path, mode=0o700, exist_ok=True)
        with ZipFile(wallet_filename, 'r') as zipObj:
            zipObj.extractall(wallet_path)
    else:
        print("The file {} does not exist.".format(wallet_filename))
        print("Please copy the Wallet file, {}, into the directory {} then rerun this cell."
              .format(wallet_file, wallet_storage_directory))

    # Without sqlnet.ora nothing below can work; fail with a clear message
    # instead of an obscure copy error. A previously extracted wallet is
    # still accepted even when the zip itself is gone.
    sqlnet_path = os.path.join(wallet_path, 'sqlnet.ora')
    if not os.path.exists(sqlnet_path):
        raise FileNotFoundError(
            "No sqlnet.ora found in {}; copy {} into {} and rerun.".format(
                wallet_path, wallet_file, wallet_storage_directory))

    # Keep a pristine copy (written once) and a backup of the current state.
    sqlnet_original_path = os.path.join(wallet_path, 'sqlnet.ora.original')
    sqlnet_backup_path = os.path.join(wallet_path, 'sqlnet.ora.backup')
    if not os.path.exists(sqlnet_original_path):
        shutil.copy(sqlnet_path, sqlnet_original_path)
    shutil.copy(sqlnet_path, sqlnet_backup_path)

    # Update the DIRECTORY="..." entry of sqlnet.ora to the wallet path.
    sqlnet_re = re.compile(
        r'(WALLET_LOCATION\s*=.*METHOD_DATA\s*=.*DIRECTORY\s*=\s*")(.*)(".*)',
        re.IGNORECASE)

    def _point_at_wallet(match):
        # A callable replacement inserts the path literally, so digits or
        # backslashes in it are never read as group references or escapes.
        return match.group(1) + wallet_path + match.group(3)

    with NamedTemporaryFile() as tmp:
        with open(sqlnet_path, 'rt') as sqlnet:
            for line in sqlnet:
                tmp.write(sqlnet_re.sub(_point_at_wallet, line).encode('utf-8'))
        tmp.flush()
        shutil.copy(tmp.name, sqlnet_path)

    # Add TNS_ADMIN to the environment.
    os.environ['TNS_ADMIN'] = config[database_name].get('tns_admin')

    # Test the database connection. The connect string is shell-quoted so
    # special characters in the password are not interpreted by the shell.
    creds = config[database_name]
    connect = 'sqlplus ' + shlex.quote(
        creds.get('user') + '/' + creds.get('password') + '@' + creds.get('sid'))
    print(os.popen(connect).read())

    # Build the connection URI; user and password are URL-escaped so
    # characters such as '@', ':' or '/' cannot corrupt the URL.
    uri = ('oracle+cx_oracle://'
           + urllib.parse.quote_plus(creds.get('user')) + ':'
           + urllib.parse.quote_plus(creds.get('password'))
           + '@' + creds.get('sid'))

    engine = create_engine(uri)

    return engine, wallet_filename

Overwriting /home/datascience/job_artifacts/sub_packages/credentials.py


---

# **6. Script check_table.py**

In [8]:
%%writefile /home/datascience/ocw_las_vegas/job_artifacts/sub_packages/check_table.py

def check_table(SCHEMA_NAME, password, wallet_name, wallet_filename):
    """Decide whether the results table must be appended to or (re)created.

    Runs ``SELECT COUNT(*)`` against ``ocw_run_results``. If the query
    succeeds the table exists and ``'append'`` is returned; if it fails for
    any reason ``'replace'`` is returned.

    Args:
        SCHEMA_NAME: Database user / schema to connect as.
        password: Password of that user.
        wallet_name: Database name; the ``<name>_high`` service is used.
        wallet_filename: Passed to the connection as ``wallet_location``.

    Returns:
        ``'append'`` or ``'replace'``, suitable for ``to_sql(if_exists=...)``.
    """
    # pandas must be imported here: this module has no top-level imports, and
    # a missing `pd` would otherwise be swallowed below and always yield
    # 'replace'.
    import pandas as pd
    import ads  # presumably registers the DataFrame `.ads` accessor - verify

    service_name = wallet_name.lower() + "_high"

    print("service name is in check_table " + service_name)
    print("wallet location is in check_table " + wallet_filename)

    creds = {"user_name": SCHEMA_NAME,
             "password": password,
             "service_name": service_name,
             "wallet_location": wallet_filename}

    # Never echo the real password into notebook output or job logs.
    print({**creds, "password": "********"})

    try:
        pd.DataFrame.ads.read_sql(
            "SELECT COUNT(*) AS CHECKX FROM ocw_run_results",
            connection_parameters=creds)
    except Exception as exc:
        # Best effort: any failure is treated as "table does not exist", but
        # the reason is reported so connection problems remain visible.
        print("Could not query ocw_run_results ({}): {}".format(type(exc).__name__, exc))
        table_status = 'replace'
        print("Table status is replace")
    else:
        print("Table already exist, so append table")
        table_status = 'append'

    return table_status

Overwriting /home/datascience/ocw_las_vegas/job_artifacts/sub_packages/check_table.py


In [None]:
# Manual check of check_table; the result is 'append' or 'replace'.
# NOTE(review): check_table's fourth parameter is named wallet_filename and is
# forwarded as "wallet_location"; main.py passes the wallet zip path returned by
# create_uri, while this cell passes the storage directory - confirm which is intended.
table_status = check_table(SCHEMA_NAME, password, wallet_name, wallet_storage_directory)
table_status

# **The End**