In [7]:
# Libraries

#Connect to mysql
import mysql.connector

#Operating system
import os
import sys
from pathlib import Path
#import subprocess
from datetime import timedelta

#Transcription & subtitles
import whisper
import stable_whisper

#Translation
import translators as ts
import translators.server as tss
import textwrap

#Pandas
import pandas as pd


Using state Galicia server backend.


In [9]:
def connection_ddbb():
    """Function to create the connection to the database

    Reads 'secrets.txt' from the current working directory. Every non-blank
    line must have the form key|value; only the first '|' separates the key
    from the value, so values may themselves contain '|'.
    The keys 'user', 'pass' and 'server' are required to open the connection.

    Keyword arguments: none
    Return: connection object, secrets object (dict of every key/value read)
    """

    secrets = {}
    # os.fdopen(os.open(...)) is kept on purpose: the original author had
    # errors using the builtin open() within a virtual environment.
    # The context manager guarantees the descriptor is released.
    with os.fdopen(os.open('secrets.txt', os.O_RDONLY)) as secrets_file:
        for line in secrets_file:
            line = line.rstrip('\r\n')
            if not line.strip():
                # Tolerate empty lines (e.g. a trailing newline at end of file)
                continue
            key, val = line.split('|', 1)
            secrets[key] = val

    # Connection to mysql
    conn = mysql.connector.connect(user=secrets['user'],
                                   password=secrets['pass'],
                                   host=secrets['server'])

    return conn, secrets

In [10]:
# Open the MySQL connection and load the key/value settings from secrets.txt;
# both names are reused by the cells below.
connection, secrets = connection_ddbb()

In [3]:
def create_ddbb(_conn,_secrets):
    """Function to create the data base

    Creates the 'ironrep' schema and its tables when they do not exist yet and
    (re)writes the single configuration row with the values found in _secrets.
    Every statement is committed right after it is executed.

    Keyword arguments:
    _conn: connection object
    _secrets: secrets object; must contain the keys 'temp_dir', 'video_play'
              and 'lang_subt' (default values of the configuration table)
    Return: None
    """

    if not _conn.is_connected():
        print('Error connecting')
        return

    # Resolve the defaults first so that a missing key fails before any DDL runs.
    # Example values:
    #   temp_dir   -> directory where temporary subtitle files are written
    #   video_play -> "vlc '{videoparam}' --sub-file '{subtitleparam}' --no-sub-autodetect-file --start-time '{positionparam}'"
    #   lang_subt  -> 'es,pt,it,zh,de,hi'
    default_configuration = (str(_secrets['temp_dir']),
                             str(_secrets['video_play']),
                             str(_secrets['lang_subt']))

    schema_statements = (
        ('CREATE DATABASE IF NOT EXISTS ironrep', None),
        ('USE ironrep', None),
    )

    table_statements = (
        # Configuration: the enum('1') primary key allows a single row only
        ("""CREATE TABLE IF NOT EXISTS ironrep.configuration (
                id enum('1') PRIMARY KEY NOT NULL,
                temp_directory  nvarchar(250),
                video_player nvarchar(250),
                languages_subtitles nvarchar(250) COMMENT 'List of langages codes separated by commas')""",
         None),
        # Default values: REPLACE overwrites the existing configuration row
        ("""REPLACE INTO ironrep.configuration (temp_directory,video_player,languages_subtitles)
                VALUES (%s,%s,%s)""",
         default_configuration),
        # Videos
        ("""CREATE TABLE IF NOT EXISTS ironrep.videos (
                id INT PRIMARY KEY NOT NULL AUTO_INCREMENT,
                video nvarchar(250),
                video_path nvarchar(250)
            )""",
         None),
        # Transcriptions
        ("""CREATE TABLE IF NOT EXISTS ironrep.transcriptions (
                id INT PRIMARY KEY NOT NULL AUTO_INCREMENT,
                videoid INT NOT NULL,
                languageid nvarchar(5),
                transcription mediumtext
            )""",
         None),
        # Summaries
        ("""CREATE TABLE IF NOT EXISTS ironrep.summaries (
                videoid INT NOT NULL,
                languageid nvarchar(5),
                summary nvarchar(260)
            )""",
         None),
        # Subtitles
        ("""CREATE TABLE IF NOT EXISTS ironrep.subtitles (
                id INT PRIMARY KEY NOT NULL AUTO_INCREMENT,
                videoid INT NOT NULL,
                languageid nvarchar(5),
                subtitles mediumtext
            )""",
         None),
        # Keywords
        ("""CREATE TABLE IF NOT EXISTS ironrep.keywords (
                id INT PRIMARY KEY NOT NULL AUTO_INCREMENT,
                videoid INT NOT NULL,
                languageid nvarchar(5),
                keywords nvarchar(250)
            )""",
         None),
    )

    cursor = _conn.cursor()

    def run_all(statements):
        # Execute and commit each statement, binding parameters only when given
        for statement, params in statements:
            if params is None:
                cursor.execute(statement)
            else:
                cursor.execute(statement, params)
            _conn.commit()

    try:
        print('Connection open')

        print('Creating database if necessary...')
        run_all(schema_statements)
        print('Database created if necessary...')

        print('Creating tables if necessary...')
        run_all(table_statements)
        print('Tables created if necessary...')
    finally:
        # Release the cursor even when one of the statements fails
        cursor.close()


    

In [5]:
# Create the ironrep schema and tables if they are missing and rewrite the
# configuration row with the values loaded from secrets.txt.
create_ddbb(connection, secrets)


Connection open
Creating database if necessary...
Database created if necessary...
Creating tables if necessary...
Tables created if necessary...


In [6]:
def insert_data_sql(_conn, _table, _videoid, _langid, _text):
    """Function to insert data in the database

    Keyword arguments:
    _conn: connection object
    _table: the object where insert ['video','transcription','subtitle','summary','keywords']
    _videoid: the id of the video (ignored when _table is 'video')
    _langid: the id of the language (ignored when _table is 'video')
    _text: value to insert; for 'video' it is the path of the video file
    Return: the cursor used for the insert (its lastrowid holds the new id),
            the string 'none' when _table is not recognised,
            None when the connection is not open
    """

    if not _conn.is_connected():
        return None

    # logical name -> (table, text column) for the objects keyed by video and language.
    # 'summary' maps to ironrep.summaries, the table created by create_ddbb.
    text_targets = {
        'transcription': ('transcriptions', 'transcription'),
        'subtitle': ('subtitles', 'subtitles'),
        'summary': ('summaries', 'summary'),
        'keywords': ('keywords', 'keywords'),
    }

    if _table == 'video':
        file_name = os.path.split(os.path.abspath(Path(_text)))[1]

        query = """INSERT INTO ironrep.videos(video,video_path)
                    VALUES (%s,%s)"""
        # The video name is the file name up to its first dot
        val = [file_name.split('.')[0], _text]
    elif _table in text_targets:
        table_name, text_column = text_targets[_table]
        # Identifiers come from the fixed mapping above, never from the caller;
        # the values themselves are always bound as parameters.
        query = f"""INSERT INTO ironrep.{table_name}(videoid,languageid,{text_column})
                    VALUES (%s,%s,%s)"""
        val = [int(_videoid), _langid, _text]
    else:
        return 'none'

    # The cursor is returned open on purpose: callers read cursor.lastrowid
    cursor = _conn.cursor()
    cursor.execute(query, val)
    _conn.commit()
    return cursor

In [8]:
def transcribe(_conn, filepath):
    """
    With this function we can transcribe all the texts from a video/audio and also the subtitles

    The video is registered in ironrep.videos, then the English transcription
    and the English subtitles are stored for the new video id.

    Keyword arguments:
    _conn: connection object
    filepath: the file to transcribe
    Return: No return (insert in mysql and create 2 text files next to the
            video: <name>_transcription.txt and <name>_subtitles.srt)
    """

    directory, file_name = os.path.split(os.path.abspath(Path(filepath)))
    # Same naming rule as insert_data_sql: file name up to its first dot
    base_name = file_name.split(sep='.')[0]

    result_sql = insert_data_sql(_conn, 'video', '', '', filepath)
    if result_sql is None:
        # insert_data_sql returns None when the connection is closed; fail with
        # a clear message before spending time on the transcription.
        raise ConnectionError('Database connection is not open; video was not registered')
    videoid = result_sql.lastrowid

    # speech transcription

    model = whisper.load_model("base.en", device='cpu')
    #model = stable_whisper.load_model('base')

    result = model.transcribe(filepath)

    transcription_path = directory + "/" + base_name + "_transcription.txt"
    with open(Path(transcription_path), "w+") as transcription_file:
        transcription_file.write(result["text"])

    insert_data_sql(_conn, 'transcription', videoid, 'en', result['text'])

    # subtitles
    subtitles_path = directory + "/" + base_name + "_subtitles.srt"
    stable_whisper.results_to_sentence_srt(result, subtitles_path)

    # Read the generated file back through a context manager so the handle is closed
    with open(subtitles_path) as subtitles_file:
        text_subtitles = subtitles_file.read()

    insert_data_sql(_conn, 'subtitle', videoid, 'en', text_subtitles)
    


In [18]:
def translate_subtitles(_conn, _videoid):
    """Function to store the subtitles of a video for every configured language

    Reads the language list and the temporary directory from
    ironrep.configuration, loads the English subtitles of the video and inserts
    one subtitle row per configured language.

    NOTE: the per-line translation step is still disabled (see the TODO in the
    loop), so the text stored for each language is currently empty.

    Keyword arguments:
    _conn: connection object
    _videoid: the id of the video
    Return: None
    """

    if not _conn.is_connected():
        return

    # Configuration: languages to generate and where to write temporary files
    cursor_conf = _conn.cursor(buffered=True)
    try:
        cursor_conf.execute("""SELECT languages_subtitles, temp_directory 
                        FROM ironrep.configuration
                        LIMIT 1;""")
        conf_row = cursor_conf.fetchone()
    finally:
        cursor_conf.close()

    if conf_row is None:
        raise ValueError('No configuration row found in ironrep.configuration')

    languages_csv, temp_directory = conf_row
    # Ignore empty entries and surrounding blanks ("es, pt," -> ['es', 'pt'])
    languages = [code.strip() for code in (languages_csv or '').split(',') if code.strip()]

    # English subtitles are the source of every translation
    cursor = _conn.cursor(buffered=True)
    try:
        cursor.execute("""SELECT videoid, languageid, subtitles
                    FROM ironrep.subtitles
                    WHERE videoid = %s and languageid = %s""",
                       [int(_videoid), str('en')])
        subt_table = cursor.fetchall()
    finally:
        cursor.close()

    for lang in languages:
        # Start from an empty buffer for each language so that lines never
        # leak from one language into the next
        translated = []

        for sub in subt_table:
            print(lang)
            # TODO: translation is disabled; translate_from_en is not defined
            # in the visible cells of this notebook.
            #for row in sub[2].split('\n'):
            #    translated.append(translate_from_en(row,lang))

        tmp_path = Path(temp_directory + "/" + lang + "_subtitle_tmp.srt")
        try:
            with open(tmp_path, mode='wt', encoding='utf-8') as tmp_file:
                tmp_file.write('\n'.join(translated))

            # Read back with the same encoding used for writing
            with open(tmp_path, encoding='utf-8') as tmp_file:
                raw_subtitles = tmp_file.read()
        finally:
            # The file is only a scratch copy; never leave it behind
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

        # Restore the .srt syntax that gets altered by translation
        # (' -> ' instead of ' --> ', blanks after the colons of the timestamps)
        text_subtitles = raw_subtitles.replace(' -> ', ' --> ').replace(': ', ':')

        insert_data_sql(_conn, 'subtitle', _videoid, lang, text_subtitles)

In [17]:
# Run the subtitle step for video id 1, one pass per language listed in
# ironrep.configuration (each language code is printed as it is processed).
translate_subtitles(connection, 1)

es
pt
it
zh
de
hi


In [43]:
def video_player(_conn,_videoid, _langid, _position = 0):
    """Function to launch the video with subtitles

    The player command is the template stored in ironrep.configuration. It is
    split into arguments first and the placeholders {videoparam},
    {subtitleparam} and {positionparam} are replaced inside each argument
    afterwards, so paths containing blanks, quotes or shell metacharacters are
    passed to the player untouched (no shell is involved).

    Keyword arguments:
    _conn: connection object
    _videoid: the id of the video
    _langid: the id of the language to use for the subtitles
    _position: time in seconds to start the video
    Return: None
    """
    # Local imports: these modules are not part of the notebook import cell
    import shlex
    import subprocess

    cursor = _conn.cursor()
    try:
        cursor.execute("""SELECT video_player, temp_directory
                        FROM ironrep.configuration 
                        LIMIT 1;""")
        conf_rows = cursor.fetchall()
        if not conf_rows:
            raise ValueError('No configuration row found in ironrep.configuration')
        player_template, temp_directory = conf_rows[0]

        cursor.execute("""SELECT subtitles
                    FROM ironrep.subtitles
                WHERE videoid = %s
                    AND languageid = %s""",
                       [int(_videoid), _langid])
        subtitle_rows = cursor.fetchall()
        if not subtitle_rows:
            print(f'No subtitles found in language {_langid}')
            return

        cursor.execute("""SELECT video_path
                    FROM ironrep.videos
                    WHERE id = %s""",
                       [int(_videoid)])
        video_rows = cursor.fetchall()
        if not video_rows:
            raise ValueError(f'No video found with id {_videoid}')
        video_path = video_rows[0][0]
    finally:
        cursor.close()

    # The player reads the subtitles from a file, so dump the first match.
    # UTF-8 keeps non-Latin subtitles (e.g. zh, hi) writable on any locale.
    subtitle_path = temp_directory + "/play_subtitle.srt"
    with open(Path(subtitle_path), "w+", encoding='utf-8') as subtitle_file:
        subtitle_file.write(subtitle_rows[0][0])

    substitutions = (
        ('{videoparam}', video_path),
        ('{subtitleparam}', subtitle_path),
        ('{positionparam}', str(_position)),
    )

    command = []
    for argument in shlex.split(player_template):
        for placeholder, value in substitutions:
            argument = argument.replace(placeholder, value)
        command.append(argument)

    if not command:
        raise ValueError('The configured video_player command is empty')

    try:
        # Blocks until the player exits; its exit status is ignored, as before
        subprocess.run(command)
    except OSError as error:
        # e.g. the player executable is not installed
        print(f'Could not launch the video player: {error}')

In [45]:
# Play video id 52 with its English ('en') subtitles, starting at second 0.
video_player(connection, 52, 'en', 0)

[00005649fe77d550] main libvlc: Ejecutar vlc con la interfaz predeterminada. Use «cvlc» para usar vlc sin interfaz.
[00007f4edc002bf0] gl gl: Initialized libplacebo v4.192.1 (API v192)
libva info: VA-API version 1.16.0
libva error: vaGetDriverNameByIndex() failed with unknown libva error, driver_name = (null)
[00007f4edc002bf0] glconv_vaapi_x11 gl error: vaInitialize: unknown libva error
libva info: VA-API version 1.16.0
libva info: Trying to open /usr/lib/x86_64-linux-gnu/dri/iHD_drv_video.so
libva info: Found init function __vaDriverInit_1_14
libva error: /usr/lib/x86_64-linux-gnu/dri/iHD_drv_video.so init failed
libva info: va_openDriver() returns 1
libva info: Trying to open /usr/lib/x86_64-linux-gnu/dri/i965_drv_video.so
libva info: Found init function __vaDriverInit_1_10
libva info: va_openDriver() returns 0
[00007f4edc002bf0] gl gl: Initialized libplacebo v4.192.1 (API v192)
Failed to open VDPAU backend libvdpau_nvidia.so: no se puede abrir el archivo del objeto compartido: No e

In [None]:

# Root folder of the course recordings; change it here when the data moves.
COURSE_DIR = '/home/roque/01. IronHack/00. Data Analytics/01. Course/'

# Recordings to transcribe, relative to COURSE_DIR, in processing order.
RECORDINGS = [
    '25. Week 9 - Day 1/recordings/virtual_environments_anaconda_tutorial on Vimeo.mp4',
    '26. Week 9 - Day 2/recordings/GMT20221103-174403_Recording_1920x1080.mp4',
    '27. Week 9 - Day 3/recordings/GMT20221105-095041_Recording_1920x1080.mp4',
    '27. Week 9 - Day 3/recordings/GMT20221105-130243_Recording_1920x1080.mp4',
    '28. Week 10 - Day 1/recordings/GMT20221108-174016_Recording_1920x1080.mp4',
    '29. Week 10 - Day 2/recordings/GMT20221110-174206_Recording_1920x1080.mp4',
    '30. Week 10 - Day 3/recordings/GMT20221112-095150_Recording_1920x1080.mp4',
    '31. Week 11 - Day 1/recordings/GMT20221115-174256_Recording_1920x1080.mp4',
    '32. Week 11 - Day 2/recordings/GMT20221117-174613_Recording_1920x1080.mp4',
    '33. Week 11 - Day 3/recordings/GMT20221119-095525_Recording_1920x1080.mp4',
    '33. Week 11 - Day 3/recordings/GMT20221119-130318_Recording_1920x1080.mp4',
    '34. Week 12 - Day 1/recordings/GMT20221122-174530_Recording_1920x1080.mp4',
    '35. Week 12 - Day 2/recordings/GMT20221124-174853_Recording_1920x1080.mp4',
    '36. Week 12 - Day 3/recordings/GMT20221126-100149_Recording_1920x1080.mp4',
    '39. Week 13 - Day 3/recordings/GMT20221203-100111_Recording_1920x1080.mp4',
    '40. Week 14 - Day 1/recordings/GMT20221206-174541_Recording_1920x1080.mp4',
    '41. Week 14 - Day 2/recordings/GMT20221208-174524_Recording_1920x1080.mp4',
    '42. Week 14 - Day 3/recordings/GMT20220514-084037_Recording_2560x1440.mp4',
    '43. Week 15 - Day 1/recordings/GMT20221213-173613_Recording_1920x1080.mp4',
    '44. Week 15 - Day 2/recordings/GMT20221215-174748_Recording_1920x1080.mp4',
    '45. Week 15 - Day 3/recordings/GMT20221217-100543_Recording_2560x1440.mp4',
    '45. Week 15 - Day 3/recordings/GMT20221222-173401_Recording_1920x1120.mp4',
]

# Plain concatenation keeps each path exactly as it was written before
for recording in RECORDINGS:
    transcribe(connection, COURSE_DIR + recording)

translate_subtitles(connection, 1)

In [7]:
# Summary
# TODO: Try with a larger text

# python -m spacy download en_core_web_sm #efficiency
# python -m spacy download en_core_web_trf #accuracy

import spacy
import en_core_web_trf
#import en_core_web_sm
from spacy.lang.en.stop_words import STOP_WORDS
import spacy_transformers
from string import punctuation
from heapq import nlargest

def summarize(text, per, nlp=None):
    """Function to build an extractive summary of a text

    Every sentence is scored with the sum of the normalised frequencies of its
    words (stop words and punctuation excluded, case ignored) and the best
    scored sentences are returned, highest score first.

    Keyword arguments:
    text: the text to summarize
    per: fraction of the sentences to keep (0.10 keeps 10% of them); the
         number of sentences is rounded down, so it can be 0 for short texts
    nlp: optional spaCy pipeline already loaded; when omitted the
         en_core_web_trf model is loaded on every call
    Return: the selected sentences joined by a blank ('' when nothing can be
            scored or selected)
    """
    if nlp is None:
        #nlp = spacy.load('en_core_web_trf')
        nlp = en_core_web_trf.load()
    doc = nlp(text)

    # Count words in lower case so that the lookup made while scoring the
    # sentences (also lower case) finds capitalised words too
    word_frequencies = {}
    for token in doc:
        word = token.text.lower()
        if word in STOP_WORDS or word in punctuation:
            continue
        word_frequencies[word] = word_frequencies.get(word, 0) + 1

    if not word_frequencies:
        # Empty text or nothing but stop words/punctuation: nothing to rank
        return ''

    # Normalise against the most frequent word
    max_frequency = max(word_frequencies.values())
    for word in word_frequencies:
        word_frequencies[word] = word_frequencies[word] / max_frequency

    sentence_tokens = list(doc.sents)
    sentence_scores = {}
    for sent in sentence_tokens:
        for token in sent:
            weight = word_frequencies.get(token.text.lower())
            if weight is not None:
                sentence_scores[sent] = sentence_scores.get(sent, 0) + weight

    select_length = int(len(sentence_tokens) * per)
    best_sentences = nlargest(select_length, sentence_scores, key=sentence_scores.get)

    # Join with a blank so that consecutive sentences are not glued together
    return ' '.join(sent.text for sent in best_sentences)

In [None]:
# Summarise the transcript keeping 10% of its sentences.
# NOTE(review): `result` is not assigned at notebook level in the visible cells
# (it only exists as a local inside transcribe) — confirm where it comes from
# before a Restart & Run All.
summarize(result['text'], 0.10)

In [10]:
# Keywords

# RAKE: extract the ranked key phrases of the transcript with rake_nltk
# NOTE(review): `result` is not assigned at notebook level in the visible cells
# (it only exists as a local inside transcribe) — confirm its origin.
from rake_nltk import Rake
rake = Rake()
rake.extract_keywords_from_text(result['text'])
extracted_keyword = rake.get_ranked_phrases()
# Last expression of the cell: displays the list of ranked phrases
extracted_keyword

['like one main one main one main suggestion',
 'whole thing like drives like applies force',
 '30 grammets per hour speed limits',
 'insanely strong tech tech right behind',
 'pretty high cpc cost per click',
 'new race fund group b',
 'analysis like drives towards towards',
 'passengers involved involving car accident obviously',
 'strong hypothesis like super well laid',
 'getting like disparate data together like',
 'regular two bars one one next',
 'pretty intense machine learning technique',
 'specific works like vhvac shit',
 'thousand dollar investment per trade',
 'barcelona like 50 speed limit pro',
 'available like entire home apartment',
 'data set time seven parameters',
 'create something called budget persona',
 '46 per 46 values',
 'transcutaneous vagus nerve stimulation',
 'organic email marketing also influencer',
 'entire home apartment type cost',
 'think maybe 10 shapes something like',
 'normally find hypothesis test hypothesis find',
 'great like storytelling sto