### Imports

In [21]:
import pandas as pd
import pyodbc
import os
from dotenv import load_dotenv

load_dotenv()
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Connection String

In [2]:

# All database settings are read from environment variables; a missing
# variable raises KeyError right here instead of failing later on connect.
driver, server, database, username, password = (
    os.environ[name]
    for name in ("DB_DRIVER", "DB_SERVER", "DB_DATABASE", "DB_USERNAME", "DB_PASSWORD")
)

# ODBC connection string: fixed port 1433, encrypted connection, 30 s timeout.
connection_str = (
    f"Driver={driver};Server={server},1433;Database={database};"
    f"Uid={username};Pwd={password};"
    "Encrypt=yes;TrustServerCertificate=no;Connection Timeout=30;"
)

# Load from csv

### Load df with transcripts from csv

In [10]:
# Load the clustered transcript export. The preview below shows leftover
# "Unnamed: 0*" columns in this file in addition to the data columns.
videos_transcript = pd.read_csv("../../data/1405_data_with_clusters.csv")

In [11]:
# Preview the first rows; video_id, english_transcript and german_transcript
# are the columns read by update_videos_in_db below.
videos_transcript.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,video_id,cluster,core_message,video_playcount,video_timestamp,english_transcript,german_transcript
0,0,0,7005858788343319813,23,Mach so bisschen Übergang deine Haare kürzer d...,613338,1631178624,,Mach so bisschen Übergang deine Haare kürzer d...
1,1,1,7040421798785125638,1;22;15,Kritik an ungeimpftem Personal in der Pflege;...,30603,1639225941,,nötigerweise Menschen sterben weil Ungeimpfte ...
2,2,2,7052592640344329477,13,7 Millionen sozialgeldempfänger davon gut die ...,27175,1642059688,,7 Millionen sozialgeldempfänger davon gut die ...
3,3,3,7069273153373097221,22;17;17,- Hohe Anzahl von Arztbesuchen aufgrund von Im...,235154,1645943422,,und zu den Nebenwirkungen das äh Nebenwirkunge...
4,4,4,7080134745832557830,16;1;0,Unterschied zwischen Gesundheitspolitik und p...,7439,1648472331,,sehr geehrte frau präsidentin kolleginnen und ...


In [None]:
def update_videos_in_db(update_df, connection_str):
    """
    Update existing entries in the Videos table based on video_id.

    Missing transcripts (NaN / None / NA in the DataFrame) are written as SQL
    NULL instead of being stringified to the literal text "nan" or "None".

    Args:
        update_df: DataFrame with video_id, english_transcript, and german_transcript.
        connection_str: Connection string to the SQL database.
    """

    def _text_or_null(value):
        # Empty CSV cells are loaded as NaN; str(NaN) would store "nan" in the DB.
        return None if pd.isna(value) else str(value)

    query = """
            UPDATE dbo.Videos
            SET transcript_en = ?, transcript_de = ?
            WHERE id = ?
            """

    # Iterate column-wise instead of with iterrows(): iterrows() builds one
    # Series per row and does not preserve per-column dtypes, so large integer
    # ids can be upcast to float and lose precision.
    records = zip(
        update_df["english_transcript"],
        update_df["german_transcript"],
        update_df["video_id"],
    )

    with pyodbc.connect(connection_str) as cnxn:
        cursor = cnxn.cursor()

        for english, german, video_id in records:
            data = (_text_or_null(english), _text_or_null(german), int(video_id))
            cursor.execute(query, data)

        # Single commit after all rows have been sent.
        cnxn.commit()
        print("Videos updated successfully")

Videos updated successfully


In [None]:
# Example usage
# Sends one UPDATE per row of videos_transcript to dbo.Videos and commits at the end.
update_videos_in_db(videos_transcript, connection_str)

# Get videos without a transcript from the DB and add transcripts from API requests (without speech-to-text)

In [22]:
from reclaim_tiktok.transcriber.db_connector import DBConnector
from reclaim_tiktok.transcriber.tiktok_video_details import TiktokVideoDetails


In [23]:
# Shared connector instance; batch_add_transcripts_to_db further down reads
# this module-level name directly instead of taking it as a parameter.
db_connector = DBConnector()

## Get videos without transcript in db
Returns a list of `pyodbc.Row` objects.

In [91]:
# Rows are indexed positionally further down: row[0] is passed as video_id
# and row[12] as the video url.
rows = db_connector.get_urls_without_transcription()
#rows = db_connector.get_urls_with_transcription()

In [92]:
# Number of rows returned by the query above.
print(len(rows))

9603


In [93]:
import os
import time

import numpy as np
import pandas as pd

from reclaim_tiktok.transcriber.tiktok_video_details import (
    HTTPRequestError,
    RequestReturnedNoneError,
    TiktokVideoDetails,
    VideoIsPrivateError,
)


class StatCollector:
    """Accumulates outcome statistics of a tiktok video data collection run
    and prints them as a summary on request.
    """

    def __init__(self) -> None:
        # The clock starts at construction; print_stats() measures against it.
        self.start_time = time.time()
        self.successes = 0
        self.private_videos = []
        self.failed_requests = []

    def add_success(self) -> None:
        """Increments the success counter by one."""
        self.successes = self.successes + 1

    def add_private_video(self, url: str) -> None:
        """Remembers ``url`` as a private video so that it is listed
        by ``print_stats()``.

        Params
        ---
        :param url: The url that links to a private video
        """
        self.private_videos.append(url)

    def add_failed_request(self, url: str) -> None:
        """Remembers ``url`` as a failed request so that it is listed
        by ``print_stats()``.

        Params
        ---
        :param url: The url whose request failed
        """
        self.failed_requests.append(url)

    def print_stats(self) -> None:
        """Prints the summary of everything collected so far.

        Output, in order:
        - the urls of the private videos
        - the urls of the failed requests
        - the number of successes, private videos and failed requests
        - the time elapsed since construction as hours, minutes, seconds
        """
        # Take the timestamp first so that printing does not count as run time.
        finished_at = time.time()
        url_separator = "\n\t"

        print("\n")
        print("Private: \n\t", url_separator.join(self.private_videos))
        print("Failed: \n\t", url_separator.join(self.failed_requests))

        totals = (
            ("Successes: ", self.successes),
            ("Private: ", len(self.private_videos)),
            ("Failed: ", len(self.failed_requests)),
        )
        for label, count in totals:
            print(label, count)

        elapsed = finished_at - self.start_time
        whole_hours = elapsed // 3600
        whole_minutes = (elapsed % 3600) // 60
        leftover_seconds = elapsed - 3600 * whole_hours - 60 * whole_minutes
        print(
            "Total elapsed time: %dh %dm %.2fs"
            % (whole_hours, whole_minutes, leftover_seconds)
        )


def print_progress_bar(percentage: float, bar_length: int = 20) -> None:
    """Prints a simple progress bar based on an updated percentage

    The bar always has exactly ``bar_length`` characters between the
    brackets: the filled part is ``'='`` and the rest is padded with spaces.
    The line starts with a carriage return and has no trailing newline, so
    repeated calls overwrite each other.

    Params
    ---
    :param percentage: The percentage to be displayed. The number is printed
        as given; only the drawn bar is limited to the 0-100 range.
    :param bar_length: The desired length of the bar, defaulted to 20 ``'='``
    """
    # Clamp only what is drawn so the bar can neither overflow nor go negative.
    drawn_percentage = min(max(percentage, 0), 100)
    # Multiply before dividing: avoids the integer "normalizer" that became 0
    # for bar_length > 100 and overshot when bar_length did not divide 100.
    filled = int(drawn_percentage * bar_length / 100)
    progress = "\r[%s%s] %.2f%%" % (
        "=" * filled,
        " " * (bar_length - filled),
        percentage,
    )
    print(progress, end="", flush=True)



## Retrieve transcripts available via API

In [41]:
# Preview: loop through the first 10 rows and fetch the transcription that
# TikTok itself provides for each video (disable_azure=True is passed to get_transcriptions).
for row in rows[:10]:
    video_details = TiktokVideoDetails(url=row[12])
    transcripts = video_details.get_transcriptions(disable_azure=True)
    if transcripts:
        # The `if` needs a real statement as its body; a comment alone is a syntax error.
        print(transcripts)
        # Uncomment to also store the previewed transcripts in the database:
        #db_connector.update_transcript(video_id=video_details.video_id, transcript_en=transcripts.get("eng-US"), transcript_de=transcripts.get("deu-DE"))

{'deu-DE': 'Mach so bisschen Übergang deine Haare kürzer dann siehst du automatisch bisschen frischer Aus in der umweltschutzpartei die sind wie man weiß nicht der heiße Scheiß vorbei in der umweltschutzbank in der umweltschutzpartei Merkel ihr seid überall auf dem Sofa wer hat versteckt Robert Habeck wird nicht Spitzenkandidat der grünen autolobby ist n Grätsche kriecht gern in den Arsch vom zettsche er hat tierisch Bock aufs Maul doch alle drehen am Rad denn der grenzblock AB dann sagt er nein ja die umweltschutzpartei nein nein scheiße scheiße scheiße scheiße '}
{'deu-DE': 'nötigerweise Menschen sterben weil Ungeimpfte dort gearbeitet haben also ganz ehrlich so langsam schwillt Mir richtig der Kamm nicht nur dass man so 1 Hampelmann zum Gesundheitsminister gemacht hat der von Anbeginn der Pandemie nur gehetzt hat Scheiße erzählt hat selber nicht mal wer weiß was er überhaupt für Müll verzapft und und und so warte mal ähm Punkt 1 seit Jahren und das nicht erst seit Beginn der Pandemi

In [100]:

def batch_add_transcripts_to_db(rows: list) -> None:
    """
    Fetch TikTok-provided transcripts for database rows that have none and store them.

    For every row the video details are requested from its url (``row[12]``)
    and the outcome is written back under the row's id (``row[0]``) through
    the module-level ``db_connector``:

    - the details cannot be loaded (private video, failed request or any
      other error): both transcripts are stored as ``None`` and the error
      text is stored as ``no_transcript_reason``;
    - transcripts are returned: the ``"eng-US"`` and ``"deu-DE"`` entries are
      stored (``None`` for a missing language);
    - no transcripts are returned: nothing is written for that row.

    A progress bar is printed while running. The collected statistics are
    always printed at the end, also after Ctrl+C or an unexpected error.

    Args:
        rows: A list of rows from the database with entries without transcript
            (``pyodbc`` rows, indexed positionally as described above).
    """
    total_rows = len(rows)
    stats = StatCollector()

    try:
        for index, row in enumerate(rows):
            print_progress_bar((index / total_rows) * 100)
            video_id = row[0]
            url = row[12]

            try:
                tt_obj = TiktokVideoDetails(url=url)
            except Exception as error:
                # All load failures are persisted the same way; they differ
                # only in how they are counted and announced.
                if isinstance(error, VideoIsPrivateError):
                    stats.add_private_video(url)
                    print("\n", error)
                elif isinstance(error, (RequestReturnedNoneError, HTTPRequestError)):
                    stats.add_failed_request(url)
                    print("\n", error)
                else:
                    stats.add_failed_request(url)
                    print("\nUnexpected Exception occured:", error)
                db_connector.update_transcript(
                    video_id=video_id,
                    transcript_en=None,
                    transcript_de=None,
                    no_transcript_reason=str(error),
                )
                continue

            try:
                transcriptions = tt_obj.get_transcriptions(disable_azure=True)
                if transcriptions:
                    # Use the transcripts fetched for THIS video; reading a
                    # same-sounding notebook global here would store another
                    # video's text (or raise NameError on a fresh kernel).
                    db_connector.update_transcript(
                        video_id=video_id,
                        transcript_en=transcriptions.get("eng-US"),
                        transcript_de=transcriptions.get("deu-DE"),
                        no_transcript_reason=None,
                    )
                    stats.add_success()
            except Exception as error:
                # Keep going with the next row, but make the failure visible
                # in the final summary instead of only in the scroll-back.
                stats.add_failed_request(url)
                print("\n", error)

    except KeyboardInterrupt:
        print("\nKeyboard Interrupt detected. Stopping...")
    except Exception as error:
        print("\nUnexpected Exception occurred:", error)
    finally:
        stats.print_stats()


In [102]:
# Example usage
# The slice skips the first 1000 rows and processes the remainder of `rows`.
batch_add_transcripts_to_db(rows[1000:])

[                   ] 0.70%%