### Imports

In [21]:
import pandas as pd
import pyodbc
import os
from dotenv import load_dotenv

load_dotenv()
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Connection String

In [2]:

# Read the database settings from the environment; a missing variable raises KeyError.
driver, server, database, username, password = (
    os.environ[name]
    for name in ("DB_DRIVER", "DB_SERVER", "DB_DATABASE", "DB_USERNAME", "DB_PASSWORD")
)

# ODBC connection string: fixed port 1433, Encrypt=yes, TrustServerCertificate=no, Connection Timeout=30.
connection_str = (
    f"Driver={driver};"
    f"Server={server},1433;"
    f"Database={database};"
    f"Uid={username};"
    f"Pwd={password};"
    "Encrypt=yes;"
    "TrustServerCertificate=no;"
    "Connection Timeout=30;"
)

# Load from csv

### Load df with transcripts from csv

In [10]:
# Load the exported video data (transcripts and cluster labels); the path is relative to the notebook's working directory.
# The preview below shows leftover index columns ("Unnamed: 0", "Unnamed: 0.1", "Unnamed: 0.2") in this file.
videos_transcript = pd.read_csv("../../data/1405_data_with_clusters.csv")

In [11]:
# Preview the first rows to check which columns were loaded.
videos_transcript.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,video_id,cluster,core_message,video_playcount,video_timestamp,english_transcript,german_transcript
0,0,0,7005858788343319813,23,Mach so bisschen Übergang deine Haare kürzer d...,613338,1631178624,,Mach so bisschen Übergang deine Haare kürzer d...
1,1,1,7040421798785125638,1;22;15,Kritik an ungeimpftem Personal in der Pflege;...,30603,1639225941,,nötigerweise Menschen sterben weil Ungeimpfte ...
2,2,2,7052592640344329477,13,7 Millionen sozialgeldempfänger davon gut die ...,27175,1642059688,,7 Millionen sozialgeldempfänger davon gut die ...
3,3,3,7069273153373097221,22;17;17,- Hohe Anzahl von Arztbesuchen aufgrund von Im...,235154,1645943422,,und zu den Nebenwirkungen das äh Nebenwirkunge...
4,4,4,7080134745832557830,16;1;0,Unterschied zwischen Gesundheitspolitik und p...,7439,1648472331,,sehr geehrte frau präsidentin kolleginnen und ...


In [None]:
def update_videos_in_db(update_df, connection_str):
    """
    Update existing entries in the Videos table based on video_id.

    For every row of ``update_df`` one ``UPDATE dbo.Videos`` statement is executed
    that sets ``transcript_en`` and ``transcript_de`` for the row whose ``id``
    equals ``video_id``. All updates are committed together at the end; if any
    statement fails, nothing is committed and the exception propagates.

    Missing transcripts (NaN / None / pd.NA) are written as SQL NULL rather than
    as the literal text "nan".

    Args:
        update_df: DataFrame with video_id, english_transcript, and german_transcript.
        connection_str: Connection string to the SQL database.

    Raises:
        KeyError: If one of the three required columns is missing.
        ValueError: If a video_id cannot be converted to int (e.g. it is NaN).
        pyodbc.Error: If connecting or executing a statement fails.
    """

    def _text_or_null(value):
        # Empty CSV cells are loaded as NaN; str(NaN) would store the text "nan".
        return None if pd.isna(value) else str(value)

    # The statement is identical for every row, so build it once.
    query = """
            UPDATE dbo.Videos
            SET transcript_en = ?, transcript_de = ?
            WHERE id = ?
            """

    # Iterate over the columns directly instead of iterrows(): iterrows() builds one
    # Series per row and upcasts it to a common dtype, which can turn large integer
    # ids into floats and silently lose digits.
    records = zip(
        update_df['video_id'],
        update_df['english_transcript'],
        update_df['german_transcript'],
    )

    # A pyodbc connection used as a context manager only commits on exit; it does
    # not close the connection, so close it explicitly.
    cnxn = pyodbc.connect(connection_str)
    try:
        cursor = cnxn.cursor()

        for video_id, transcript_en, transcript_de in records:
            data = (
                _text_or_null(transcript_en),
                _text_or_null(transcript_de),
                int(video_id)
            )
            cursor.execute(query, data)

        cnxn.commit()
    finally:
        # Closing without a prior commit discards the pending updates.
        cnxn.close()

    print("Videos updated successfully")

Videos updated successfully


In [None]:
# Example usage
# Note: this executes one UPDATE on dbo.Videos per row of videos_transcript and commits the changes.
update_videos_in_db(videos_transcript, connection_str)

# Get videos without a transcript from the DB and add transcripts from API requests (without speech-to-text)

In [None]:
from reclaim_tiktok.transcriber.db_connector import DBConnector
from reclaim_tiktok.transcriber.tiktok_video_details import TiktokVideoDetails
from reclaim_tiktok.transcriber.main_transcriber import StatCollector
from reclaim_tiktok.transcriber.main_transcriber import print_progress_bar


In [None]:
# Notebook-wide connector instance; batch_add_transcripts_to_db below reads this global by name.
db_connector = DBConnector()

## Get videos without transcript in db
Returns a list of `pyodbc.Row` objects.

In [None]:
# Fetch the videos that have no transcript yet.
# The commented-out call is, judging by its name, the counterpart for videos that already have one.
rows = db_connector.get_urls_without_transcription()
#rows = db_connector.get_urls_with_transcription()

In [None]:
# Number of rows returned by the query above.
print(len(rows))

9603


## Retrieve transcripts available via API and add if successful to db

In [None]:
# Hand all fetched rows to the connector's bulk update.
# NOTE(review): presumably this retrieves and stores the transcripts itself — verify in DBConnector.
db_connector.update_transcript_multiple(rows=rows)

In [None]:

def batch_add_transcripts_to_db(rows: list) -> None:
    """
    Fetch transcripts for database rows that have none and store them.

    For every row a ``TiktokVideoDetails`` object is created from the row's URL.
    If that fails, the error text is stored as ``no_transcript_reason`` for the
    video and the row is skipped. Otherwise the transcriptions are requested
    (with ``disable_azure=True``) and, when any are returned, the "eng-US" and
    "deu-DE" entries are written to the database. Rows for which no
    transcription is returned, or for which the request raises, are left
    unchanged in the database.

    Progress is printed before each row, and the collected statistics are
    printed at the end, also when the run is interrupted or aborted by an
    unexpected error.

    Relies on the module-level ``db_connector`` for all database writes.

    Args:
        rows (list[pyodbc.row]): A list of rows from the database with entries
            without transcript. ``row[0]`` is used as the video id and
            ``row[12]`` as the video URL.
    """
    # NOTE(review): VideoIsPrivateError, RequestReturnedNoneError and HTTPRequestError
    # are not imported in any notebook cell visible here. They must be in scope before
    # this function runs; otherwise the first failing request raises NameError while
    # the except clauses are evaluated, which aborts the whole batch.

    def _store_failure_reason(video_id, error) -> None:
        # Clear both transcripts and record the error text as the reason.
        db_connector.update_transcript(
            video_id=video_id,
            transcript_en=None,
            transcript_de=None,
            no_transcript_reason=str(error),
        )

    total_rows = len(rows)
    stats = StatCollector()

    try:
        for index, row in enumerate(rows):
            # Progress is reported before the row is processed, so it starts at 0 %.
            print_progress_bar((index / total_rows) * 100)
            url = row[12]

            try:
                tt_obj = TiktokVideoDetails(url=url)
            except VideoIsPrivateError as error:
                stats.add_private_video(url)
                print("\n", error)
                _store_failure_reason(row[0], error)
                continue
            except (RequestReturnedNoneError, HTTPRequestError) as error:
                stats.add_failed_request(url)
                print("\n", error)
                _store_failure_reason(row[0], error)
                continue
            except Exception as error:
                stats.add_failed_request(url)
                print("\nUnexpected Exception occured:", error)
                _store_failure_reason(row[0], error)
                continue

            try:
                transcriptions = tt_obj.get_transcriptions(disable_azure=True)
                if transcriptions:
                    db_connector.update_transcript(
                        video_id=row[0],
                        transcript_en=transcriptions.get("eng-US"),
                        transcript_de=transcriptions.get("deu-DE"),
                        no_transcript_reason=None,
                    )
                    stats.add_success()
                # No transcription returned: nothing is written and nothing is counted.
            except Exception as error:
                # Best effort: report the problem and carry on with the next row.
                print("\n", error)

    except KeyboardInterrupt:
        print("\nKeyboard Interrupt detected. Stopping...")
    except Exception as error:
        print("\nUnexpected Exception occurred:", error)
    finally:
        stats.print_stats()


In [None]:
# Example usage
# NOTE(review): the slice skips the first 1000 rows — presumably they were handled in an earlier run; confirm before re-running.
batch_add_transcripts_to_db(rows[1000:])

[                   ] 0.77%%
 
Video details could not be parsed. Video is private or has been removed.
[                   ] 0.81%
 
Video details could not be parsed. Video is private or has been removed.
[                   ] 1.94%
 
Video details could not be parsed. Video is private or has been removed.
[                   ] 4.13%