<h2>Loading OpenAI Transcripts and Storing in Data Frame</h2>

In [7]:
# get transcripts from openAI_transcript_with_id folder, 
# store it in dataFrame, and save it to CSV
import os
import pandas as pd

def get_list_of_files(folder_path='.'):
    """Get a list of files in a folder."""
    return os.listdir(folder_path)

def get_file_extension(file_name):
    """Get the file extension of a file."""
    return os.path.splitext(file_name)[1]

def read_file_content(file_path):
    """Read the content of a file."""
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

excel_file = '../../../Videos_and_DISCERN_data/videos_info_and_scores.xlsx'
videos_info_file = '../videos_info.csv'

topics_df = pd.read_excel(excel_file, usecols=['Video ID', 'Topic'], encoding='utf-8')

df_openAI_transcipts = pd.read_csv(videos_info_file, encoding='utf-8')
df_openAI_transcipts.index += 1

# Merge df_openAI_transcipts with topics_df based on 'Video ID' to add 'Topic' information
df_openAI_transcipts = df_openAI_transcipts.merge(topics_df[['Video ID', 'Topic']], on='Video ID', how='left')

# Initialize 'Transcript' and 'Description' columns with None
df_openAI_transcipts['Transcript'] = None
df_openAI_transcipts['Description'] = None

# Reorder columns such that 'Topic' appears before 'Transcript'
new_columns = ['Topic', 'Transcript', 'Description']
existing_columns = [col for col in df_openAI_transcipts.columns if col not in new_columns] 
df_openAI_transcipts = df_openAI_transcipts[existing_columns + new_columns]

folder_path = './corrected'
files_in_folder = get_list_of_files(folder_path)

for video_id in df_openAI_transcipts['Video ID'].values:
    video_files = [file for file in files_in_folder if video_id in file]
    
    if video_files is None:
        print("No files were found for Video ID:", video_id)
    
    else:
        video_idx = df_openAI_transcipts.loc[df_openAI_transcipts['Video ID'] == video_id].index[0]
        for video_file in video_files:
            file_content = read_file_content(os.path.join(folder_path, video_file))
            file_extension = get_file_extension(video_file)
            if file_extension == '.txt':
                df_openAI_transcipts.loc[video_idx, 'Transcript'] = file_content
            elif file_extension == '.description':
                df_openAI_transcipts.loc[video_idx, 'Description'] = file_content

df_openAI_transcipts.head(3)


Unnamed: 0,Number,URL,Video Title,Video ID,Date Published,Duration,Duration (sec),Topic,Transcript,Description
0,1,https://www.youtube.com/watch?v=HnkVcClkw2w,Nocturnal Enuresis (Bedwetting),HnkVcClkw2w,08-12-2017,19:46,1186,Nocturnal Enuresis,"Hello, my name is Usama Naga. I will discuss i...",Clinical approach to a child with Enuresis\n\n...
1,2,https://www.youtube.com/watch?v=OWtlKJbg_Pc,Nocturnal Enuresis,OWtlKJbg_Pc,13-05-2015,8:16,496,Nocturnal Enuresis,Nocturnal Enuresis. And this is essentially de...,Nocturnal Enuresis\nInstructional Tutorial Vid...
2,3,https://www.youtube.com/watch?v=95T3ABnnhOs,"Nocturnal Enuresis: BedWetting, What to Know",95T3ABnnhOs,08-03-2010,4:58,298,Nocturnal Enuresis,"Even after toilet training, it's not uncommon ...",“Nocturnal Enuresis” may sound like a serious ...


In [None]:
df_openAI_transcipts.to_csv("videos_info_with_OpenAI_transcripts.csv", index=False)

#### Check for duplicated Video IDs

In [8]:
# Indices of videos with duplicated Vidoe ID

mask = df_openAI_transcipts.duplicated(subset=['Video ID'], keep=False)

# Filter DataFrame to include only rows with duplicate 'Video ID' values
indices_equal_video_id = df_openAI_transcipts[mask].index.tolist()

print("Indices of rows with duplicated 'Video ID' values in df:", indices_equal_video_id)
print("Count: ", len(indices_equal_video_id))


Indices of rows with duplicated 'Video ID' values in df: []
Count:  0


<h2>Getting IDs of Unpleasant Videos</h2>

#### Non-Audible videos

In [11]:
# Getting IDs of videos without speech (music only)
no_speech_ids = """U7Ulc4_OI5Q
l0LxEeEJDZU
WxD_o4kYYII
0XCsN7GvZEo
MQhpmdfcJoY
aaRpk5MxRt4
cdToNKruJV4
8u-eqWxvCJA
ACtM5trHH5Q
mhSszRIz2Nk
ikSMIl1_TQM
jGpkru3UU1k
3AN_8XzN-KU
cZJopUlzBeA
Hh5gcqZHhgg
4HdZwm0zmn0
2gYVWv9Zhxk
LAf_QFmTPkw
fNouKvGCqIM
raCnH6BPdKM
Zw88nYSAT_M
wGmh5LZWmq0
8CgYrgVc0C4
eJFLwYZO7iE
_X_XCBcY17A
vqXQ_coJy4I
P7372tMFOJg
v9FMaozTgTM
RE76s2oXFE0
ohJZw4IWk_0
LeKy4SipyMI
FgBhFVyTyXE
aCtE-Va_6NI
9pBCNir7jfw
Qp5pAixKKhs
NEEjA13t5SM
6iNYB4bhlVw
Fu2ugbb3wnQ
4qy_aEy_Qdw
vWneXOZai68
pCa8jOQJn-Y
vokDVp5uyu4
Q6_0_CWcupM
W0pLY4fSwb4
QvG8oK4RJD0
wRXnzhbhpHU
glBmSQRxaIg
KO6535-Qax8
wjWnV4broEk
glBmSQRxaIg
UzpcPeoPnW0
9Pd46JhQRCk
9XMyL8YZ9jU
iE0_0Ponqok
ElHuyQefPeI
GtwtGeHH6bE
c2tyic4JHBU
au_jIGyPSnE
8HC0p9p1tLo
bodiJCJq5wk
fOUX-iFzfo4
s8WTgH7Svjk
EmE7zlF5dWQ
ftcaXVSVpVY
ggjTeJiqdn0
-q4jZ-_j6Ag
a1DwUL091eI
xIdih25tfu0
B5eUjXaBsr8
xAg2C9EY-Qs
KVqZXoOnaeU
rybJDxGT0a0
XCMs2vyxbvo
kowVW2QOWr0
5GrGVfcDnXk
i9vTDi3Z8HU
Dp1n_n9OLWI
X35iTySMV7I
zojoRNBXNsk
X35iTySMV7I
"""
no_speech_ids= no_speech_ids.strip().split('\n')

print("no_speech_ids: ", no_speech_ids)
print("Count: ", len(no_speech_ids))

no_speech_ids:  ['U7Ulc4_OI5Q', 'l0LxEeEJDZU', 'WxD_o4kYYII', '0XCsN7GvZEo', 'MQhpmdfcJoY', 'aaRpk5MxRt4', 'cdToNKruJV4', '8u-eqWxvCJA', 'ACtM5trHH5Q', 'mhSszRIz2Nk', 'ikSMIl1_TQM', 'jGpkru3UU1k', '3AN_8XzN-KU', 'cZJopUlzBeA', 'Hh5gcqZHhgg', '4HdZwm0zmn0', '2gYVWv9Zhxk', 'LAf_QFmTPkw', 'fNouKvGCqIM', 'raCnH6BPdKM', 'Zw88nYSAT_M', 'wGmh5LZWmq0', '8CgYrgVc0C4', 'eJFLwYZO7iE', '_X_XCBcY17A', 'vqXQ_coJy4I', 'P7372tMFOJg', 'v9FMaozTgTM', 'RE76s2oXFE0', 'ohJZw4IWk_0', 'LeKy4SipyMI', 'FgBhFVyTyXE', 'aCtE-Va_6NI', '9pBCNir7jfw', 'Qp5pAixKKhs', 'NEEjA13t5SM', '6iNYB4bhlVw', 'Fu2ugbb3wnQ', '4qy_aEy_Qdw', 'vWneXOZai68', 'pCa8jOQJn-Y', 'vokDVp5uyu4', 'Q6_0_CWcupM', 'W0pLY4fSwb4', 'QvG8oK4RJD0', 'wRXnzhbhpHU', 'glBmSQRxaIg', 'KO6535-Qax8', 'wjWnV4broEk', 'glBmSQRxaIg', 'UzpcPeoPnW0', '9Pd46JhQRCk', '9XMyL8YZ9jU', 'iE0_0Ponqok', 'ElHuyQefPeI', 'GtwtGeHH6bE', 'c2tyic4JHBU', 'au_jIGyPSnE', '8HC0p9p1tLo', 'bodiJCJq5wk', 'fOUX-iFzfo4', 's8WTgH7Svjk', 'EmE7zlF5dWQ', 'ftcaXVSVpVY', 'ggjTeJiqdn0', '-q4jZ-_

In [12]:
# Detect if No transcript (either no_speech or nonenglish)

None_videos_ids = []
for idx, row in df_openAI_transcipts.iterrows():
    if row['Transcript'] is None:
        None_videos_ids.append(row['Video ID'])

print("None_videos_ids: ", None_videos_ids)
print("Count: ", len(None_videos_ids))

None_videos_ids:  ['U7Ulc4_OI5Q', 'l0LxEeEJDZU', '-Ct0iWSEkdc', 'WxD_o4kYYII', '0XCsN7GvZEo', '73cM2Rh0RLk', 'MQhpmdfcJoY', 'aaRpk5MxRt4', 'N5KpDWVT4e0', 'cdToNKruJV4', '8u-eqWxvCJA', 'iZOUAP7BiAg', 'KVqZXoOnaeU', 'OpsJghXtNHo', 'mhSszRIz2Nk', 'dy9ZqVuP9V8', 'ikSMIl1_TQM', 'Jv1lG417LKg', '6VkBDpFpvI8', 'jGpkru3UU1k', '3AN_8XzN-KU', 'cZJopUlzBeA', 'aZmbUyKEfr4', 'd2vW_CHWqo4', 'Hh5gcqZHhgg', '4HdZwm0zmn0', '2gYVWv9Zhxk', 'LAf_QFmTPkw', 'fNouKvGCqIM', 'raCnH6BPdKM', 'Zw88nYSAT_M', 'wGmh5LZWmq0', '8CgYrgVc0C4', 'eJFLwYZO7iE', '_X_XCBcY17A', 'vqXQ_coJy4I', 'P7372tMFOJg', 'v9FMaozTgTM', 'RE76s2oXFE0', 'ohJZw4IWk_0', 'LeKy4SipyMI', 'FgBhFVyTyXE', 'aCtE-Va_6NI', '9pBCNir7jfw', 'Qp5pAixKKhs', 'NEEjA13t5SM', '6iNYB4bhlVw', 'Fu2ugbb3wnQ', '4qy_aEy_Qdw', 'vWneXOZai68', 'pCa8jOQJn-Y', 'vokDVp5uyu4', 'Q6_0_CWcupM', 'W0pLY4fSwb4', 'QvG8oK4RJD0', 'wRXnzhbhpHU', 'glBmSQRxaIg', 'KO6535-Qax8', 'wjWnV4broEk', 'UzpcPeoPnW0', '9Pd46JhQRCk', '9XMyL8YZ9jU', 'iE0_0Ponqok', 'ElHuyQefPeI', 'GtwtGeHH6bE', 'c2tyi

#### Non-English videos

In [10]:
# Detect if a transcript is not in English
from langdetect import detect

nonenglish_videos_ids = []
for idx, row in df_openAI_transcipts.iterrows():
    if row['Transcript'] is not None and detect(row['Transcript']) != 'en':
        print(row['Transcript'])
        nonenglish_videos_ids.append(row['Video ID'])

print("nonenglish_videos_ids: ", nonenglish_videos_ids)
print("Count: ", len(nonenglish_videos_ids))

nonenglish_videos_ids:  []
Count:  0


#### Long and very short videos

In [13]:
# getting IDs of long or very short videos
from transformers import AutoTokenizer

# Define the instruction and model
instruction = 'Analyze the following transcript of a YouTube video according to this question: Is the video clear what sources of information were used to compile the publication (other than the author)? Return explicitly an integer score from 1 to 5, where 1 is no, 2 to 4 is partially, 5 is yes. Then explain your choice.'
model_id = "jarradh/llama2_70b_chat_uncensored"
tokenizer = AutoTokenizer.from_pretrained(model_id, legacy=False, trust_remote_code=True)

# List to store video IDs of long transcripts
long_videos_ids = []
short_videos_ids= []

# Maximum token limit for the transcript
max_tokens_limit = 2000
min_tokens_limit = 150
# Iterate over each video transcript
for index, row in df_openAI_transcipts.iterrows():
    video_id = row['Video ID']
    transcript = row['Transcript']
    if transcript is not None:
        # Construct the prompt with the instruction and transcript
        prompt = f"Instruction: {instruction}\nTranscript: {transcript}\nScore: "

        # Tokenize the prompt
        tokens = tokenizer(prompt, return_length=True, return_tensors="pt")

        # Check if the token length exceeds the limit
        if tokens['length'] > max_tokens_limit:
            long_videos_ids.append(video_id)
        if tokens['length'] < min_tokens_limit:
            short_videos_ids.append(video_id)
        

print(f"long_videos_ids with tokens more than {max_tokens_limit}: ", long_videos_ids)
print("Count: ", len(long_videos_ids))
print(f"short_videos_ids with tokens less than {min_tokens_limit}: ", short_videos_ids)
print("Count: ", len(short_videos_ids))


long_videos_ids with tokens more than 2000:  ['HnkVcClkw2w', 'sIP3nMZSqZk', '0KCyxyYAOzk', '3Lv8c6CrVU0', 'TW3IkEo-dzU', 'khoUWFDCRVM', 'YjCo7qIbvrQ', 'sx03pp7qa7Q', 'fym_v7iQFDw', 'O5pBkSxlvrw', 'eFz78nDtXLE', 'r1hxxFjLXkk', 'GSL_BCrBx48', 'WN4nYdRh23M', 'l6eOE0eApNo', 'Um24ajmB2is', 'x0V5ffsGxIg', 'SwVcIz-mtxA', 'ScC2ozRwio0', 'Aq85mzTab-g', 'jsD3HC617KY', 'tLsapCY9wOM', 'D7f7lmrTWUQ', 'wRMcxRHr7Go', 'bOODigTjQ20', 'MICPIzv8ESo', 'U2BFeod_JMk', 'DxLlR3MXmqM', '5O_JMpwN8kc', 'MWpyQee_OWg', 'HXlKVVoVnpU', 'y6b4GeYY9sg', '4bOgmhb79X0', 'MaVAsuwYacE', '0eVu6GbL32U', '6xakdapRfRQ', 'ZLtAUy6GhNs', '01Udfnd84eo', 'l639vX0v57A', 'TAY1sYZ-2U0', 'CPqqOJOr0gc', 'YUnZNU99bAA', 'O_T6_xHTph0', 'JMfmDAJo3qc', 'WPxeI6R8AXY', '-w8nRpnBAlE', 'Ir2wro32sOs', 'IrUI3ekMo_U', 'FNonSMghN40', 'T7WrigyfBd8', 'MUOIedSaIUw', 'O8q72r2x5aU', '5vSYlv5a0k0', 'yb344xfIyxI', 'DvDaN3CSAUQ', 'lRxCRVb2IC0', 'bXGFI5s0bOY', 'dvklGFi-WD4', 'kUjc3UUgKa0', 'mQe51ov0XmA', '3VdaGyHHrXw', 'mvanzhgh5RQ', '3A7qIM6qknw', 'rWbuftKL

#### Combined IDs to delete

In [14]:
# IDs to delete
combined_ids_to_delete = set(no_speech_ids + None_videos_ids + nonenglish_videos_ids + long_videos_ids + short_videos_ids )

print("combined_ids_to_delete: ", combined_ids_to_delete)
print("Count: ", len(combined_ids_to_delete))

combined_ids_to_delete:  {'CC_j6Ki-T7w', '-q4jZ-_j6Ag', '5vSYlv5a0k0', 'kUjc3UUgKa0', 'fym_v7iQFDw', '-Ct0iWSEkdc', 'Oiar5UM1TwE', '3Lv8c6CrVU0', '43Y8lpnhWus', 'KVqZXoOnaeU', 'ZLtAUy6GhNs', 'cdToNKruJV4', 'P7372tMFOJg', 'fsLI12i5fvs', 'mQXmf_uo1yE', 'r1hxxFjLXkk', 'vokDVp5uyu4', 'Aq85mzTab-g', 'i9vTDi3Z8HU', 'GSL_BCrBx48', 'Q6_0_CWcupM', 'Um24ajmB2is', '9Pd46JhQRCk', 'U7Ulc4_OI5Q', 'EmE7zlF5dWQ', '3VdaGyHHrXw', 'jsD3HC617KY', 'mQe51ov0XmA', '_X_XCBcY17A', 'TK2hKjvBVmA', 'USxPGAn1RLs', '73cM2Rh0RLk', 'yb344xfIyxI', '6xakdapRfRQ', 'O_T6_xHTph0', 'aZmbUyKEfr4', 'au_jIGyPSnE', 'dy9ZqVuP9V8', 'CPqqOJOr0gc', 'NnqAkM9r2a8', 'wGmh5LZWmq0', '3A7qIM6qknw', 'YUnZNU99bAA', 'ElHuyQefPeI', 'DxLlR3MXmqM', 'TAY1sYZ-2U0', 'vMVrHIPHi6M', 'MICPIzv8ESo', '8CgYrgVc0C4', 'cZJopUlzBeA', 'HnkVcClkw2w', 'bOODigTjQ20', '4bOgmhb79X0', 'wRXnzhbhpHU', 'c2tyic4JHBU', 'eFz78nDtXLE', '5GrGVfcDnXk', 'tLsapCY9wOM', 'RVvB1PZJYpM', 'Gi3-L6HU3hw', 'B5eUjXaBsr8', 'JrfHKVlpb1A', 'YcsdjBrXylE', 'Dp1n_n9OLWI', 'bodiJCJq5wk',

<h2>Removing Unpleasant Videos</h2>

In [32]:
df_openAI_updated = df_openAI_transcipts[~df_openAI_transcipts['Video ID'].isin(combined_ids_to_delete)]
df_openAI_updated.reset_index(drop=True, inplace=True)
df_openAI_updated.index += 1
df_openAI_updated.loc[:, 'Number'] = df_openAI_updated.index

df_openAI_updated.tail(2)

Unnamed: 0,Index,URL,Video Title,Video ID,Date Published,Duration,Duration (sec),Topic,Transcript,Description
275,275,https://www.youtube.com/watch?v=ypjJ9yQ-5wI,Innovative Treatment for Pudendal Neuralgia by...,ypjJ9yQ-5wI,06-01-2020,1:37,97,Pudendal Nerve,My previous background as a pelvic surgeon has...,Dr. Todd Malan explains how he uses adipose-de...
276,276,https://www.youtube.com/watch?v=O3sS1HUFp4I,Acupuncture Used To Treat Pelvic Pain,O3sS1HUFp4I,15-07-2015,1:46,106,Pudendal Nerve,When it comes to using acupuncture to treat pa...,When it comes to using acupuncture to treat pa...


In [None]:
df_openAI_updated.to_csv("videos_info_with_OpenAI_transcripts_filtered.csv", index=False)