In [1]:
import pandas as pd
import numpy as np
import glob
from ast import literal_eval
from datetime import datetime
import pickle

## Notebook display limits: render up to 20 columns and up to 2000 rows
## (set_option accepts several option/value pairs in one call)
pd.set_option(
    "display.max_columns", 20,
    "display.max_rows", 2000,
)

%load_ext autoreload
%autoreload 2

### Helper functions (TODO: move these into a separate .py module)

In [2]:
def read_and_merge(path):
    '''
    Read every csv file matching a glob pattern and stack them into one dataframe.

    Input: Glob pattern matching the csv files to load (e.g. 'some/dir/*.csv')
    Output: Merged dataframe of all matching csv files; every file keeps its own
            row index, so index labels may repeat in the result
    Raises: FileNotFoundError if the pattern matches no file
    '''
    ## Load every matching csv file into its own dataframe.
    ## NOTE: glob.glob() returns files in arbitrary (filesystem-dependent) order;
    ## the order is deliberately left as-is so existing results do not change.
    df_list = [pd.read_csv(str(file)) for file in glob.glob(path)]

    ## Fail with a clear message instead of an IndexError on an empty list
    if not df_list:
        raise FileNotFoundError('No csv files matched the pattern: ' + str(path))

    ## Stack all dataframes with one concat call (pairwise concat in a loop
    ## re-copies the accumulated rows on every iteration)
    return pd.concat(df_list)

In [3]:
def df_preprocessing_pipeline(df):
    '''
    First cleaning pass over the raw video dataframe.

    Input: Dataframe with 'Transcript', 'Video_ID' and 'Duration' columns
    Output: Filtered dataframe (row index labels are kept, not reset)
    '''
    ## Keep only the rows that have a transcript
    has_transcript = df['Transcript'].notna()
    with_transcript = df[has_transcript]

    ## Keep the first occurrence of each video id
    unique_videos = with_transcript.drop_duplicates(subset=['Video_ID'])

    ## Keep videos of at least 1 minute: either the duration string has 5+
    ## characters, or its 4th character from the end is not '0'
    ## (so a string such as '0:45' is removed, '1:23' is kept).
    ## NOTE(review): rows with a missing Duration appear to pass this filter
    ## because NaN != '0' -- confirm that this is intended.
    durations = unique_videos['Duration']
    is_long_string = durations.str.len() >= 5
    minutes_not_zero = durations.str[-4] != '0'
    kept_videos = unique_videos[is_long_string | minutes_not_zero]

    return kept_videos
    

In [4]:
def df_preprocessing_pipeline2(df):
    '''
    Second cleaning pass: normalise the 'Upload Date' and 'Duration' columns.

    Input: Dataframe whose 'Upload Date' values are strings such as
           'Jan 5, 2021' (optionally prefixed with 'Premiered ' or
           'Streamed live on ') and whose 'Duration' values are strings
           formatted 'M:SS' or 'H:MM:SS'
    Output: New dataframe (the input is not modified) with the index reset to
            0..n-1, 'Upload Date' as datetime.date and 'Duration' as minutes
            (float). Rows whose upload date contains 'hours ago' are removed.
    Raises: ValueError if a cleaned date does not match '%b %d %Y' or a
            duration does not have 2 or 3 colon-separated fields
    '''
    def parse_upload_date(raw_date):
        ## Remove commas and the 'Premiered ' / 'Streamed live on ' prefixes,
        ## leaving e.g. 'Jan 5 2021', then parse it into a date
        cleaned = raw_date
        for noise in (',', 'Premiered ', 'Streamed live on '):
            cleaned = cleaned.replace(noise, '')
        return datetime.strptime(cleaned, '%b %d %Y').date()

    def convert_to_minutes(duration):
        ## Split once and decide between M:SS and H:MM:SS by the number of
        ## fields rather than by the length of the string
        parts = [int(part) for part in duration.split(':')]
        if len(parts) == 3:
            hours, minutes, seconds = parts
            return (hours * 60) + minutes + (seconds / 60)
        if len(parts) == 2:
            minutes, seconds = parts
            return minutes + (seconds / 60)
        raise ValueError('Unrecognised duration format: ' + repr(duration))

    ## Drop videos without clear upload date (e.g. 'Streamed live 5 hours ago').
    ## A boolean mask removes exactly the matching rows even when index labels
    ## repeat; na=False treats a missing date as "not relative".
    is_relative_date = df['Upload Date'].str.contains('hours ago', na=False)
    df = df[~is_relative_date].reset_index(drop=True)

    ## Change upload date string to datetime format
    df['Upload Date'] = df['Upload Date'].apply(parse_upload_date)

    ## Change duration string to minutes format
    df['Duration'] = df['Duration'].apply(convert_to_minutes)

    return df

In [5]:
def grab_transcript_text(df):
    '''
    Input: Dataframe with raw values in the Transcript column; each value is the
           string form of a list of dictionaries that each have a 'text' key
    Output: New dataframe with only text in the Transcript column, in a string
            format (texts joined with single spaces). The input dataframe is
            not modified, so calling this again on the same input is safe.
    '''
    ## Create a function for grabbing the text values and putting into a single string
    def transcript_text_string(transcript_dict_list):
        ## literal_eval only parses Python literals, it does not execute code
        segments = literal_eval(transcript_dict_list)
        return ' '.join(segment['text'] for segment in segments)

    ## assign() returns a new dataframe instead of overwriting the caller's column
    return df.assign(Transcript=df['Transcript'].apply(transcript_text_string))

In [6]:
def pickle_df(df_str, df=None):
    '''
    Input: Name of a dataframe in a string format. Optionally the dataframe
           itself (df); when omitted, the name is looked up among the global
           variables of the module/notebook that defines this function.
    Output: Pickle the dataframe into the Data folder as
            '../Data/<df_str>.pickle'
    Raises: NameError if df is omitted and no global variable is named df_str
    '''
    ## Resolve the object BEFORE opening the file: opening with 'wb' truncates
    ## the target, so a mistyped name must not leave an empty pickle behind.
    ## A plain globals lookup replaces eval(), which would execute any
    ## expression contained in the string.
    if df is None:
        try:
            df = globals()[df_str]
        except KeyError:
            raise NameError("name '" + str(df_str) + "' is not defined") from None

    with open('../Data/'+ df_str +'.pickle', 'wb') as f_video_data:
        pickle.dump(df, f_video_data)

### Read in the CSVs and merge them

In [7]:
## Glob pattern selecting every scraped csv file.
## NOTE(review): this is an absolute, machine-specific path -- consider a path
## relative to the project so the notebook runs on other machines.
RAW_CSV_GLOB = '/Users/mike/Desktop/GitHub Repositories/project5_youtube_recommender/Data/*.csv'

df_videos_raw = read_and_merge(RAW_CSV_GLOB)

### Preprocess the dataframe to remove rows with missing transcripts (NaNs), duplicate videos, videos with duration < 1 minute, and irrelevant videos not related to investing

In [8]:
## Run the first cleaning pass, then renumber the surviving rows 0..n-1
df_videos_cleaned_v1 = (
    df_preprocessing_pipeline(df_videos_raw)
    .reset_index(drop=True)
)

In [9]:
## Drop irrelevant videos not related to stock investing

## Case-sensitive substrings that mark a title as off-topic
IRRELEVANT_TITLE_KEYWORDS = [
    'Forex',
    'Crypto',
    'Brand Moat',
    'Richard Moat',
    'Clips From The Moat',
    'Geraghty-Moats',
    'Bitcoin',
]

## Ids of individual videos to remove (hand-picked list)
IRRELEVANT_VIDEO_IDS = [
    'gx6JOPTWznQ', 'drGxSXK8Jwc', 'u7KkCSpuBLo', 'wM7uNy_ColA',
    'OEUWbKSXYD0', 'UDRgo-FLGh0', '571EoMTCUSw', 'CV39QzFpJx4',
    'Mak8d_JSgyU', 'YUX3Y8ONuB4', 'WyVnrJq46Iw', 'I4AXdn1w15c',
    'hnISD7EKV-A', 'PdTBLPL2yos', 'YFNjyIcnJSw', 'b68xiEj9iyc',
    '2wcnuRSMURw', 'HL-DFblqDxc', 'TzWh3hBJZEs', 'PzWIch-kh3c',
    'XXsoswLThAI', '7WL1Vw4g8OA', 'DCkyNT39TUA', 'NC2HECcxYjk',
    'oQx2TNvnO7Q', 'x7npmocd2Gk', 'N7t7sI8UPDI', 'AQRLlMrwo7U',
    '5FtSC0RqmfQ', 'S_h2mHRe_hE', '9QBoh4U5WC4', 'u-JlA-fk_xM',
    'u7RO3Eeu1PI', '_grMQithgA0', 'ug2_GWkbF1o', 'brtr9vSxh7M',
    'fyi9g5AkBkY', 'YDZzvisSsvQ', '79GibY7iTZg', 'oYU89WNmH8g',
    'PESfQZiv5KQ', 'PK5b6Zd7thg', '9_nf0zaLy2Y', 'hw0KmIZi2g8',
    'CUCzCVa_7cY', 'lbOvcKqEdU0', 'xy0aNUVLPpY', 'YV9e6tYWgIU',
    'sF0hIo0VAuk', 'CTTKFUzeWI0', 'ts-TptT_NZI', 'iHxAPsmos4M',
    'IFYrFeAygfc', 'LgXL6q7vdm8', 'TwGAQ2tuJPg', 'W0kzME-kSFk',
    'Q6e6oCXbbPo', 'kuiUq_TCRJ0', 'M_SiX-ASntY', 'xjE311nBsXg',
    'ejn--JYXsNU', 'KYePUROUoAs', 'jJjtYb-FhYU', 'kebonpz4bD0',
    '_gm8YzjDkeI', 'k6xfTJzoEtY', 'CgOfNkP8kJo', 'rW9P5EyiiMU',
    '2MQ46Es9rTU', 'KZ9Wf9wP0UM', 'A7yyPn3_18E', 'Wtcz4zi1IK4',
    'Y0amF_F6EvI', 'e9KXzTdqV80', 'ViFgKUbGxHo', 'rGKBRacvasg',
    'ra9SufwwaRk', 'HCBbJKs5SgA', '5HOsl9WEvX8', 'Dq1ddyBapRA',
    '0FNHMZJ2A0k', 'efQmjeCZiiU', 'kjz1YPA0WTI', 'YAdPCXpwj6I',
    'NftVyw7gQw0', 'osy4iN_ae-E', 'ED8x64yOrQY', 'VIoO953WF5c',
    'SSj__PzxOu4', 'dEBmJ1XsMYI', '1PZkupf3Y7k',
]

## One membership test replaces the long chain of per-id equality checks
is_irrelevant = df_videos_cleaned_v1['Video_ID'].isin(IRRELEVANT_VIDEO_IDS)

## regex=False matches the keyword as a literal substring;
## na=False makes a missing title count as "no match"
for keyword in IRRELEVANT_TITLE_KEYWORDS:
    is_irrelevant = is_irrelevant | df_videos_cleaned_v1['Title'].str.contains(keyword, regex=False, na=False)

rows_to_drop = df_videos_cleaned_v1[is_irrelevant].index

df_videos_cleaned_v2 = df_videos_cleaned_v1.drop(rows_to_drop).reset_index(drop=True)

In [10]:
## Manual 'Duration' overrides, keyed by row label of df_videos_cleaned_v2
## (labels are the 0..n-1 positions produced by the reset_index in the
## previous cell).
## NOTE(review): presumably these are rows whose scraped duration was missing
## or wrong -- confirm. The labels depend on the exact row order of the input
## data, so they must be re-checked whenever the csv files change.
DURATION_FIXES = {
    3: '15:00', 49: '6:10', 63: '9:39', 64: '7:23',
    83: '12:57', 258: '10:29', 285: '24:28', 288: '7:42',
    304: '3:15', 315: '17:24', 316: '5:54', 337: '6:59',
    363: '13:08', 372: '8:13', 380: '6:21', 394: '3:17',
    412: '6:18', 427: '8:03', 460: '25:40', 475: '8:10',
    488: '19:23', 502: '9:18', 504: '32:05', 515: '16:27',
    516: '24:17', 518: '15:31', 519: '22:41', 535: '18:04',
    550: '13:43', 562: '26:09', 576: '25:22', 577: '11:40',
    588: '3:36', 589: '1:43', 602: '39:18', 603: '8:49',
    615: '8:59', 628: '8:03', 637: '3:13', 649: '17:34',
    664: '8:21', 665: '11:16', 689: '18:10', 716: '36:14',
    718: '11:04', 719: '37:35', 731: '13:29', 742: '3:34',
    751: '18:33', 754: '8:36', 755: '2:45', 766: '19:40',
    773: '12:46', 790: '5:55', 792: '16:43', 794: '12:31',
    802: '10:44', 803: '16:35', 804: '9:22', 812: '46:42',
    820: '44:46', 821: '16:21', 822: '16:52', 839: '25:31',
    840: '4:59', 866: '31:50', 867: '9:10', 868: '4:57',
    881: '12:41', 882: '1:03:08', 894: '8:17', 895: '6:36',
    896: '51:25', 915: '22:36', 928: '6:59', 943: '15:24',
    978: '16:08', 992: '17:01', 993: '10:17', 1008: '14:46',
    1028: '5:45', 1038: '1:06:05', 1054: '10:28', 1063: '43:08',
    1067: '18:49', 1068: '5:03', 1086: '6:26', 1124: '15:15',
    1126: '15:52', 1141: '10:09', 1143: '9:31', 1144: '10:02',
    1145: '6:11', 1158: '57:58', 1161: '10:00', 1172: '5:38',
    1173: '16:36', 1174: '9:51', 1184: '20:53', 1185: '38:55',
    1194: '16:38', 1206: '11:44', 1217: '17:29', 1218: '12:08',
    1219: '13:45', 1230: '13:06', 1240: '30:08', 1241: '14:58',
    1253: '8:24', 1263: '9:10', 1264: '8:15', 1273: '3:14',
    1284: '36:26', 1292: '7:24', 1304: '3:51', 1313: '9:04',
    1314: '21:44', 1332: '6:25', 1359: '5:59', 1380: '18:21',
    1382: '2:13', 1399: '18:14', 1401: '12:15', 1409: '8:36',
    1415: '23:02', 1429: '4:23', 1430: '14:05', 1440: '26:47',
    1441: '18:58', 1455: '1:02:33', 1456: '20:11', 1471: '8:54',
    1472: '11:09', 1487: '21:15', 1488: '13:23', 1500: '23:31',
    1502: '24:46', 1503: '19:26', 1514: '7:13', 1515: '26:23',
    1516: '3:57', 1528: '8:02', 1529: '44:34', 1530: '12:12',
    1544: '12:27', 1545: '46:04', 1546: '8:13', 1547: '44:52',
    1563: '18:31', 1585: '2:25', 1586: '11:47', 1587: '11:41',
    1598: '14:55', 1599: '24:03', 1600: '17:47', 1617: '18:41',
    1619: '17:26', 1620: '17:59', 1633: '6:24', 1660: '11:14',
    1664: '13:54', 1700: '15:30', 1701: '1:01:08', 1713: '24:15',
    1714: '14:41', 1716: '26:05', 1730: '9:14', 1738: '12:21',
    1740: '58:47', 1753: '7:31', 1754: '3:50', 1755: '14:15',
    1770: '25:06', 1793: '13:44', 1794: '18:29', 1804: '30:31',
    1816: '8:09', 1818: '10:03', 1833: '9:02', 1834: '11:11',
}

## Apply the overrides one row at a time, in the order listed above
for row_label, duration in DURATION_FIXES.items():
    df_videos_cleaned_v2.loc[row_label, 'Duration'] = duration

## Additional videos removed by id (no reason is recorded in the notebook)
EXTRA_VIDEO_IDS_TO_DROP = [
    'sKueO5LQoV8',
    'olhzNmgd0O4',
    'S5ex8CKVnEo',
    'ptHTiHiJMIU',
    'A3n6rGYsmME',
    'zkCg8p1HeKc',
]

rows_to_drop = df_videos_cleaned_v2[df_videos_cleaned_v2['Video_ID'].isin(EXTRA_VIDEO_IDS_TO_DROP)].index

## NOTE: this reassigns df_videos_cleaned_v2 with fewer rows and a fresh index,
## so running this cell a second time (without re-running the previous cell)
## would apply the duration overrides above to shifted, i.e. wrong, rows.
df_videos_cleaned_v2 = df_videos_cleaned_v2.drop(rows_to_drop).reset_index(drop=True)

In [11]:
## Run the second cleaning pass (upload dates and durations), then renumber
## the rows 0..n-1
df_videos_cleaned_v3 = (
    df_preprocessing_pipeline2(df_videos_cleaned_v2)
    .reset_index(drop=True)
)

### Grab only the text from the Transcript column (each cell contains a list of dictionaries with text, start time, and duration)

In [12]:
## Replace every raw transcript value with its caption texts joined into one string
df_videos_cleaned_v4 = grab_transcript_text(df=df_videos_cleaned_v3)

### Pickle the cleaned dataframe 

In [13]:
## pickle_df takes the variable NAME as a string (not the dataframe itself)
## and writes '../Data/df_videos_cleaned_v4.pickle'
pickle_df(df_str='df_videos_cleaned_v4')