In [2]:
import pandas as pd
import regex as re

In [593]:
# Load the TED talks CSV and keep only the talk id and transcript columns.
tedtalks=pd.read_csv('~/data/ted_talks_en.csv')[['talk_id','transcript']]

### Cleaning Text

In [586]:
# Parenthesised group without inner parentheses; the last character before any
# trailing spaces is not a space. (Not referenced by the functions in this cell.)
parentheses=r'\([^)(]+[^)( ] *\)'
# Same as above, but the last non-space character must also not be one of . ! ? — - ,
# (Not referenced by the functions in this cell.)
parenthesestokeep=r'\([^)(]+[^)(.!?—\-, ] *\)'
# Speaker tag: preceded by "<char that is not a word char, space, quote or comma><space>",
# does not start on punctuation, consists of words that each start with A-Z, and ends in ':'.
speakertag=r'(?<=[^\w\d \",] )(?![?\.,!:\-\—\[\]\(\)])(?:[A-Z][^\s.?!\[\]\(\)]*\s?)*:(?=[^\w]*[A-Z])'#the lookahead leaves the colon in place unless a capitalised word follows.
# Parenthesised group whose content ends in word characters or spaces, plus an
# optional ':' right after the closing parenthesis. Used by removeparentheses.
parenthesestoremove=r'\(([^\w]*[^\(\)]+[\w ]+)\):?'
# Parenthesised group (content captured as group 1) plus an optional trailing ':'.
# Used by removeparenthesesaroundsentence, which keeps the content and drops the wrapper.
parenthesesaroundsentence=r'\(([^\w]*[^\(\)]+\W*)\):?'
# Square-bracketed group without inner brackets; content captured as group 1.
squarebracketsaroundsentence=r'\[([^\[\]]+)\]' #generic since it seems like the square brackets just denote unclear speech.

def displayinstances(col,exp):
    """Print the distinct matches of a pattern for every text in a collection.

    Parameters
    ----------
    col : iterable of str
        Texts to scan (e.g. a list or a pandas Series of transcripts).
    exp : str
        Regular expression handed to ``re.finditer``.

    For each text with at least one match, prints its position in ``col``
    followed by the set of matched strings, then prints ``--fin--``.
    """
    # Iterate positionally instead of using col[i]: on a pandas Series col[i]
    # is a label lookup and raises KeyError when the index is not 0..n-1
    # (for example after rows have been filtered out).
    for i, text in enumerate(col):
        found = {m.group() for m in re.finditer(exp, text)}
        if found:
            print(i, found)
    print('--fin--')

''' Speaker-tag removal: a term is removed when every word between the
    preceding punctuation mark (other than ") and the ':' starts with a
    capital letter, and the next word after the ':' is capitalised too.
    Drawback: this cannot tell a real speaker tag from a case where the
    following word is capitalised only because it is a proper noun; the
    prefix is then removed anyway, but the sentence syntax is not broken.
'''

def removespeakertags(text):
    """Return ``text`` with every match of the ``speakertag`` pattern replaced by one space."""
    replacement = ' '
    cleaned = re.sub(speakertag, replacement, text)
    return cleaned

def removeparentheses(text):
    """Return ``text`` with every match of ``parenthesestoremove`` replaced by one space."""
    replacement = ' '
    cleaned = re.sub(parenthesestoremove, replacement, text)
    return cleaned

def removeparenthesesaroundsentence(text):
    """Unwrap parenthesised text: each ``parenthesesaroundsentence`` match is
    replaced by its first capture group (the content between the parentheses)."""
    keep_inner = r'\g<1>'
    unwrapped = re.sub(parenthesesaroundsentence, keep_inner, text)
    return unwrapped

def removesquarebrackets(text):
    """Unwrap square-bracketed text: each ``squarebracketsaroundsentence`` match
    is replaced by its first capture group (the content between the brackets)."""
    keep_inner = r'\g<1>'
    unwrapped = re.sub(squarebracketsaroundsentence, keep_inner, text)
    return unwrapped

def removemusic(text):
    """Replace passages enclosed in music-note symbols with a single space.

    Handles both ``♫ ... ♫`` and ``♪ ... ♪`` delimiters. The enclosed body may
    contain anything except the delimiter itself, so a match always ends at
    the nearest closing note.

    Parameters
    ----------
    text : str
        Transcript text.

    Returns
    -------
    str
        ``text`` with every note-delimited passage replaced by ``' '``.
    """
    for note in '♫♪':
        # The body must exclude the same symbol that delimits it; otherwise the
        # greedy body runs through intermediate delimiters and swallows the
        # ordinary speech between two separate passages.
        text = re.sub(f'{note}( *[^{note} ])+ *{note}', ' ', text)
    return text

def reducewhitespaces(text):
    """Collapse every run of whitespace characters in ``text`` into one space."""
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)

def removeemptyquotes(text):
    """Replace quote/bracket pairs that enclose no word characters with a space.

    Applied in order: single quotes, parentheses, square brackets, double quotes.
    """
    empty_wrappers = (
        r"'[^\w\d]*'",
        r"\([^\w\d]*\)",
        r"\[[^\w\d]*\]",
        r'"[^\w\d]*"',
    )
    for wrapper in empty_wrappers:
        text = re.sub(wrapper, ' ', text)
    return text

def ellipsistounicode(text):
    """Convert runs of three or more dots into the single character '…'.

    A run is converted when it is followed by a space, by the end of the
    text, or by a punctuation character (which is kept). Runs followed
    directly by a word character are left untouched.

    Parameters
    ----------
    text : str
        Transcript text.

    Returns
    -------
    str
        ``text`` with the qualifying dot runs replaced by '…'.
    """
    # Ellipsis without trailing punctuation. '$' also covers an ellipsis that
    # is the very last thing in the text, which a bare "followed by a space"
    # lookahead would miss.
    text = re.sub(r'\.{3,}(?= |$)', '…', text)
    # Ellipsis with trailing punctuation. The replacement is a raw string so
    # that \g<1> is not an invalid string-literal escape.
    return re.sub(r'\.{3,}([^\w\s])', r'…\g<1>', text)

def removenonsentencepunct(text):
    """Replace stray symbols with a space.

    A character outside the allowed set (word characters, whitespace and the
    punctuation/currency symbols listed in the pattern) is replaced when it is
    not followed by a word character or not preceded by one; it survives only
    when it sits between two word characters (e.g. the apostrophe in "it's").
    """
    stray_symbol = re.compile(
        r'[^\w\d\s,.!?;:$#%&^+•=€²£¥…@\-\–\—\/](?!\w)|(?<!\w)[^\w\d\s,.!?;:$#%&^+•=€²£¥…@\-\–\—\/]'
    )
    return stray_symbol.sub(' ', text)

def combinerepeatedpunct(text):
    """Collapse immediately repeated non-word sequences until nothing changes.

    One pass replaces a non-word sequence that is followed (optionally after
    spaces) by one or more copies of itself with a single copy plus a space.
    Passes are repeated until the text reaches a fixed point, because one
    collapse can expose a new repetition.

    Parameters
    ----------
    text : str
        Transcript text.

    Returns
    -------
    str
        The text after the last pass that changed nothing.
    """
    def collapse(s):
        # Raw replacement string: '\g' is not a valid string-literal escape.
        return re.sub(r'([^\w\d]+) *\1+', r'\g<1> ', s)

    previous, current = text, collapse(text)
    while current != previous:
        previous, current = current, collapse(current)
    return current

def preprocess(tedtalks):
    """Run the full text-cleaning pipeline over a collection of transcripts.

    ``tedtalks`` must provide ``.apply(func)``; each cleaning step is applied
    to it in turn, after printing a progress message naming the step.
    Returns the result of the last step.
    """
    # (progress message, cleaning function) pairs, in execution order.
    pipeline = (
        ('removing speaker tags', removespeakertags),
        ('removing non-sentence parenthesis', removeparentheses),
        ('removing parenthesis', removeparenthesesaroundsentence),
        ('removing square brackets', removesquarebrackets),
        ('removing music lyrics', removemusic),
        ('removing empty tags', removeemptyquotes),
        ('change to unicode ellipsis', ellipsistounicode),
        ('removing non-sentence punctuation', removenonsentencepunct),
        ('combine repeated punctuation', combinerepeatedpunct),
        ('reduce whitespaces', reducewhitespaces),
    )
    for message, cleaner in pipeline:
        print(message)
        tedtalks = tedtalks.apply(cleaner)

    print('--done--')
    return tedtalks


In [594]:
# Clean every transcript and write the result back into the same column.
# Re-running this cell feeds already-cleaned text through the pipeline again.
tedtalks.transcript=preprocess(tedtalks.transcript)
# displayinstances(tedtalks,'[^\w\d\s]{2,}')
# displayinstances(tedtalks,r'[^\w\d\s,.!?;:$#%&^+•=€²£¥…@\-\–\—\/](?!\w)|(?<!\w)[^\w\d\s,.!?;:$#%&^+•=€²£¥…@\-\–\—\/]')
# displayinstances(tedtalks,r"' ")
# tedtalks[1148]

removing speaker tags
removing non-sentence parenthesis
removing parenthesis
removing square brackets
removing music lyrics
removing empty tags
change to unicode ellipsis
removing non-sentence punctuation
combine repeated punctuation
reduce whitespaces
--done--


In [599]:
# tedtalks=tedtalks[tedtalks.transcript.apply(lambda x:len(x.split()))>4]
# Display the frame after cleaning (the notebook shows a truncated view).
tedtalks

Unnamed: 0,talk_id,transcript
0,1,"Thank you so much, Chris. And it's truly a gre..."
1,92,"About 10 years ago, I took on the task to teac..."
2,7,"Hello voice mail, my old friend. I've called ..."
3,53,If you're here today — and I'm very happy that...
4,66,Good morning. How are you? Good. It's been gre...
...,...,...
4000,62678,"I'm 14, and I want to go home. My name is Bet..."
4001,62782,"In 1905, psychologists Alfred Binet and Théodo..."
4002,62263,Picture yourself driving down the road tomorro...
4003,62784,"In early 1828, Sojourner Truth approached the ..."


In [600]:
import xml.etree.ElementTree as ET
# Parse one OpenSubtitles XML file and flatten it into a single string:
# the stripped text fragments of each child of the root are concatenated into
# one row, and all rows except the last are joined with spaces.
tree=ET.parse('/home/nxingyu/data/OpenSubtitles/raw/en/2012/2402471/4682144.xml')
# tree=ET.parse('/home/nxingyu/data/OpenSubtitles/raw/en/1497/3204044/5919971.xml')
rows=[
    ''.join(fragment.strip() for fragment in element.itertext())
    for element in tree.getroot()
]
' '.join(rows[:-1])

'(Police radio chatter) (Radio chatter continues, indistinct conversations) (Exhales) (Indistinct conversations) (Shutter clicking) Hey, Jane. Got a positive I.D. on the victim. Viktor Mendelssohn. 62. He was a diamond cutter. He has a shop in the San Francisco Diamond district. Same as it ever was-- more bodies, more death. Not exactly the same. This one was eviscerated with an electric rotary saw. Huh. Excuse me. Your toe. Careful. Sorry. Is it me, or does Jane seem a little bit off since the feds took Lorelei? Well, he spent six months setting a trap for Red John. He has nothing to show for it. How would you feel? Maybe we should invite him out for a night with the boys. (Sighs) All right, you ask him. (Clicks, flame whooshes) Well, I-I came out this morning to snip my herbs. It\'s always best to do that right after the morning dew evaporates. (Voice breaking) And when I didn\'t see Viktor, I got a really bad feeling. Do you usually see him in the morning? Every morning. We chat ove

In [630]:
# Load the flattened subtitle transcripts; names= supplies the two column labels.
opensubtitles=pd.read_csv("./data/opensubtitles.csv",sep=',',names=['filenames','transcript'])
# Display the frame.
opensubtitles

Unnamed: 0,filenames,transcript
0,./OpenSubtitles/raw/en/0/1084944/3377035.xml,JUNG Yoo-seok/KO Eun-a CHAE Min-seo/JUNG Young...
1,./OpenSubtitles/raw/en/0/1089124/4995691.xml,I never dreamed before I'm gonna knock the doo...
2,./OpenSubtitles/raw/en/0/1089377/4639810.xml,Susan Polgar lives and works in New York City....
3,./OpenSubtitles/raw/en/0/1100050/6225735.xml,Beautiful. Lollia! Where did you find her? . W...
4,./OpenSubtitles/raw/en/0/1101409/5822576.xml,"OTOGI ZOSHI Traveler, you must not look back W..."
...,...,...
1475,./OpenSubtitles/raw/en/1935/26301/3697912.xml,THE DAY YOU FALL IN LOVE WITH ME Just a moment...
1476,./OpenSubtitles/raw/en/1935/26338/4112002.xml,"NOW, HERE'S GERMANY, AND HERE'S FRANCE. AND OV..."
1477,./OpenSubtitles/raw/en/1935/26362/3445327.xml,STAN AND HARDY IN THE FIXER UPPERS Good mornin...
1478,./OpenSubtitles/raw/en/1935/26388/4184591.xml,"Curt Devlin, Express. Okay. Hi, Curt. I though..."


In [633]:
# encrtedtalks=preprocess(encrtedtalks)
# tedtalks[tedtalks.apply(lambda x:len(x.split()))<1]
# tedtalks.dropna(inplace=True)
#.describe()
# tedtalks[tedtalks.transcript.apply(lambda x:len(x.split()))<4]
# Inspect the full transcript stored under index label 1478.
opensubtitles.transcript[1478]

"Curt Devlin, Express. Okay. Hi, Curt. I thought I left you in that bird cage at the office. What's a reporter without his cameraman? He's happy. I can get a swell picture.  I can put it up my sleeve.  The last time you did that you got a lovely close-up of your elbow. Now go away and hide someplace and play hermit, will you? Aw, Curt, have a heart. I haven't missed one of these clam bakes in years. Well, this is one you'll miss. Okay, okay, I hope when the dame sits down they blow a fuse. That's all I hope. Hey, you better have one. You look pale. I'm all right. What's that on your forehead, dew? Aw, shut up. What time is it? 11:35. Well, if this isn't a nice way to make a living. You don't think this Gaye dame will take on, do you? Oh, probably. The last one I saw screamed all the way to the chair. That's what I heard about dames. They always dramatize everything. It would have been a good idea if that dame had committed suicide before she got to the dance hall. Then we wouldn't have

In [502]:
def report(index,token):
    """Print ``index`` and ``token``, but only when ``index`` is a multiple of 50."""
    is_checkpoint = index % 50 == 0
    if not is_checkpoint:
        return
    print(index, token)

def process(token, index, report=None):
    """Return ``token`` unchanged, first passing ``(index, token)`` to the
    optional ``report`` callback when one is given."""
    if not report:
        return token
    report(index, token)
    return token
# For each entry of encrtedtalks, record whether it occurs verbatim in any
# element of tedtalks, printing progress through report every 50 entries.
# regex=False forces a literal substring test: by default str.contains treats
# its argument as a regular expression, so transcript characters such as
# ( ) ? . * would be interpreted as regex syntax and could raise or mismatch.
# NOTE(review): this uses tedtalks.str, so tedtalks is presumably a Series of
# transcripts here, and encrtedtalks is defined outside this section — confirm.
overlap=[process(tedtalks.str.contains(talk,regex=False).any(),i,report) for i,talk in enumerate(encrtedtalks)]

0 True
50 True
100 False
150 True
200 True
250 True
300 True
350 True
400 False
450 True
500 False
550 True
600 False
650 False
700 True
750 True
800 True
850 True
900 True
950 True
1000 True
1050 True
1100 True
1150 True
1200 True
1250 False
1300 True
1350 True
1400 False
1450 True
1500 False
1550 False
1600 False
1650 False
1700 True
1750 False
1800 False
1850 True
1900 True
1950 True
2000 True
2050 True
2100 True
2150 False
2200 False
2250 False
2300 False
2350 True
2400 True
2450 True
2500 True
2550 False
2600 False
2650 True
2700 True
2750 True
2800 True
2850 True
2900 True
2950 True
3000 True
3050 True
3100 True
3150 False
3200 False
3250 False
3300 True
3350 True
3400 True
3450 False
3500 True
3550 True
3600 True
3650 True
3700 True
3750 False
3800 True
3850 True
3900 True
3950 True
4000 True
4050 False
4100 True
4150 True
4200 False
4250 False
4300 True
4350 True
4400 True
4450 True
4500 True
4550 True
4600 False
4650 True
4700 True
4750 True
4800 True
4850 True
4900 True
4950 

KeyboardInterrupt: 

In [503]:
# Fraction of entries flagged True in overlap; needs the cell that builds
# overlap to have run to completion first.
sum(overlap)/len(overlap)

NameError: name 'overlap' is not defined

In [300]:
# tedtalks[1315].replace('â\x80\x94','winter.')
# tedtalks[1900].replace('\x80\x93','')
# tedtalks[3493]
# !wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt
from tokenizers import BertWordPieceTokenizer
# Build a WordPiece tokenizer from a local vocabulary file (see the commented
# wget line above for its source) and encode one entry of tedtalks.
# NOTE(review): the vocabulary path is machine-specific, and tedtalks[1]
# presumably indexes a Series of transcripts — confirm.
tokenizer = BertWordPieceTokenizer("/home/nxingyu/project/bert-base-uncased-vocab.txt")
encoded_output = tokenizer.encode(tedtalks[1])

# tedtalks[1]
# tk=BertTokenizer.from_pretrained('bert-large-uncased')
# tk.tokenize("don't")


In [171]:
def removemusic(text):
    """Replace ``♫ ... ♫`` passages with a single space.

    Experimental variant: the body must start with a non-♫ character followed
    by one or more word characters, digits, punctuation (``\\p{P}``) or spaces.
    Running this cell rebinds ``removemusic``, replacing the definition used
    by the cleaning pipeline earlier in the notebook.
    """
    lyric_passage = r'♫([^♫][\w\d\p{P} ]+)♫'
    return re.sub(lyric_passage, ' ', text)

# Total whitespace-separated token count before vs. after applying removemusic.
# NOTE(review): x.split() implies tedtalks holds one string per element here — confirm.
tedtalks.apply(lambda x:len(x.split())).sum(),tedtalks.apply(removemusic).apply(lambda x:len(x.split())).sum()

(7156353, 7140657)