# Preprocessing

In [9]:
import pandas as pd
import regex as re

In [137]:
tedtalks=pd.read_csv('~/data/ted_talks_processed.csv')[['talk_id','transcript']]
len(tedtalks)

3987

In [118]:
tedtalks[tedtalks.transcript.map(lambda x:len(x.split())<10)]
# tedtalks.transcript[761]

Unnamed: 0,talk_id,transcript


### Cleaning Text

In [32]:
parentheses=r'\([^)(]+[^)( ] *\)'
parenthesestokeep=r'\([^)(]+[^)(.!?—\-, ] *\)'
speakertag=r'(?<=[^\w\d \",]) *(?![?\.,!:\-\—\[\]\(\)])(?:[A-Z\d][^\s.?!\[\]\(\)]*\s?)*:(?=[^\w]*[A-Z])'#lookahead keeps semicolon in false cases.
parenthesestoremove=r'\(([^\w]*[^\(\)]+[\w ]+)\):?'
parenthesesaroundsentence=r'\(([^\w]*[^\(\)]+\W*)\):?'
squarebracketsaroundsentence=r'\[([^\[\]]+)\]' #generic since it seems like the square brackets just denote unclear speech.

def displayinstances(col,exp):
    for i in range(len(col)):
    #     temp={x.group() for x in re.finditer( , tedtalks[i])}
        temp={x.group() for x in re.finditer(exp, col[i])}
        if len(temp)!=0:print(i,temp)
    print('--fin--')

''' Identifies term to remove if the words from the previous 
    punctuation (except ") through : until the next word all 
    begins with a caps. Drawback:This doesnt properly capture 
    places where the following term is caps due to it being a 
    proper noun, where the prefix will be removed regardless 
    but will not break the syntax.
'''

def removespeakertags(text):
    return re.sub(speakertag,' ',text)

def removeparentheses(text):
    return re.sub(parenthesestoremove, ' ',text)

def removeparenthesesaroundsentence(text):
    return re.sub(parenthesesaroundsentence,r'\g<1>',text)

def removesquarebrackets(text):
    return re.sub(squarebracketsaroundsentence, r'\g<1>',text)

def removemusic(text):
    text = re.sub(r'♫( *[^♫ ])+ *♫', ' ',text)
    return re.sub(r'♪( *[^♫ ])+ *♪', ' ',text)

def reducewhitespaces(text):
    text=re.sub(r'(?<=[.?!,;:\—\-]) *(?=[.?!,;:\—\-])','',text)
    return re.sub(r'\s+', ' ',text)

def removeemptyquotes(text):
    text= re.sub(r"'[^\w\d]*'",' ',text)
    text= re.sub(r"\([^\w\d]*\)",' ',text)
    text= re.sub(r"\[[^\w\d]*\]",' ',text)
    return re.sub(r'"[^\w\d]*"',' ',text)

def ellipsistounicode(text):
    text = re.sub(r'\.{3,}(?= )','…',text) #ellipsis without trailing punctuation
    return re.sub(r'\.{3,}([^\w\s])','…\g<1>',text) #ellipsis with trailing punctuation

def removenonsentencepunct(text):
    return re.sub(r'[^\w\d\s,.!?;:$#%&^+•=€²£¥…@\-\–\—\/](?!\w)|(?<!\w)[^\w\d\s,.!?;:$#%&^+•=€²£¥…@\-\–\—\/]',' ',text)

def combinerepeatedpunct(text):
    newtext=[text,re.sub(r'([^\w\d]+) *\1+','\g<1> ',text)]
    i=1
    while (newtext[0]!=newtext[1]):
        i+=1
        newtext[i%2]=re.sub(r'([^\w\d]+) *\1+','\g<1> ',newtext[(1+i)%2])
    return newtext[i%2]

def preprocess(tedtalks):
    print('removing speaker tags')
    tedtalks=tedtalks.apply(removespeakertags)
    
    print('removing non-sentence parenthesis')
    tedtalks=tedtalks.apply(removeparentheses)
    
    print('removing parenthesis')
    tedtalks=tedtalks.apply(removeparenthesesaroundsentence)
    
    print('removing square brackets')
    tedtalks=tedtalks.apply(removesquarebrackets)
    
    print('removing music lyrics')
    tedtalks=tedtalks.apply(removemusic)
    
    print('removing empty tags')
    tedtalks=tedtalks.apply(removeemptyquotes)
    
    print('change to unicode ellipsis')
    tedtalks=tedtalks.apply(ellipsistounicode)
    
    print('removing non-sentence punctuation')
    tedtalks=tedtalks.apply(removenonsentencepunct)
    
    print('combine repeated punctuation')
    tedtalks=tedtalks.apply(combinerepeatedpunct)
    
    print('reduce whitespaces')
    tedtalks=tedtalks.apply(reducewhitespaces)
    
    print('--done--')
    return tedtalks


In [594]:
tedtalks.transcript=preprocess(tedtalks.transcript)
# displayinstances(tedtalks,'[^\w\d\s]{2,}')
# displayinstances(tedtalks,r'[^\w\d\s,.!?;:$#%&^+•=€²£¥…@\-\–\—\/](?!\w)|(?<!\w)[^\w\d\s,.!?;:$#%&^+•=€²£¥…@\-\–\—\/]')
# displayinstances(tedtalks,r"' ")
# tedtalks[1148]

removing speaker tags
removing non-sentence parenthesis
removing parenthesis
removing square brackets
removing music lyrics
removing empty tags
change to unicode ellipsis
removing non-sentence punctuation
combine repeated punctuation
reduce whitespaces
--done--


In [599]:
# tedtalks=tedtalks[tedtalks.transcript.apply(lambda x:len(x.split()))>4]
tedtalks

Unnamed: 0,talk_id,transcript
0,1,"Thank you so much, Chris. And it's truly a gre..."
1,92,"About 10 years ago, I took on the task to teac..."
2,7,"Hello voice mail, my old friend. I've called ..."
3,53,If you're here today — and I'm very happy that...
4,66,Good morning. How are you? Good. It's been gre...
...,...,...
4000,62678,"I'm 14, and I want to go home. My name is Bet..."
4001,62782,"In 1905, psychologists Alfred Binet and Théodo..."
4002,62263,Picture yourself driving down the road tomorro...
4003,62784,"In early 1828, Sojourner Truth approached the ..."


In [600]:
import xml.etree.ElementTree as ET
tree=ET.parse('/home/nxingyu/data/OpenSubtitles/raw/en/2012/2402471/4682144.xml')
# tree=ET.parse('/home/nxingyu/data/OpenSubtitles/raw/en/1497/3204044/5919971.xml')
rows=[]
for child in tree.getroot():
    rows.append(''.join([x.strip() for x in list(child.itertext())]))
' '.join(rows[:-1])

'(Police radio chatter) (Radio chatter continues, indistinct conversations) (Exhales) (Indistinct conversations) (Shutter clicking) Hey, Jane. Got a positive I.D. on the victim. Viktor Mendelssohn. 62. He was a diamond cutter. He has a shop in the San Francisco Diamond district. Same as it ever was-- more bodies, more death. Not exactly the same. This one was eviscerated with an electric rotary saw. Huh. Excuse me. Your toe. Careful. Sorry. Is it me, or does Jane seem a little bit off since the feds took Lorelei? Well, he spent six months setting a trap for Red John. He has nothing to show for it. How would you feel? Maybe we should invite him out for a night with the boys. (Sighs) All right, you ask him. (Clicks, flame whooshes) Well, I-I came out this morning to snip my herbs. It\'s always best to do that right after the morning dew evaporates. (Voice breaking) And when I didn\'t see Viktor, I got a really bad feeling. Do you usually see him in the morning? Every morning. We chat ove

In [108]:
for c in ".?!,;:-—":
    print(c,sum(tedtalks.transcript.str.count("\\"+c)))

. 410157
? 36774
! 3277
, 494575
; 5846
: 12950
- 38065
— 33375


In [121]:
# opensubtitles=pd.read_csv("./data/opensubtitles.csv",sep=',',names=['filenames','transcript'])
# opensubtitles
displayinstances(tedtalks.transcript,'[.?!,;:\—]{2,}')
# displayinstances(tedtalks.transcript,'\w+[.?!,;:\—] +[.?!,;:\— ]+ +[A-Z][a-z]+')

7 {'!?'}
12 {'!—', '.—', '.,'}
21 {'?—'}
22 {'.,'}
24 {'.—'}
28 {'?—'}
29 {'?—'}
32 {'.,'}
42 {'?—'}
45 {'.,'}
46 {'!—'}
55 {'.,'}
58 {'.,'}
59 {'.—'}
60 {'!—'}
61 {'.,'}
66 {',—'}
68 {'.,'}
78 {'.,'}
82 {'.,'}
84 {'!—'}
85 {'.—'}
86 {'.—', '.,'}
92 {'.—'}
93 {'.—'}
97 {'.—', '.,'}
104 {'.,', '.:'}
119 {'.,'}
125 {'.,', '?—'}
133 {'.,'}
134 {'?—'}
135 {'.,'}
139 {'.,'}
141 {'.,'}
142 {'?.', '.,'}
149 {'?—'}
152 {'!—'}
168 {'.,', '—,'}
170 {'!—'}
174 {'?—'}
176 {'?—'}
182 {'.,'}
183 {'!—', '?!'}
186 {'?!'}
192 {'?—'}
193 {'!—'}
195 {'.,'}
196 {'!—', '.,', '—?', '.?'}
197 {'.,'}
198 {'.,'}
209 {'.,'}
210 {'.—', '.,'}
212 {'!—'}
225 {'.,'}
227 {'.,'}
231 {'.—'}
232 {'.,', '.?.'}
236 {'.,'}
241 {',—'}
243 {'.,'}
244 {'.,'}
264 {'.,'}
270 {'?—'}
271 {'.,'}
275 {'.,'}
281 {'!—', '?—'}
282 {'.,'}
283 {'.,'}
286 {'!—'}
290 {'.,'}
292 {':.'}
298 {'.,'}
306 {'.—', '.,'}
309 {'.—', '.,'}
311 {'.,'}
312 {'—?'}
316 {',—', '.—'}
317 {',—'}
320 {'.—', '.,'}
321 {'.,'}
323 {'.,'}
325 {'.,'}
333 {'!—'}

In [64]:
displayinstances(tedtalks.transcript,'[\w\d]+ *\- *[\w\d]+')

0 {'G-V', 'limited-use', 'Low-cost', 'end-use', 'low-hanging', 'click-through', 're-brand', 'I-40', 'low-cost', 're-purposed', 'consumer-friendly', 'hot-button', '30-second', 'rear-view', '28-second', 'carbon-neutral', 'short-term', 'beach-combing'}
1 {'hypothesis-generating', 'sub-Saharan', 'high-income', 'well-used'}
2 {'Uh-uh', 'sub-menus', '9-1', 'add-ons', 'log-off', 'drop-down', 'real-world', 'un-retouched', 'pop-up', 'Pre-sweat'}
3 {'top-down', 'day-to', 'hard-earned', 'green-collar', 'on-street', 'community-friendly', 'walk-to', 'well-loved', 'third-world', 'highway-expansion', 'land-use', 'urban-planning', 're-envision', 'working-class', 'street-end', 'hyper-exploit', 'Red-lining', 'first-phase', 'good-looking', 'anti-development', 'mass-transit', 'post-graduate', 'well-paying', 'big-box', 'one-and', 'environmentally-challenged', 'seed-grant', 'people-first', 'food-distribution', 'community-led', 'rapidly-growing', 'Low-income', 'well-meaning', 'post-Katrina', 'low-income', 'o

63 {'three- to', 'e-venture', 'non-governmental', 'uncertain-payment', 'one-dollar', 'industry-right', 'four-tenths', '10-year', 'certain-payment', 'per-capita', 'world-class', 'low-volume', 'a-half', 'high-volume', 'high-margin', 'two-and', 'non-democratic', 'full-time', 'less-than', 'well-deserved', 'second-line', 'five- to', 'high-quality', 'low-margin', 'three-tenths', 'a-day', 'four-hundred'}
64 {'vapor-compression', 'multi-billion', '150-pound', 'high-speed', 'long-range', 'watt-hours', 'techno-geeks', 'half-hour', 'nickel-metal', 'green-space', 'nickel-cadmium', '30-minute', 're-energize', 'un-park', 'high-volume', 'counter-flow', '100-year', '120-volt', '000-pound', 're-condense', 'large-weight', 're-park'}
65 {'spring-loaded', '100-pound', 'high-speed', '300-pound', 'self-destruct', 'X-axis', 'well-established', 'Y-axis', 'saddle-shaped'}
66 {'co-mentor', 'ex-house', 'people-work', 'half-a', 'super-waldo', 'brute-iful', 'self-moving', 'car-y', 'stressed-out'}
67 {'cheek-flappi

141 {'solar-powered', 'cutting-edge', 'one-mile', '000-pound', 'Gossamer-Condor', 'SR-71', '000-dollar', 'on-board', 'high-resolution', 'self-land', 'human-powered', 'Battery-powered'}
142 {'Earth-like', 'in-depth', 'edge-on', 'blue-ocean', 'color-code', 'low-lying', 'European-built'}
143 {'Post-Napster', 'get-go', 'Coca-Cola', 'car-boat', 'twenty-fifth', 'self-sustaining', 'know-how'}
144 {'self-models', 'self-replicate', 'self-replication', 'micro-scale', 'self-model', 'x-ray'}
145 {'op-ed', 'look-alike', 'lie-down', 'dry-cleaning', 'murder-mystery', 'wonderful-looking', 'Gell-Mann', 'full-time', 'simple-minded'}
146 {'e-mail', 'never-ending', 'street-up', 'reverse-engineer', 'out-of', 'micro-lending'}
147 {'so-called', 'number-color', 'Ninety-nine', 'cross-model', 'tone-color', 'three-pound', 'self-awareness', 'cross-connected', 'across-the', 'ki-ki'}
148 {'profit-minded', 'cookie-cutter', 'in-house', 'back-to', 'pan-African', 'under-capitalized', 'low-cost', 'mind-boggling', 'Well-

218 {'computer-oriented', 'high-speed', 'S-Curve', 'five-ball', 'three-dimensionally', 'hand-oriented'}
219 {'nine-tenths', 'hyper-adapted'}
220 {'J-School', 'e-content', 'one-person', 'mini-bureaus'}
221 {'steroid-filled', 'farm-raised', 'fat-phobic', 'oil-based', 'fend-for', 'beta-carotene', 'fast-food', 'Earth-friendly', 'low-carb', 'anti-atom', 'proto-feminist', 'home-cooked', 'value-added', 'Pop-Tart', 'second-highest', 'industry-obsessed', 'health- and', 'one-fifth', 'hyper-consumption', 'semi-vegetarian', 'five-fold', 'so-called', 'God-like', 'low-fat', 'store-bought', 'atmosphere-altering', 'So-called', 'anti-cattle', 'well-intentioned', 'forward-thinking'}
222 {'high-bandwidth', 'I-2', '16-minute', 'commercial-grade', 'blue-gold', 'mid-ocean', 'one-year'}
223 {'mom-and', 'high-sugar', 'three-dimensional', 'Wi-Fi', 'civic-type', 'partnership-type'}
224 {'four-bar', 'stand-in', 'wire-bending'}
225 {'multi-parameter', 'price-wise', '12-volt', 'on-site', 'open-heart', 'X-rays', 'O

298 {'baseball-loving', 'anti-Vietnam', 'red-checked', 'two-hour', 'hard-scrabble'}
299 {'day-old', 'a-half', 'post-9', 'one-to', 'one-way', 'right-hand', 'two-way', 'two-and', 'first-mover', 'well-linked', 'decision-making', 'higher-level', 'self-organizing'}
301 {'Pati-Salam', 'high-dimensional', 'four-dimensional', 'space-time', 'spin-down', 'eight-dimensional', 'start-up', 'right-handed', 'two-dimensional', 'spin-up', 'six-dimensional', 'left-handed', 'Gell-Mann', 'full-time', 'left- and'}
302 {'go-back', 'Tele-presence', 'high-speed', 'in-vitro', 'Pac-Man', 'blown-glass', 'go-to', 'object-based', 'brand-new', 'mind-opening', '3D-print', 'Auger-Loizeau'}
303 {'alien-style', 'good-looking', 'Hewlett-Packard', 'self-interest', 'not-glamorous', 'make-up', 'Bourke-White', 'spread-spectrum'}
304 {'omega-3', 'the-art', 'low-tech', 'one-gram', 'self-interest', 'half-truth', 'disease-protective', 'anti-smoking', 'fen-phen', 'high-tech', 'Two-thirds', 'high-fat', 'peer-reviewed', 'trans-fat

365 {'Two-and', 'neo-catastrophism', '10-kilometer', 'carbon-60', 'mammal-like', 'Hale-Bopp', 'Earth-like', 'Pre-Cambrian', 'millimeter-thick', 'K-T', 'a-half', 'counter-argument', 'brand-new', 'anti-Gaian', 'Fifty-five', '18-inch', 'crocodile-like', 'large-body', 'deep-Earth', 'Helium-3', 'present-day', 'T-Rex', 'Warm-blooded'}
366 {'warm-up', 'Flex-Foot', 'three-inch', '100-meter', 'sports-wise', 'two-inch', 'record-holder', 'five-ply', 'Flo-Jo', 'able-bodied'}
367 {'IRB-approved', 'parainfluenza-4', 'third-generation', 'Hewlett-Packard', 'blow-up', '28-year', 'all-important', 'parainfluenza-1', 'community-acquired', 'early-onset', 'order-off', 'pattern-matching', 'evolutionary-conserved', 'hand-held', 'the-shelf', '10-day', 'fast-evolving', 'pan-viral', 'parainfluenza-3', 'how-to', 'sarcoma-associated', 'right-hand', 'ultra-conservation'}
368 {'French-Canadian', 'well-known'}
369 {'two-megahertz', 'off-axis', 'up-front', '25-year', '25-degree', '500-watt', 'longer-term', 'not-as', '

451 {'social-economic', 'forward-looking'}
452 {'115-year', 'green-collar', 'Re-Invention', 'Cradle-to'}
453 {'world-changing', '130-year', '10-cent'}
454 {'dopamine-wanting', 'less-fortunate', '10-month', 'post-materialist', 'short-lived', 'all-systems', 'pleasure-seekers', 'self-other', 'self-esteem', 'long-term', 'self-actualization', 'self-control', 'dopamine-fed', 'three-week'}
455 {'self-cleaning', 'split-ends', 'leaf-like', 'close-up', 'nano-size', 'air-righting', 'zero-angular', 'first-born', 'plexi-glass', '60-foot', 'tail-cracking', 'gecko-inspired', 'mid-air'}
456 {'hot-shot', 'failure-syndrome', 'success-to', 'one-way', 'anti-depressants'}
457 {'hide-and', 'three-acre', 'human-made', 'man-made', 'great-great', 'great-grandchildren', 'half-acre', 'next-door', 'mid- 80s'}
458 {'one-to', 'two-way', 'upside-down', 'text-based', 'many-to', 'real-time', 'oil-based'}
460 {'counter-traction', 're-evaluation', 're-set', 'all-important', 'time-consuming'}
461 {'past-negative', 'wishy

545 {'flute-playing', 'Then-Communications', 'all-news', 'eight-year', 'fast-forward', 'anti-semitism', 'coal-fired', 'Indian-ness', 'ever-ever', 'mid-size', '21st-century', 'fifth-largest'}
547 {'disk-like', 'SO-2'}
548 {'000-years', 'four-story', 'multi-million', 'well-versed', 'full-page'}
549 {'figure-ground', 'J-O', 'upside-down', 'physics-based', 'well-designed', 'Amy-Jo', 'well-written'}
550 {'nuclear-free', 'coal-fired', 'right-handed', 'nu-ca', 'left-handed'}
551 {'five-year', 'slavery-like', 'male-dominated', 'full-time', 'third-largest', 'well-to', 'modern-day', 'Ram-ki', 'A-C', 'well-wisher', 'four-year', 'gang-raped'}
552 {'quality-assurance', '120-odd', 'patient-centered', 'well-proven', 'counter-intuitive', 'hub-and', 'high-tech', '20-odd', 'intra-ocular', 'a-half', 'half-million', 'low-cost', 'post-retirement', 'non-customer', 'day-out', 'three-and', 'patient-centric', 'skill-based', 'day-in'}
553 {'moderate-to', 'youth-obsessed', 'arm-wrestle', 'five-volume', 'self-por

636 {'knowledge-based', 'mathematics-based', 'tree-like', 'long-term', 'built-in', 'long-standing'}
637 {'n-dimensional', 'AK-47', 'right-hand', 'Not-for', 'Saint-Exupery', 'year-old'}
638 {'mid-30s', 'freeze-dried', 'self-supporting', 'time-out', 'low-lying', 'Ha-ha', 'Eco-Heroes', 'mahi-mahi', 'world-wide'}
639 {'dressed-up', 'four-word'}
640 {'fine-tuning', 'school-aged', 'policy-making', 'Sub-Saharan', 'under-five'}
641 {'12-point', 'well-funded', 'decision-making', 'touch-tone', 'well-connected', 'under-capitalized', 'highest-quality', 'piece-of', 'flat-screen', 'well-designed', 'cross-section', 'pre-civil', 'mass-market'}
642 {'so-called', 'bottom-dwelling', 'Texas-Mexico', 'well-being', 'dead-zonification', 'not-so'}
643 {'machine-cum', 'two-wheeler', 'second-standard', 'long-tail', 'high-quality', 'so-called', 'non-material', 'food-grade', 'energy-efficient', 'non-stick', 'sub-optimize', 'cycle-based', 'flour-grinding', 'Teflon-like', 'cross-pollinates', 'second-quality'}
645 {

714 {'paleo-record', 'human-dominated', 'action-based', 'feed-in', 'small-scale', 'large-scale', 'flood-prone', 'low-nutrient', 'pre-industrial', 'self-regulating', 'cross-scale', 'mid-50s', 'mid-18th', 'well-being', 'Haber-Bosch', 'air-polluting', 'peri-urban', 'plow-based', 'dead-end', 'low-carbon'}
715 {'heart-to', 'mind-to', 'well-being', 'day-to'}
717 {'e-mail', 'so-called', 'year-old', 'all-terrain', 'foot-long', 'strange-looking'}
718 {'fine-tune', '10-year', 'year-old', '12-year', 'year-olds', 'self-organizing', 'Self-organizing', 'pounds-worth', 'speech-to', 'right-angled'}
719 {'part-time', 'full-figure', 'fine-tuned'}
720 {'intermediate-range', 'screwed-up', 'Super-logical', 'Al-Shabaab', 'non-state', 'C-130', 'leather-soled', 'high-life', 'non-profit', 'multi-party', 'long-suffering', 'close-knit'}
721 {'market-only', 'multi-generational', 'pro-ams', 'tailor-made', 'year-old', 'cross-community', 'self-doubt', 'university-entering'}
722 {'ice-covered', '1982- 83', '97- 98', 

794 {'45-year', 'two-centimeter', '40-pound', 'one-centimeter', 'breast-imaging', 'non-radiologists', 'fatty-replaced', 'X-rays', 'pre-eminent', 'self-interest', 'pain-free', 'Two-thirds', 'pre-menopausal', 'X-ray', '67-year', 'one-fifth'}
795 {'three-year', '300-pound', 'slack-jawed', 'mid-20s', 'year-old', '20-dollar', 'non-profit', 'year-olds', 'life-giving', 'father-daughter', 'six-foot', 'check-out', 'Flash-forward'}
796 {'non-violent', 'F-bomb', 'right-hand', 'heavy-duty', 'full-time'}
797 {'long-term', 'down-time', 'co-creating', 'button-clicking'}
798 {'reverse-engineer', 'hundred-plus', 'modern-day', '20th-century'}
799 {'brother-in', 'terrorist-lovers', 't-know', 'wrong-doing', 'best-selling', 'reef-looking', 'gun-toting', 'it-all', 'intractable-seeming', 'right-doing', 'tongue-in', 'I-don', 'know-it', 'underwater-coral', 'knee-jerk'}
800 {'leveling-up'}
801 {'18-and', 'mountaintop-removal', 'strip-mining', 'a-half', 'carpet-bombed', 'face-to', 'counter-insurgencies', 'much-n

897 {'technique-wise', 'Square-like', 'good-looking', 'well-being'}
898 {'pool-shaped', 'close-up'}
899 {'day-out', 'seven-year', 'outfit-obsessed', 'dress-up', 'day-in'}
900 {'hand-held', 'here-and', 'two-ness'}
901 {'funny-looking', 'sixth-graders', 'cool-looking', 'build-up', 'wolf-like', 'three-fingered', 'sorts-of', 'year-old', '18-wheeler', 'dino-chicken', '65-million', 'Glow-all', 'blood-sucking', 'B-rex', 'circular-looking', 'cooler-looking', 'Glow-rabbits', 'non-avian', 'weird-looking'}
902 {'000-square', 'hi-tech', 'Sola-Morales', '000-pound', 'hard-edged', 'self-portrait', 'hand-tied', 'X-ray'}
903 {'hundred-thousand', 'fact-filled'}
904 {'cross-cultural', 'five-year', '40-year', 'high-risk', 'androgen-sensitive', 'breaking-down', 'proto-gonads', 'make-up', 'sex-difference'}
905 {'six-figure', 'at-risk'}
906 {'self-assemble', 'crowd-source', 'seven-day', 'brain-computer', 'game-changing', 'super-enabling', 'real-time', 'T-shirts', 'Skype-type', 'micro-robots', 'long-term', '

991 {'user-unfriendliness', 'cross-referenced', 'user-unfriendly', 'face-to', '12-hour', 'time-stamp', 'cross-reference', 'user-friendly'}
992 {'three-minute', 'top-down', 'half-life', 'bottom-up', 'decision-makers', 'non-pharmacologic', 'non-religious', 'year-old', 'double-digit'}
993 {'wing-folding', 'the-art', 'single-engine', 'multi-purpose', 'Door-to', 'liquid-based', 'counter-intuitive', 'high-speed', 'two-seat', 'door-to', 'two-dimensional', 'power-to', 'custom-designed', 'off-road', 'low-risk', 'state-of', 'stop-and', 'three-dimensional', 'two-dozen', 'the-box', '30-year', 'continuously-variable', 'out-of'}
994 {'cutting-edge', 'three-year', 'three-tenths', 'year-old', 'two-tenths', 'left-hand', 're-shows'}
995 {'open-source', 'Bye-bye'}
996 {'140-character', 'off-the', '100-kilo'}
997 {'documents - for', 'surgery-bank'}
998 {'tar-fueled', 'self-associate', 'non-life', 'non-living', 'self-replication', 'self-assembly', 'non-equilibrium', 'population-level'}
999 {'upside-down', 

1078 {'Fourth-graders', 'middle-size', 'grown-up', 'duck-bill', 'fourth-graders', 'shape-shifting', 'duck-billed'}
1079 {'photo-realism', 'two-dimensional', 'three-dimensional'}
1080 {'eye-opening'}
1081 {'higher-ups', 'Uh-uh', '40-pound', 'highest-ranking', 'brother- or', 'gung-ho', 'African-Americans', 'chock-full', 'go-ahead', 'non-combat', 'Mexican-American', 'lip-read', 'off-campus'}
1082 {'human-related', 're-registered', 'yolked-up', 'Mexican-American', 'four-fold'}
1083 {'rapid-fire', 'better-looking', 'placebo-controlled', 'mind-numbing', 'chore-loving', 'good-looking', 'far-off', 'life-threatening', 'ballroom-dancing', '50-pound', 'self-reports', 'double-blind', 'side-by'}
1084 {'health-care', 'blood-pressure'}
1085 {'plankton-feeding', 'plankton-eating', 'sea-surface', 'boom-and', 'second-largest', 'a-half', 'three-and', 'high-tech', 'ground-truthed', 'time-consuming', 'best-documented', 'mid-80s', 'well-documented'}
1086 {'hyper-connected', 'Port-au', 'open-source', 'user-a

1165 {'tube-lipped', 'gyro-stabilized', 'a-half', 'two-and', 'three-and', 'nine-foot'}
1166 {'well-illustrated', 'best-trained', 'lower-case', 'catch-up', 'hand-in', '000-plus', 'three-time', 'topic-for', 'word-for'}
1167 {'well-designed', '24-hour'}
1168 {'mass-produce', 'five-octave', 'brand-spanking', 'face-to', 'mass-distributed'}
1169 {'mosqui-toes', 'the-box', 'out-of', 'mosquito-born'}
1170 {'non-Western', 'rose-tinted', 'two-year', 'self-fulfilling', 'deep-rooted'}
1171 {'long-term', 'past-tense', 'Jay-Z', 'well-known'}
1172 {'big-shot', 'K-12', 'non-creatives', 'God-given', 'self-efficacy', 'change-the', 'two-way', 'step-by'}
1173 {'ready-made', 'get-go', 'thousand-year'}
1174 {'two-state'}
1175 {'full-size', 'semi-private', '18-month', 'Forty-two', '8-hour'}
1176 {'cutting-edge', 'micro-channels', 'three-dimensional', 'high-speed', 'micro-scale', 'oxygen-producing', 'tiny-scale', 'human-microbe', 'large-scale', 'laser-scanning'}
1177 {'12-year', '45-minute'}
1178 {'sub-Sahara

1261 {'bi-algal', 'well-known', 'so-called', 'single-celled', 'fastest-growing', 'five-day'}
1262 {'one-bedroom', 'the-art', 'three-wheel', 'technology-enabled', 'bike-sharing', 'one-seventh', 'state-of', '20-minute', 'shared-use', '10-minute', '24-bit', 'low-voltage', 'open-loft', 'five- or', 'five-minute'}
1263 {'17-year', 'well-lit', 'first-degree', 'mid-January', 'photo-occluded', 'high-speed', 'well-versed', 'grandmother-in'}
1264 {'Disability-Adjusted', 'shoulder-to', 'better-resourced', 'so-called', 'capacity-building', 'best-resourced', 'Alma-Ata', 'less-trained', 'third-leading'}
1265 {'self-obsessed', 'all-American', 'entry-level', 'self-driven', 'self-motivated', 'gloved-hand', 'self-improvement'}
1266 {'re-screen', 'sub-types', 'one-size', 'fits-all', 'stand-in'}
1267 {'task-based', 'body-to', 'so-called', 'whoo-aa', 'whoo-um'}
1268 {'crisis-mapping', 'CPR-trained', 'low-tech', 'bottom-up', '18th-century', 'four-month', 'post-election', 'read-only', 'high-tech', 'crowd-fund

1364 {'socio-economic', 'high-level'}
1365 {'all-merciful', 'Gaddafi-like', 'pre-elections', 'war-torn', 'co-founded', 'consensus-building'}
1366 {'Wikipedia-like', '13-year', 'virus-making'}
1367 {'HD-capable', 'one-second', 'day-to', 'one-hour', 'three-month', 'five-hour', 'sister-in'}
1368 {'on-the'}
1369 {'Thirty-seven', 'de-politicized', 'ready-made'}
1371 {'micro-cracks', 'self-healing'}
1372 {'top-down', 'self-organization', 'high-rise', 'highly-funded', 'Jean-Claude', 'long-term'}
1373 {'turn-on', 'million-dollar', 'red-light', 'anti-aphrodisiac', 'self-worth', 'self-sustaining', 'long-term'}
1374 {'60-minute', '40- to', 'Young-ha'}
1375 {'brain-machine', 'larger-scale', 'mid- 60s', 'badly-tuned', 'first-person'}
1376 {'long-run'}
1377 {'bid-rigging', 'so-and', 'not-for', 'then-Prime', 'battered-looking', 'nancy-story', 'government-to', 'resource-rich'}
1378 {'high-performing', 'user-generated', 'High-performing', 'long-standing', 'badly-educated', 'so-so', '15-year', 'lowest-p

1464 {'on-ramp', 'open-source', 'day-to', 'multi-SIM', 'post-election'}
1466 {'mid-2000s', 'rock-solid', 'school-aged', 'hand-selected', 'man-made', 'Seventy-five', 'Twenty-one', 'well-designed', 'not-so', 'developing-world', 'wind-driven'}
1467 {'lamp-like', 'grapefruit-size', 'small-scale'}
1468 {'never-ending', 'fore-ordained', 'self-fulfilling', 'closed-minded', 'ever-accumulating'}
1469 {'27-year', 'multi-disciplinary', 'meta-themes', 'immune-based', 'insulin-resistant', 'under-exercising'}
1470 {'self-reflective', 'pro-Israel', 'well-known', 'T-Cell', 'Ass-head', 'meta-motivational'}
1471 {'blue-green', 'DNA-tested', 'middle-sized', 'early-stage', 'gastric-brooding', 'meat-eating', 'de-extinction'}
1472 {'zero-force', 'gravity-compensated', 'six-dimensional'}
1473 {'hunky-dory', 'meta-narratives', 'entry-level', 'self-correction', 'state-owned', 'Ninety-three', 'second-largest', 'long-suffering', 'little-known', 'Meta-narrative', 'what-have', 'multi-party', 'one-party', 'mind-bog

1559 {'value-based', 'cost-cutting', 'father-in', 'seven-fold', 'well-reputed', 're-operated', 'Martini-Klinik'}
1560 {'two-person', 'last-minute', 'hands-on', 'thumb-wrestling'}
1561 {'never-explored', 'open-source', 'snow-capped', 'radio-collared', 'three-dimensional', 'wi-fi', 'motion-activated', 'labor-intensive', 'heat-emitting', 'high-resolution', 'lawnmower-type', 'time-consuming', 'one-horned', 's-eye', 'crystal-clear'}
1562 {'high-powered', 'laser-guided', 'closed-canopy', 'mega-drought', 'fastest-growing', '100-square', 'high-tech', 'high-resolution', 'first-time', 'not-so'}
1563 {'open-source', 'head-mounted', 'off-the', 'fixer-upper', 'stroke-like'}
1564 {'peer-to', 'sub-Saharan', 'all-season', 'ultra-flexible'}
1565 {'follow-up', 'problem-solve', 'long-term', 'take-home', 'target-rich', 'meaning-making'}
1566 {'hunter-gatherer', 'age-based', 'white-haired', 'high-quality', 'Coca-Cola', '25-year', 'eight-digit', 'able-bodied', 'self-reliance', '15-year', 'self-esteem', 'hun

1652 {'high-quality', 'mock-up', 'car-oriented', 'suburban-style', 'tree-lined', 'two-block', '10-minute', 'mile-long', 'self-seeded', 'pop-up'}
1653 {'first-year', '60-page', 'age-old'}
1654 {'high-speed', 'micro-diamonds', 'time-lapse'}
1655 {'multi-material', 'decade-long', 'single-material'}
1656 {'fact-finding', 'Fact-finding'}
1657 {'self-pity', 'self-confrontation', 'people-pleaser'}
1658 {'non-parents', '50-plus', 'eco-friendly', 'self-confidence', 'self-esteem', 'multi-tasking', 'peer-reviewed', 'six-year', 'science-minded', 'C-section', 'child-rearing', 'single-parent', 'middle-class', 'disease-proof', 'gluten-free', 'work-life', 'candy-colored', 'non-traditional', 'well-intentioned', 'mono-tasking'}
1659 {'self-concept', 'so-and', 'anti-discrimination', 'cross-gender', '10-year', 're-qualify', '16-year', '12-year', 'male-to', '12-16'}
1660 {'Earth-like', 'sub-scale'}
1661 {'mid- 90s', 'plain-spoken', '18-unit', 'high-speed', 'anti-aliasing', 'mid- 60s', 'present-day', 'self-

1767 {'general-purpose', 'face-processing', 'cross-sectional', 'general- purpose'}
1768 {'seven-year', 'day-to'}
1769 {'long-run', 'capital-intensive', 'after-tax', 'cherry-picking', 'pre-World', '1987-2013', 'pre-tax'}
1770 {'well-meaning', 's-moving', 'body-confidence', 'pro-anorexia', 'my-body', 'too-fast', 'self-harm', 'image-related', 'Thirty-one', 'healthy-looking', 'image-obsessed', 'superhero-like', 'for-my', 'self-esteem', 'always-on', 'one-year'}
1771 {'open-source', '18th-century', 'two-way', 'two-month', 'century-designed', '21st-century'}
1772 {'conflict-afflicted', 'M-Pesa', 'part-time'}
1773 {'self-deprecation', 'off-limits', 'all-knowing', '18th- century', '29-year', 'large-scale', '20th-century'}
1774 {'amyloid-beta'}
1775 {'Public-Private', 'win-win', 'cost-effective'}
1776 {'open-source', 'the-art', '3D-printed', '21st-century', '96-well', 'low-cost', 'x-ray', 'state-of', 'DNA-based', '20th-century', 'non-invasive'}
1777 {'grown-ups'}
1778 {'sub-planetary', 'in-betwe

1869 {'life-threatening', 'cross-fertilized', 'post-conflict', 'then-French'}
1870 {'middle-class', 'Over-the', 'child-bearing', 'so-called', 'follow-up', 'anti-depressants'}
1871 {'mind-blowing', 'one-trick', 'Bach-y', 'star-nosed', 'plug-and', 'X-rays', 'consciousness-raiser', 'game-changer', '360-degree', 'X-ray', 'real-time'}
1872 {'oxygen-impermeable', 'self-curing', 'layer-like', '10-minute', 'game-changing', '3D-printed', 'two-dimensional', 'co-inventors', 'water-cooled', 'strength-to', 'fuel-efficient', 'really-game', 'a-chip', 'three-dimensional', 'so-called', 'lab-on', 'nano-fabrication', 'injection-molded', 'T-1000'}
1873 {'27-year', 'top-note', 'e-mail', 'third-party', 'same-sex', 'e-mails', 'long-held', 'stone-throwers', 'meta-analysis', 'Anti-Bullying', 'on- and', 'Fast-forward', 'slut-shaming'}
1874 {'three-year', 'two-dimensional', 'neuron-like', 'real-world', 't-shirt', 'human-like'}
1875 {'counter-jihad', 'dead-end', 'tax-cut', 'six-figure', 'mini-part', 'life-expecta

1968 {'so-called', 'Texas-Austin', 'two-inch', 'one-fiftieth', 'light-years', 'long-wavelength', '80-feet', 'super-Earths', 'Jupiter-like', 'multiple-planet', 'one-millionth', 'five-thousandths'}
1969 {'Mlambo-Ngcuka', 'Secretary-Generals', 'Ki-moon', 'gender-research', 'hand-picking', 'Secretary-General'}
1970 {'whistle-blower', 'low-price', 'multi-signature', 'privacy-enhancing', 'get-one', 'micro-laundering', 'customer-friendly', 'buy-one', 'crypto-currency', 'world-class', 'one-offs', 'high-res', 'whistle-blowers', 'high-quality', 'co-opted', 'censorship-free', 'consumer-centric', 'three-month', 'real-world', 'pop-up'}
1971 {'code-cracker', 'settling-down', 'price-earnings', 'shorter-term', 'sub-field', 'Shiing-Shen', '15-day', 'Chern-Simons', 'up-and', '10-day', 'mid-1700s', 'Trend-following', 'higher-dimensional'}
1972 {'pre-breathing', 'self-contained', 'four-minute', '27-second', 'air-conditioned', '15-year', 'quarter-inch'}
1973 {'soul-deadening', 'TED-like'}
1974 {'right-sizi

2068 {'Sixty-five', 'micro-algae', 'Omega-3s', 'wild-caught', 'one-and', 'a-half', 'better-managed', 'fly-fishing', 'two-and', 'best-managed', 'high-grade', 'Wild-caught', 'resource-efficient', 'far-fetched', 'fish-obsessed', 'plant-based', 'cold-blooded'}
2069 {'evidence-based', 'well-known', 'high-quality', 'well-being', 'African-Americans', 'African-American', '18th-century', 'race-specific', 'self-identified'}
2070 {'first-case', 'Jean-François', 're-implant', 'second-case', 're-implanted', 'doublecortin-positive', 're-implantation', 'neuro-repair', 'non-eloquent', 'self-repair'}
2071 {'anti-vaxxing', 'runner-up', '21st-century', 'open-ended'}
2072 {'work-specific', 'love-specific', 'two- and', '15-hour', '30-second', '13-year', 'non-titan', 'titan-in', 'four-year', 'joy-specific'}
2073 {'co-founder', 'non-spinning', 'Gravitational-Wave'}
2074 {'six-meter', 'high-level', 'fixed-wing', 'so-called', 'tail-sitter', 'tail-sitters', 'remote-controlled', 'spin-off', 'multibillion-dollar'

2167 {'right-hand', 'non-linear', 'man-made', 'Escher-like', 'error-minimization', '20th-century'}
2168 {'19th-century', 'Saint-Exupéry', 'anti-Semite', 'French-Jewish', 'innocuous-looking', 'high-ranking', 'self-worth', 'fair-minded', 'anti-Semitic'}
2169 {'three-inch', 'super-critical'}
2171 {'back-to', 'UN-Habitat', 'part-time', 'half-destroyed', 'self-respect', 'pre-categorized', 'un-modern'}
2172 {'asylum-seekers', 'low-skilled', 'post-Brexit', 'Secretary-General', 'well-educated'}
2173 {'voice-over', 'So-and', 'two-part', 'black-sounding', 'white-sounding', 'self-conscious'}
2174 {'design-thinking', 'fine-tuning', 'real-life', 'pre-determined'}
2175 {'would-be', 'dual-career', 'well-meaning', 'super-supportive', 'male-dominated', 'hard-earned', 'much-spoken', 'part-time', 'well-established', 'stay-at', 'Fast-forward', 'win-win', 'hard-working', '16-plus'}
2176 {'rust-resistant', 'day-to', 'non-native', 'chain-link', 'bio-blitz', 'self-willed', 'Non-native', 'large-bodied', 'saber

2264 {'non-profit', 'trickle-down', 'three-month'}
2266 {'mid- 90s', 'private-sector', 'life-affirming', 'cul-de', '22-mile'}
2267 {'in-laws', 'self-important'}
2268 {'self-determined', 'narrow-minded', 'possibility-modeling', 'pre-transition', 'deep-seated', 'year-long', 'Ottawa-based', 'shape-shifting'}
2269 {'high-powered', 'not-for', 'marble-sized', 'chip-sized', 'end-of', 'TechDemoSat-1', 'long-term', '900-plus', 'low-Earth', 'well-intentioned'}
2270 {'team-building', 'six-year', 'fast-forward', 'rah-rah', '14-year'}
2271 {'soul-spark', 'farm-to', 'mash-up', 'life-or', 'home-birth', 'soul-baring', 'truth-telling', 'fast-paced', 'made-up', 'near-lethal', 'hamster-wheel'}
2272 {'do-it', 'donor-conceived', 'follow-up', 'open-communication', 'non-genetic', 'good-quality', 'well-meant'}
2273 {'itty-bitty', 'know-how', 'population-based'}
2274 {'two-step', 'X-rays', 'baked-in', 'consciousness-free', 'start-ups', 'auto-driven', 'front-facing', 'long-term'}
2275 {'vaccine-like', 'FDA-appr

2357 {'non-Nigerians', 'nine-year', 'self-fulfilling', 'tongue-in', 'well-prepared', 'self-evident', 'well-meaning', 'good-looking', 'un-African'}
2358 {'tent-like', 'nine-pointed', 'soft-flowing'}
2359 {'mind-blowing', 'star-forming', 'Wi-Fi', 'fast-forward', 'light-years', 'high-pitched', 'high-energy', 'blue-white', 'x-rays', 'long-standing', 'All-sky'}
2360 {'thought-provoking', 'first-year', 'self-preservation', '10-year'}
2361 {'mom-and', 'billion-dollar', '73-year', 'wake-up', 'risk-free'}
2362 {'N-D', 'C-A', 'money-related', 'N-O', 'self-worth', 'long-term'}
2363 {'Twenty-one', 'kick-start', 'real-life'}
2364 {'school-going', 'heroes-in', 'decision-making', 'self-serving', 'public-focused', 'ex-Yale', 'cover-ups', 'risk-averse', '21st-century'}
2365 {'techno-economic'}
2366 {'low-quality', 'life-changing', 'career-oriented', '50-50', 'six-month', 'A- game'}
2367 {'mid-stage', 'full-blown', 'hyper-activated', 'disease-modifying', 'slow-wave', 'see-saw', 'all-out', 'yet-undamaged

2454 {'dragnet-style', 'third-party', 'anti-tax', 'wake-up', 'pro-choice', 'pro-life', 'anti-war'}
2455 {'US-led', 'two-dimensional', 'Al-Lāt', 'Fast-forward', 'knee-length'}
2456 {'neo-Nazi', 'so-called', 'al-Walid', 'shaven-headed', 'most-wanted', 'high-risk', 'high-street', 'large-scale', 'next-door', 'neo-Nazis'}
2457 {'air-gun', 'neo-Nazi', '20-year', 'post-communist', 'state-building', 'hate-based', 'question-giving', 'cat-and', 'game-changer', '15- to', 'in-group'}
2458 {'smog-free', 'voice-recognition', 'five-year', 'glow-in', 'light-emitting', 'the-dark', 'nice-to', 'hyper-technological', 'clean-air', 'problem-solving', 'creative-destructive', 'Smog-free', 'one-off', 'short-term'}
2459 {'D- for', 'behind-the', 'open-mindedness', 'decision-making', 'idea-meritocratic'}
2460 {'Saint-Rémy', 'de-Provence', 'sun-dappled', 'self-portrait', 'self-similar', 'Saint-Paul', 'de-Mausole'}
2461 {'torque-generating'}
2462 {'decision-making', 'self-driving'}
2463 {'17-year', 'sleep-inducing'

2564 {'water-drip', 'T-shirt', 'modern-day', 'water-dripping', 'water-dropping', 'X-ray'}
2565 {'single-celled', 'well-being', 'no-brainer', 'life-and', 'undreamed-of', 'tight-knit', 'long-term', 'Costco-sized'}
2566 {'centimeter-level', 'high-quality', 'M-Pesa', 'jet-lagged', 'out-innovate', 'world-changing', 'C-section', '75-kilometer', '60- and', 'start-ups', 'e-commerce', '24-year', 'developing-world'}
2567 {'Ireland - and', 'bar - not', 'DUP-voting', 'me - because', 'Phelps-Roper', 'neo-Nazis', 'gay - don', 'tops - and', 'Londonderry-Derry', 'Free-P', 'UK - I', 'one - with'}
2568 {'start-ups', 'chemical-free'}
2569 {'yarn-making', 'fuel-based', 'petroleum-based', 'spider-silk', 't-shirt', 'bio-based', 'high-performance'}
2570 {'single-domain', 'self-driving', 'world-famous', 'self-checking', 'domain-specific', 'board-certified', '60-100', 'mediocre-ish', 'mediocre-to', 'low-end', 'X-rays', 'spin-off', 'look-ahead', 'half-moons', 'blue-collar', 'bug-finding', 'the-box', 'de-slaving

2673 {'single-handedly', 'N-word', 'low-wage', 'living-wage', 'fastest-growing', 'Pop-Up', 'non-Native', 'Ninety-five', 'pop-up'}
2674 {'stand-up', 'identity-based', 'first-grader', 'two-way', 'Cookie-cutter', 'male-centered', 'statue-still', 'child-friendly'}
2675 {'evidence-based', 'self-driving', 'one-by', 'animal-free', 'movements - from', 'animal-welfare', 'farmed-animal', 'Forty-seven', 'plant-based', 'few - around', 'animal-based', 'five-pound', 'movements - have', 'Seventy-five', 'clean-meat', 'award-winning', 'so-called', 'So-called', 'well-evidenced', 'non-factory'}
2676 {'non-routine', 'computer-assisted', 'rules-based', 'well-defined', 'routine-nonroutine'}
2678 {'pocket-size', 'cross-disciplinary', '29-year'}
2679 {'Nicolas-Jacques', 'KOH-I', '14-carat'}
2680 {'percent-done', 'pre-computer'}
2681 {'TICK-tat', 'girl-power', 'Filet-O', 'Award-winning', 'tick-TAT', 'Tick-TAT', 'ready-made', 'hip-hop'}
2682 {'17-year', 'B-boys', 'super-comfortable', 'T-shirt', 'African-America

2795 {'two-thirds', 'long-distance', 'deep-sea', 'high-standing'}
2796 {'pro-Russia'}
2797 {'ill-tempered', 'laugh - we', 'eight-o', 're-presenting', 'Russian-American', 'why - why', 'you - BY', 'de-escalation', 'always - absolutely', 'day-one', 'peek-a', 'like - actually', 'de-escalating', 'somebody - we', 'callous-unemotional', 'Yeltsin - dear'}
2798 {'long-term', 'slow-wave', 'short-term'}
2799 {'goal-setting', 'three-year', 'bet-your', 'anti-HIV', 'time-bound', 'world-changing', 'high-quality', 'Action-oriented', 'action-oriented'}
2800 {'mid-flight', '360-degree'}
2801 {'sweet-tasting', 'age-old', 'bright-colored'}
2802 {'mind-reading', 'Mind-reading', 'sci-fi', 'large-scale', 'real-time'}
2803 {'spot-on'}
2804 {'curiosity-based', 'two-thirds', 'two-spot', 'razor-sharp', 'well-aerated', 'strange-looking'}
2805 {'non-whole', 'Thirty-six', 'Twenty-five', 'problem-solving', 'spy-craft'}
2806 {'plan - they'}
2807 {'ever-present', 'decades-long', 'wabi-sabi', 'all-consuming', 'slow-mov

2921 {'civilian-centered', 'long-term', 'short-term', 'never-ending'}
2922 {'castle-bound'}
2923 {'though - in', 'raw - and', 'Watergate - the', 'cover-up', 'happened - I', 'script - it', 'surveillance - everything', 'them - we', 'anti-Vietnam', 'son-in', 'also - they', 'top-secret', 'anti-war', 'nominating - in', 'decided - and', 'dog - and', 'resilient - I', 'knowledge - who', 'break-ins', 'then - because', 'operation - and', 'tapes - and', 'Watergate - which', 'tapes - what', 'is - the', 'plan - Huston', 'Democrats - and', 'tapes - here', 'burglars - I', 'ago - what', 'surveillance - that', '1972 - of', 'chilling - President', 'listening - and'}
2924 {'insecticide-treated', 'a-thousand', 'worst-off', 'one-in', 'long-lasting', 'man-made', 'hair-trigger', 'Human-Compatible'}
2925 {'two-room', 'Chicago-based', 'souped-up'}
2926 {'human-transplantable', 'gene-editing', 'cutting-edge', 'geno-modified', 'virus-free', 'human-immune'}
2927 {'400-word', 'al-Dunya', 'world-renowned', 'two-way

3051 {'the-art', 'five-man', 'hand-picked', 'state-of', 'motor-sledges'}
3052 {'Earth-like', 'so-called'}
3053 {'screwed-up', 'Ninety-eight', 'Christmasy-look', 'one-to', 'tongue-tied', 'pat-down'}
3054 {'Eighty-seven', 'Dunning-Kruger', 'peer-reviewed', 'ill-posed', '40-second'}
3055 {'near-constant'}
3056 {'single-handedly', 'face-plants', 'four-tiered', 'high-tech', 'slap-stick'}
3057 {'team-building', 'day-to', 'one-hour', 'multimillion-dollar', 'mid-level', 'well-meaning', '21st-century'}
3058 {'would-be', 'hard-earned', 'year-end', 'decision-making', 'no-holds'}
3059 {'role-playing', 'zero-sum', 'cage-free', 'sci-fi', 'anti-human', 'scenario-planners', 'winner-takes'}
3060 {'spino-cerebellar', 'spot-on', 'dopamine-containing', 'cauliflower-shaped'}
3061 {'city-states'}
3062 {'first-ever', 'two-dimensional', 'Galloway-Gallego', 'sign-language', 'hard-of'}
3063 {'panda-shaped', 'sunny-day', 'two-thirds', 'pay-as', 'doom-filled', 'three-quarters', 'fastest-growing', 'sub-Saharan', '

3222 {'chest-bursting', 'ring-like', 'time-lapse'}
3223 {'state-of', 'the-art', 'U-Haul', 'sixth-grade'}
3224 {'long-necked', 'ill-suited', 'well-suited', 'self-preservation', 'Jean-Baptiste'}
3225 {'Non-Newtonian', 'non-linearly'}
3226 {'non-green', 'green-eyed'}
3227 {'snow-covered', 'mid-ocean', 'silt-covered'}
3228 {'ten-day', 'Hindu-Arabic', 'decimal-based'}
3229 {'human-cannonball', 'well-known', 'teeter-totter', 'two-sided', 'land-art', 'counter-research', 'US-Mexico'}
3230 {'re-examining', 'ever-changing', 'near-future'}
3232 {'20-something', 'T-shirt', 'eye-for', '16-year', 'an-eye', '14-year', 'law-abiding'}
3233 {'self-assemble', 'self-assembled', 'Self-assembly', 'ever-increasing', 'light-sensitive', 'co-polymer', '40-nanometer', 'self-assembly', '120-nanometer', 'self-assembling', 'cost-effective', 'long-term', 'built-in'}
3234 {'60-year', 'French-speaking', 'second-coolest'}
3235 {'Jean-Michael', 'Jean-Michel', 'cut-up', 'value- but', 'avant-garde', 'drawings- along', 'st

3425 {'wireless-to', 'power-efficient', 'short-range', 'long-range', 'souped-up', 'long-distance', 'overly-ambitious', 'high-def', 'electricity-to', 'fifty-kilometer'}
3426 {'then-recent', 'Yog-Sothoth', 'X-rays'}
3427 {'full-fledged', 'high-speed', 'differently-sized'}
3428 {'drop-offs', 'self-sacrifice', '20-year', '60-year', 'work-life', 'red-blooded', 'everybody-gets', 'of-the', 'self-fulfilling', 'Straight-out', 'buy-them', 'X-er', 'pre-tenure', 'a-ribbon', 't-even', '80-year', '23-year', 'one-year'}
3429 {'bed-sheets', 'squirrel-hair', 'wide-ranging'}
3430 {'self-advocacy', 'short- and', '50-page', 'skill-building', 'best-laid', 'long-term', 'check-ins', 'long-standing', '70-foot'}
3431 {'6-year'}
3432 {'six-year', 'follow-up', 'long-term', 'three-course', 'four-year'}
3433 {'difficult-to', 'see-saw', 'day-to'}
3434 {'masculine-of', 'ever-evolving', 'stand-up', 'freeze-frame', 'so-called', 'one-sided', 'talked-about', 'great-great', 'great-grandmother', 'self-esteem', 'know-how'}

3552 {'then-governor', 'Elegant-Grotesk', 'Berlin-based', 'Berthold-Grotesk'}
3553 {'eight-foot', 'pop-up', 'paper-mache'}
3555 {'washing-the', 'decision-making', 'middle-aged', '13-year'}
3556 {'performance-wise', 'scale-free', 'highest-impact', 'best-seller', 'high-impact'}
3557 {'two-thirds', 'high-level', 'one-third', 'fast-forward', 'third-largest', 'best-case', 'non-voting', 'one-fifth'}
3558 {'ill-fated', 'best-known', 'last-ditch', 'mid-20th'}
3559 {'laissez-faire', 'watered-down'}
3560 {'well-paid', 'well-being', 'world-renowned'}
3561 {'physiologically-based', 'Well-trained', 'law-enforcement', 'five-month', 'well-qualified', 'work-related', 'problem-solving', 'gender-disparate'}
3562 {'diamond-shape', 'sharkskin-like', 'bacteria-proof', 'non-seafaring', 'life-saving', 'open-heart'}
3563 {'impacts - Ransom', 're-evaluate', 'first-hand', 'Bascompete - effects', 'dark - Okay', 'disclaimer - I', 'long-lining', 'well-known', 'conservation-based', '3-6', 'anti-Jaws', 'oceans-relia

3668 {'right-hand', 'left-hand', 'self-diagnose', 'malaria-associated'}
3669 {'mind-bending'}
3670 {'decision-makers', 'plus-size', 'different-sized', 'able-bodied', 'body-positive', 'Different-size'}
3671 {'mind-blowing', 'traditional-style', 'super-competent', 'self-driving', 'all-knowing', 'age-old', 'present-day', 'résumé-sorting', 'step-by'}
3672 {'mid- 90s', 'Asian-American', 'all-American', 'somewhere-out', 'all-Asian', 'All-Asian', 'high-fived'}
3673 {'data-driven', 'three-tiered', 'best- and', 'high-quality', 'worst-performing', 'daily-wage', 'Rajasthan-specific', 'stock-outs', 'up-skilled', 'one-year'}
3674 {'clear-cut', 'down-ranked', 'co-opted', 'fact-checking', 'start-up', 'fact-checkers', 'well-being', 'well-meaning', 'long-term', 'content-moderation'}
3675 {'red-hot', 'hard-won'}
3676 {'win-win', 'day-to', '19-year'}
3677 {'art-induced', 'da-da'}
3678 {'unreadable - gigabytes', 'health-related', 'microbes - a', 'so-called', 'four-letter'}
3679 {'three-fifths', '18th-cent

3779 {'FDA-approved', 'Fast-forward', 'laser-perforated', 'mid-century', 'so-called', 'sixty-three', 'first-ever', 'third-largest', '10-second', 'smoking-related', 'e-cigarettes', 'health-concerned', 'cigarette-related', 'drug-delivery', 'nicotine-reduction', 'long-term', 'population-level'}
3780 {'modern-day', 'second-degree', 'majority-white', 'long-standing'}
3781 {'high-energy', 'one-eighth', 'gamma-ray'}
3782 {'energy-generating'}
3783 {'high-starch', 'multidrug-resistant', 'drinking-water', 'antibiotic-free', 'low-fiber', 'middle- and', 'lower-protein', 'antibiotic-use', 'well-balanced', 'antibiotic-containing', 'three-pronged', 'antibiotic-reduced', 'antibiotic-resistant', 'high-income', 'higher-fiber', 'large-scale', 'better-developed'}
3784 {'bag-like', 'blood-gas', 'low-oxygen'}
3785 {'co-conspirators', 'often-unheard', 'so-called', 'life-threatening', 'point-blank'}
3786 {'X-Road', 'e-Estonia', 'location-independent', 'start-up', 'user-centric', 'e-Residency', 'e-Cabinet', '

3934 {'forty-nine', 'the-country', 'co-ops', 'Six-thousand', 'day-to', 'non-farmers', 'thirty-two', 'year-round', 'great-grandfather', 'renewable-energy', 'pick-your', 'co-op', 'all-of', 'first-in'}
3935 {'so-called', 'long-awaited', 'non-Christian'}
3936 {'single-handedly', 'climate-smart', 'well-being', 'sub-Saharan', 'kick-start'}
3937 {'cutting-edge', 'tune-up', 'second-order'}
3938 {'100-year', 'every-person', 'Martínez-Patiño', 'high-security', 'well-known', 'topsy-turvy', 'fast-forward'}
3939 {'multi-storied', 'city-states', 'cross-dressers', 'Naram-Sin', 'wedge-shaped'}
3940 {'fast-running', 'trillion-frame', 'self-driving', 'high-speed', 'low-power', 'of-sight', 'home-security', 'room-sized', 'ultra-high', 'real-time', 'per-second', 'non-line', 'eye-safe'}
3941 {'flood-prone', 'multi-pronged', 'the-art', '680-tonne', 'low-lying', '240-meter', 'state-of'}
3942 {'contact-trace', 'a-century', 'Korea-type', 'open-source', 'data-sharing', 'social-distance', 'self-swab', 'rich-world

In [114]:
tedtalks.transcript[1630]

"I work with children with autism. Specifically, I make technologies to help them communicate. Now, many of the problems that children with autism face, they have a common source, and that source is that they find it difficult to understand abstraction, symbolism. And because of this, they have a lot of difficulty with language. Let me tell you a little bit about why this is. You see that this is a picture of a bowl of soup. All of us can see it. All of us understand this. These are two other pictures of soup, but you can see that these are more abstract These are not quite as concrete. And when you get to language, you see that it becomes a word whose look, the way it looks and the way it sounds, has absolutely nothing to do with what it started with, or what it represents, which is the bowl of soup. So it's essentially a completely abstract, a completely arbitrary representation of something which is in the real world, and this is something that children with autism have an incredibl

In [51]:
tedtalks.transcript.str.count('.')

0       11499
1       17251
2       17673
3       18083
4       16981
        ...  
3982     9695
3983     4810
3984     9418
3985     4010
3986     4022
Name: transcript, Length: 3987, dtype: int64

In [80]:
pd.read_csv('~/data/ted_talks_en.csv').transcript[2173]

"I've been living in rural East Africa for about 10 years, and I want to share a field perspective with you on global poverty. I believe that the greatest failure of the human race is the fact that we've left more than one billion of our members behind. Hungry, extreme poverty: these often seem like gigantic, insurmountable problems, too big to solve. But as a field practitioner, I believe these are actually very solvable problems if we just take the right strategies. Archimedes was an ancient Greek thinker, and he taught us that if we lean on the right levers, we can move the world. In the fight against extreme poverty, I believe there are three powerful levers that we can lean on. This talk is all about those levers, and why they make poverty a winnable fight in our lifetimes. What is extreme poverty? When I first moved to rural East Africa, I stayed overnight with a farm family. They were wonderful people. They invited me into their home. We sang songs together and ate a simple dinn

# Load OpenSubtitles.csv

In [122]:
from datasets import load_dataset
opensub=load_dataset('csv',data_files='/home/nxingyu/data/opensubtitles.csv')

Using custom data configuration default


Downloading and preparing dataset csv/default-31343ecce9569509 (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /home/nxingyu/.cache/huggingface/datasets/csv/default-31343ecce9569509/0.0.0/2960f95a26e85d40ca41a230ac88787f715ee3003edaacb8b1f0891e9f04dda2...


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset csv downloaded and prepared to /home/nxingyu/.cache/huggingface/datasets/csv/default-31343ecce9569509/0.0.0/2960f95a26e85d40ca41a230ac88787f715ee3003edaacb8b1f0891e9f04dda2. Subsequent calls will reuse this data.


In [161]:
opensub=pd.read_csv('/home/nxingyu/data/open_subtitles_processed.csv',dtype='string')

In [176]:
opensub[opensub.talk_id.map(lambda x:x.split('/')[4]=='1990')].transcript.apply(removespeakertags)

20721    Alright Suzi are you ready? Just a little bit ...
20722    Don't worry. You don't trust me? I got the mon...
20723    The details. Terrifying. I think malibu barbie...
20724    - Good morning - Good morning, Uncle Wong. Dra...
20725     Brakes Hissing Woman Narrating Her name was O...
                               ...                        
21480    VIGOUR You keep driving but we're not getting ...
21481    WOMAN: Shall I tell you my dream? I dreamt I w...
21482    My name is Lam Pik Seng, I'm 1 7 years old I w...
21483    Aah!. which was at first thought to be the sit...
21484    Get your money ready please. That's 143 rouble...
Name: transcript, Length: 764, dtype: object

In [180]:
tedtalks.transcript.map(lambda x:x[-4:])

0       it. 
1       ch. 
2       ar! 
3       os! 
4       ch. 
        ... 
3982    .org
3983    ore.
3984    ou. 
3985    il. 
3986    utt.
Name: transcript, Length: 3987, dtype: object

In [211]:
def punct_proportion(df):
    l=[]
    for c in ".?!,;:-—":
        l.append(sum(df.transcript.str.count("\\"+c)))
    [print(i[0],i[1],i[1]/sum(l)) for i in zip(list(".?!,;:-—"),l)]
punct_proportion(tedtalks)

. 410157 0.3962796818222661
? 36774 0.035529782545054726
! 3277 0.003166125452769466
, 494575 0.477841469576887
; 5846 0.005648205491879859
: 12950 0.012511847608594626
- 38065 0.036777102642560185
— 33375 0.03224578485998808


In [203]:
tedtalks = pd.read_csv('~/data/ted_talks_processed.csv')[['talk_id','transcript']].sample(frac=1,random_state=42).reset_index(drop=True)
split_1=int(0.8 * len(tedtalks))
split_2=int(0.9 * len(tedtalks))
tedtalks_train = tedtalks[:split_1]
tedtalks_dev = tedtalks[split_1:split_2]
tedtalks_test = tedtalks[split_2:]


In [212]:
for split in tedtalks_train, tedtalks_dev, tedtalks_test:
    punct_proportion(split)
    print()

. 328524 0.3951103702581199
? 29405 0.03536490617866584
! 2698 0.0032448398867553287
, 397931 0.47858501889415667
; 4770 0.005736799948044076
: 10394 0.012500691542970677
- 30778 0.03701619052429781
— 26974 0.03244118276698971

. 42582 0.40382755154296984
? 3825 0.03627449120877037
! 278 0.002636420537526317
, 49507 0.46950097680329267
; 602 0.0057090833222692185
: 1342 0.01272689338618819
- 3813 0.036160688883409516
— 3497 0.03316389431557385

. 39051 0.39807745236954506
? 3544 0.03612676989571759
! 301 0.0030683289330166465
, 47137 0.48050438842393906
; 474 0.004831853535713922
: 1214 0.012375253570372786
- 3474 0.03541320502757418
— 2904 0.029602748244120736



In [216]:
# import os
paths=os.path.splitext('test.csv')
paths[0]+'.test'+paths[1]

'test.test.csv'