In [1]:
import pandas

In [2]:
# Load the TED talk transcripts CSV into a DataFrame.
df = pandas.read_csv(filepath_or_buffer='transcripts.csv')

In [3]:
# Preview the first five rows of the transcripts frame.
df.head(n=5)

Unnamed: 0,transcript,url
0,Good morning. How are you?(Laughter)It's been ...,https://www.ted.com/talks/ken_robinson_says_sc...
1,"Thank you so much, Chris. And it's truly a gre...",https://www.ted.com/talks/al_gore_on_averting_...
2,"(Music: ""The Sound of Silence,"" Simon & Garfun...",https://www.ted.com/talks/david_pogue_says_sim...
3,If you're here today — and I'm very happy that...,https://www.ted.com/talks/majora_carter_s_tale...
4,"About 10 years ago, I took on the task to teac...",https://www.ted.com/talks/hans_rosling_shows_t...


In [4]:
# Show the dtype pandas assigned to each column of the transcripts frame.
df.dtypes

transcript    object
url           object
dtype: object

In [5]:
# Load the second CSV, 'ted_main.csv', into its own DataFrame.
ted = pandas.read_csv(filepath_or_buffer='ted_main.csv')

## 1. Text Preprocessing

In [6]:
import string

In [7]:
import nltk

### Step 1: Tokenization

In [8]:
# Try the tokenizer on the first transcript only, then print the tokens it produces.
sample_word_tokens = nltk.wordpunct_tokenize(df['transcript'][0])
print(sample_word_tokens)

['Good', 'morning', '.', 'How', 'are', 'you', '?(', 'Laughter', ')', 'It', "'", 's', 'been', 'great', ',', 'hasn', "'", 't', 'it', '?', 'I', "'", 've', 'been', 'blown', 'away', 'by', 'the', 'whole', 'thing', '.', 'In', 'fact', ',', 'I', "'", 'm', 'leaving', '.(', 'Laughter', ')', 'There', 'have', 'been', 'three', 'themes', 'running', 'through', 'the', 'conference', 'which', 'are', 'relevant', 'to', 'what', 'I', 'want', 'to', 'talk', 'about', '.', 'One', 'is', 'the', 'extraordinary', 'evidence', 'of', 'human', 'creativity', 'in', 'all', 'of', 'the', 'presentations', 'that', 'we', "'", 've', 'had', 'and', 'in', 'all', 'of', 'the', 'people', 'here', '.', 'Just', 'the', 'variety', 'of', 'it', 'and', 'the', 'range', 'of', 'it', '.', 'The', 'second', 'is', 'that', 'it', "'", 's', 'put', 'us', 'in', 'a', 'place', 'where', 'we', 'have', 'no', 'idea', 'what', "'", 's', 'going', 'to', 'happen', ',', 'in', 'terms', 'of', 'the', 'future', '.', 'No', 'idea', 'how', 'this', 'may', 'play', 'out', '.'

In [9]:
# Tokenize every transcript and store the token lists in a new 'word_tokens' column.
# wordpunct_tokenize is used so the whole dataset is tokenized exactly like the
# sample record: it splits contractions into word and punctuation pieces
# ("it's" -> 'it', "'", 's'), which the stop-word and punctuation filters in
# normalize() can then drop. word_tokenize would instead emit pieces such as
# "'s" and "n't" that pass through both filters.
df['word_tokens'] = df.transcript.apply(nltk.wordpunct_tokenize)

### Step 2: Normalization

* Normalization generally refers to a series of related tasks meant to put all text on a level playing field: converting all text to the same case (upper or lower), removing punctuation, converting numbers to their word equivalents, and so on.

In [10]:
import unicodedata

In [11]:
import re

In [12]:
# English stop words from NLTK's stopwords corpus; normalize() drops any token found here.
stopwords = nltk.corpus.stopwords.words('english')

In [13]:
# Set of ASCII punctuation characters, used to recognise punctuation-only tokens.
punct = {symbol for symbol in string.punctuation}

In [14]:
def normalize(list_of_tokens, stop_words=None, punctuation=None):
    """Normalize a list of word tokens.

    Each token is reduced to ASCII (accents are stripped, characters with no
    ASCII equivalent such as the em dash are dropped) and lower-cased. Tokens
    that end up empty, are stop words, or consist only of punctuation
    characters are discarded.

    Parameters
    ----------
    list_of_tokens : iterable of str
        Tokens to normalize, e.g. the output of a tokenizer.
    stop_words : iterable of str, optional
        Lower-case words to drop. Defaults to the notebook-level `stopwords`.
    punctuation : iterable of str, optional
        Characters treated as punctuation. Defaults to the notebook-level
        `punct`.

    Returns
    -------
    list of str
        The surviving normalized tokens, in their original order.
    """
    # Build the lookup sets once per call: membership tests against the raw
    # stop-word list would be a linear scan for every single token.
    stop_set = frozenset(stopwords if stop_words is None else stop_words)
    punct_set = frozenset(punct if punctuation is None else punctuation)

    new_word_list = []
    for word in list_of_tokens:

        # Decompose accented characters (NFKD) and drop everything that is not
        # ASCII. This also removes em dashes, so no separate strip is needed.
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('ascii')

        # Convert all characters to lowercase
        new_word = new_word.lower()

        # TODO:
        # Replace all integer occurrences with textual representation
        # if word.isdigit():

        # Nothing left after ASCII folding (e.g. the token was a lone em dash)
        if not new_word:
            continue

        # If stopword, ignore token and continue
        if new_word in stop_set:
            continue

        # If punctuation only, ignore token and continue
        if all(char in punct_set for char in new_word):
            continue

        new_word_list.append(new_word)

    return new_word_list

In [16]:
# Check the normalizer on the sample tokens from the first transcript.
normalize(sample_word_tokens)

['good',
 'morning',
 'laughter',
 'great',
 'blown',
 'away',
 'whole',
 'thing',
 'fact',
 'leaving',
 'laughter',
 'three',
 'themes',
 'running',
 'conference',
 'relevant',
 'want',
 'talk',
 'one',
 'extraordinary',
 'evidence',
 'human',
 'creativity',
 'presentations',
 'people',
 'variety',
 'range',
 'second',
 'put',
 'us',
 'place',
 'idea',
 'going',
 'happen',
 'terms',
 'future',
 'idea',
 'may',
 'play',
 'interest',
 'education',
 'actually',
 'find',
 'everybody',
 'interest',
 'education',
 'find',
 'interesting',
 'dinner',
 'party',
 'say',
 'work',
 'education',
 'actually',
 'often',
 'dinner',
 'parties',
 'frankly',
 'laughter',
 'work',
 'education',
 'asked',
 'laughter',
 'never',
 'asked',
 'back',
 'curiously',
 'strange',
 'say',
 'somebody',
 'know',
 'say',
 'say',
 'work',
 'education',
 'see',
 'blood',
 'run',
 'face',
 'like',
 'oh',
 'god',
 'know',
 'laughter',
 'one',
 'night',
 'week',
 'laughter',
 'ask',
 'education',
 'pin',
 'wall',
 'one',
