In [1]:
import pandas

In [2]:
# Load the talk transcripts; the preview and dtypes below show a `transcript` text column and a `url` column.
df = pandas.read_csv('transcripts.csv')

In [3]:
# Preview the first rows of the transcripts frame.
df.head()

Unnamed: 0,transcript,url
0,Good morning. How are you?(Laughter)It's been ...,https://www.ted.com/talks/ken_robinson_says_sc...
1,"Thank you so much, Chris. And it's truly a gre...",https://www.ted.com/talks/al_gore_on_averting_...
2,"(Music: ""The Sound of Silence,"" Simon & Garfun...",https://www.ted.com/talks/david_pogue_says_sim...
3,If you're here today — and I'm very happy that...,https://www.ted.com/talks/majora_carter_s_tale...
4,"About 10 years ago, I took on the task to teac...",https://www.ted.com/talks/hans_rosling_shows_t...


In [4]:
# Check the column types; both columns are reported as `object` in the output below.
df.dtypes

transcript    object
url           object
dtype: object

In [5]:
# Load a second CSV, 'ted_main.csv'.
# NOTE(review): `ted` is not referenced again in the cells visible here — confirm it is still needed.
ted = pandas.read_csv('ted_main.csv')

## 1. Text Preprocessing

In [6]:
import string

In [7]:
import nltk

### Step 1: Tokenization

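* `nltk.wordpunct_tokenize` splits a raw string into runs of word characters and runs of punctuation; adjacent punctuation marks stay together as a single token (e.g. `'?('` in the output below).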

In [8]:
# Try the tokenizer on the first transcript before applying it to every row.
first_transcript = df.transcript[0]
sample_word_tokens = nltk.wordpunct_tokenize(first_transcript)
print(sample_word_tokens)

['Good', 'morning', '.', 'How', 'are', 'you', '?(', 'Laughter', ')', 'It', "'", 's', 'been', 'great', ',', 'hasn', "'", 't', 'it', '?', 'I', "'", 've', 'been', 'blown', 'away', 'by', 'the', 'whole', 'thing', '.', 'In', 'fact', ',', 'I', "'", 'm', 'leaving', '.(', 'Laughter', ')', 'There', 'have', 'been', 'three', 'themes', 'running', 'through', 'the', 'conference', 'which', 'are', 'relevant', 'to', 'what', 'I', 'want', 'to', 'talk', 'about', '.', 'One', 'is', 'the', 'extraordinary', 'evidence', 'of', 'human', 'creativity', 'in', 'all', 'of', 'the', 'presentations', 'that', 'we', "'", 've', 'had', 'and', 'in', 'all', 'of', 'the', 'people', 'here', '.', 'Just', 'the', 'variety', 'of', 'it', 'and', 'the', 'range', 'of', 'it', '.', 'The', 'second', 'is', 'that', 'it', "'", 's', 'put', 'us', 'in', 'a', 'place', 'where', 'we', 'have', 'no', 'idea', 'what', "'", 's', 'going', 'to', 'happen', ',', 'in', 'terms', 'of', 'the', 'future', '.', 'No', 'idea', 'how', 'this', 'may', 'play', 'out', '.'

In [19]:
# Tokenize every transcript; each row of `word_tokens` holds that talk's token list.
df['word_tokens'] = df['transcript'].apply(nltk.wordpunct_tokenize)

### Step 2: Normalization

* Normalization generally refers to a series of related tasks meant to put all text on a level playing field: converting all text to the same case (upper or lower), removing punctuation, converting numbers to their word equivalents, and so on.

* If the token is a stopword or if every character is punctuation, the token is dropped; every other token is kept in its normalized form. Using the part of speech to lemmatize the remaining tokens happens afterwards, in Step 3.

In [10]:
import unicodedata

In [11]:
import re

In [12]:
import inflect

In [13]:
# English stopword list from NLTK's stopwords corpus.
# NOTE(review): assumes the corpus is already available locally (e.g. via nltk.download('stopwords')) — confirm.
stopwords = nltk.corpus.stopwords.words('english')

In [14]:
# Set of the characters in string.punctuation; normalize() uses it to detect punctuation-only tokens.
punct = set(string.punctuation)

In [15]:
# inflect engine; normalize() calls its number_to_words() to spell out digit tokens.
p = inflect.engine()

In [16]:
# Normalize a list of tokenized words.
def normalize(list_of_tokens, stop_words=None, punctuation=None, number_engine=None):
    """Return a cleaned copy of ``list_of_tokens``.

    Each token is folded to ASCII (NFKD decomposition, then every remaining
    non-ASCII code point is dropped) and lower-cased. A token that consists
    only of digits after folding is replaced by the result of
    ``number_engine.number_to_words(token)``. Tokens that end up empty, that
    are stopwords, or that consist solely of punctuation are dropped.

    Parameters
    ----------
    list_of_tokens : iterable of str
        Tokens to normalize, e.g. one transcript's ``wordpunct_tokenize`` output.
    stop_words : collection of str, optional
        Lower-case words to drop. Defaults to the notebook-level ``stopwords``.
    punctuation : collection of str, optional
        Characters regarded as punctuation. Defaults to the notebook-level
        ``punct``.
    number_engine : object with ``number_to_words(str)``, optional
        Converts digit strings to words. Defaults to the notebook-level ``p``.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    # Resolve the defaults lazily so the function still works unchanged when
    # called as normalize(tokens) inside the notebook.
    # A set makes the per-token stopword test a hash lookup instead of a scan.
    stop_word_set = set(stopwords if stop_words is None else stop_words)
    punct_chars = punct if punctuation is None else punctuation
    engine = p if number_engine is None else number_engine

    new_word_list = []
    for word in list_of_tokens:
        # Remove non-ASCII characters. Characters without an ASCII
        # decomposition (such as the em-dash) vanish here, so no separate
        # stripping step is needed afterwards.
        folded = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('ascii')

        # Convert all characters to lowercase.
        new_word = folded.lower()

        # Replace integer tokens with their textual representation. The check
        # runs on the folded token so the converter only ever receives plain
        # ASCII digits (e.g. a superscript two has already become "2").
        if new_word.isdigit():
            new_word = engine.number_to_words(new_word)

        # Nothing left after folding: drop the token explicitly.
        if not new_word:
            continue

        # If stopword, ignore token and continue.
        if new_word in stop_word_set:
            continue

        # If punctuation only, ignore token and continue.
        if all(char in punct_chars for char in new_word):
            continue

        new_word_list.append(new_word)

    return new_word_list

In [18]:
# Sanity-check normalize() on the sample tokens from the first transcript.
print(normalize(sample_word_tokens))

['good', 'morning', 'laughter', 'great', 'blown', 'away', 'whole', 'thing', 'fact', 'leaving', 'laughter', 'three', 'themes', 'running', 'conference', 'relevant', 'want', 'talk', 'one', 'extraordinary', 'evidence', 'human', 'creativity', 'presentations', 'people', 'variety', 'range', 'second', 'put', 'us', 'place', 'idea', 'going', 'happen', 'terms', 'future', 'idea', 'may', 'play', 'interest', 'education', 'actually', 'find', 'everybody', 'interest', 'education', 'find', 'interesting', 'dinner', 'party', 'say', 'work', 'education', 'actually', 'often', 'dinner', 'parties', 'frankly', 'laughter', 'work', 'education', 'asked', 'laughter', 'never', 'asked', 'back', 'curiously', 'strange', 'say', 'somebody', 'know', 'say', 'say', 'work', 'education', 'see', 'blood', 'run', 'face', 'like', 'oh', 'god', 'know', 'laughter', 'one', 'night', 'week', 'laughter', 'ask', 'education', 'pin', 'wall', 'one', 'things', 'goes', 'deep', 'people', 'right', 'like', 'religion', 'money', 'things', 'big', '

In [21]:
# Normalize every transcript's token list and keep the result in its own column.
df['normalize_tokens'] = df['word_tokens'].apply(normalize)

In [28]:
import collections

In [31]:
# The 30 most frequent normalized tokens in the first transcript.
collections.Counter(df.normalize_tokens[0]).most_common(30)

[('laughter', 39),
 ('think', 26),
 ('education', 22),
 ('said', 22),
 ('people', 16),
 ('one', 14),
 ('know', 11),
 ('say', 10),
 ('things', 10),
 ('want', 9),
 ('like', 9),
 ('school', 9),
 ('way', 9),
 ('children', 8),
 ('really', 8),
 ('whole', 7),
 ('human', 7),
 ('world', 7),
 ('kids', 7),
 ('went', 7),
 ('come', 7),
 ('wrong', 7),
 ('system', 7),
 ('gillian', 7),
 ('thing', 6),
 ('future', 6),
 ('actually', 6),
 ('work', 6),
 ('never', 6),
 ('get', 6)]

### Step 3: Lemmatization

* Lemmatization is the process of looking up a single word form from the variety of morphological affixes that can be applied to indicate tense, plurality, gender, etc. First we need to identify the WordNet tag form based on the Penn Treebank tag, which is returned from NLTK’s standard pos_tag function. We check whether the Penn tag starts with ‘N’, ‘V’, ‘R’, or ‘J’ to identify whether the token is a noun, verb, adverb, or adjective; any other tag is treated as a noun. We then use the new tag to look up the lemma in the lexicon.

In [32]:
# WordNetLemmatizer looks up data from the WordNet lexicon.
# A single instance is created here and reused by the lemmatization cells below.
lemmatizer = nltk.WordNetLemmatizer()

In [36]:
# Short alias for NLTK's WordNet corpus; its NOUN/VERB/ADV/ADJ constants are used as POS arguments below.
wn= nltk.corpus.wordnet

In [38]:
# Trial run on the first record.
# First letter of the Penn Treebank tag -> WordNet POS constant; any other tag is treated as a noun.
penn_to_wordnet = {'N': wn.NOUN, 'V': wn.VERB, 'R': wn.ADV, 'J': wn.ADJ}
lemma_list = [
    lemmatizer.lemmatize(token, pos=penn_to_wordnet.get(tag[0], wn.NOUN))
    for token, tag in nltk.pos_tag(df.normalize_tokens[0])
]


In [43]:
# check the result: the 30 most frequent lemmas in the first transcript
collections.Counter(lemma_list).most_common(30)

[('laughter', 39),
 ('say', 32),
 ('think', 27),
 ('education', 22),
 ('thing', 16),
 ('people', 16),
 ('go', 15),
 ('one', 14),
 ('know', 14),
 ('get', 12),
 ('come', 12),
 ('way', 11),
 ('school', 10),
 ('want', 9),
 ('see', 9),
 ('like', 9),
 ('child', 9),
 ('kid', 9),
 ('system', 9),
 ('talk', 8),
 ('year', 8),
 ('really', 8),
 ('whole', 7),
 ('human', 7),
 ('world', 7),
 ('wrong', 7),
 ('gillian', 7),
 ('future', 6),
 ('actually', 6),
 ('work', 6)]

In [46]:
# wrap-up into function and apply to entire data
def lemmantize_tokens(list_of_tokens):
    """Lemmatize a list of tokens using their part-of-speech tags.

    The tokens are tagged with ``nltk.pos_tag``. The first letter of each Penn
    Treebank tag selects the WordNet part of speech ('N' noun, 'V' verb,
    'R' adverb, 'J' adjective); any other tag falls back to noun. Each token
    is then passed to the notebook-level ``lemmatizer`` with that part of
    speech.

    Parameters
    ----------
    list_of_tokens : list of str
        Tokens to lemmatize, e.g. one row of ``normalize_tokens``.

    Returns
    -------
    list of str
        One lemma per input token, in the same order.

    NOTE(review): in this notebook the input has already had stopwords and
    punctuation removed, so the tagger does not see the original sentence
    context — confirm that the resulting tag quality is acceptable.
    """
    # Built once per call instead of once per token.
    penn_to_wordnet = {
        'N': wn.NOUN,
        'V': wn.VERB,
        'R': wn.ADV,
        'J': wn.ADJ,
    }
    # tag[:1] (rather than tag[0]) keeps an empty tag on the noun fallback
    # instead of raising IndexError.
    return [
        lemmatizer.lemmatize(token, pos=penn_to_wordnet.get(penn_tag[:1], wn.NOUN))
        for token, penn_tag in nltk.pos_tag(list_of_tokens)
    ]
    

In [47]:
# Lemmatize every transcript's normalized tokens and store the result in a new column.
df["lemmatized_tokens"] = df["normalize_tokens"].apply(lemmantize_tokens)