## Imports

In [1]:
import numpy as np
import nltk, io, os
import pandas as pd
import warnings
import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

## read data into dataframe

In [2]:
datapath = './Speeches/'
df_raw = pd.DataFrame([],columns=['Speech','Speaker','Text'])

for root, directories, files in os.walk(datapath):
    for filename in files:
        speech = os.path.splitext(filename)[0] # to get rid of .txt
        speech = speech[7:] # to remove Speech-
        filepath = os.path.join(root, filename)
        f = open(filepath,"r",encoding='latin') 
        content = f.read()
        f.close()
        content = content.split("\n\n",1) # max split = 1
        speaker = content[0]
        text = content[1:]
        text = ''.join(text) # list to string

        pieces = {'Speech': speech,'Speaker':speaker, 'Text':text}
        df_raw = df_raw.append(pieces,ignore_index=True)
print("Read %d raw speeches" % len(df_raw))

df_raw.head()

Read 381 raw speeches


Unnamed: 0,Speech,Speaker,Text
0,1960-06-29_8-102,The Taoiseach,The Minister's focus is on improving the healt...
1,1960-06-29_8-116,Mr. T.F. O'Higgins,That must be a very infrequent occurrence.\n
2,1971-06-09_23-2,Parliamentary Secretary to the Minister for th...,Is Ã© an cuspÃ³ir atÃ¡ ag an Rialtas nÃ¡ lÃ¡nf...
3,2015-01-27_20-105,Deputy Peter Mathews,Why do we not remember them here?\n
4,1971-06-09_19-17,The Taoiseach,"Yesterday's ceremony was a moving, fitting and..."


## Pre-processing

### Tokenize Content column

In [3]:
wordExp = r'\w+' # this pattern finds all the words
tokenizer = RegexpTokenizer(wordExp)

tokensList = [] # this is a list of lists. Each list contains the tokens of a document.

for content in df_raw['Text']:
    token_words = tokenizer.tokenize(content) #tokenize all words in the document
    tokensList.append(token_words) #add this list to tokensList

### Decapitalisation

In [4]:
def decapitalise(list_of_tokenLists):
    newTokenList = [] #this will store the new list
    
    for tokens in list_of_tokenLists: #for each list in the big list
        
        decapitalised = [] #this will store a list of decapitalised tokens from a single doc
        for word in tokens: #for each word in the list
            if not word.isupper(): #don't decapitalise if the whole word is in uppercase anyway - avoids decapitalising acronyms like US, LA, ID 
                decapitalised.append(word.lower()) # change to lower case
            else:
                decapitalised.append(word)
        newTokenList.append(decapitalised) #add decapitalised list to overall list
        
    return newTokenList

### Remove stopwords

In [6]:
def remove_stopwords(list_of_tokenLists):
    stopword_list = stopwords.words('english') #import default nltk stopwords
    newTokenList = [] #this will store the new list
    
    for tokens in list_of_tokenLists: #for each list in the big list
        notStop = [] #this will store a list of non-stopword tokens from a single doc
        for word in tokens: #for each word in the list
            if word not in stopword_list: #if word is not a stopword, append it
                notStop.append(word)
        newTokenList.append(notStop) #append newlist to the overall list
        
    return newTokenList

### Remove numbers
* This removes numbers from the token list  
* Does not remove words that contain numbers (e.g): 70th

In [7]:
def remove_numbers(list_of_tokenLists):
    newTokenList = [] #this will store the new list
    
    for tokens in list_of_tokenLists: #for each list in the big list
        
        noNumbers= [] #this will store a list of tokens from a single doc
        for word in tokens: #for each word in the list#
            if not word.isdigit():
                noNumbers.append(word) # remove number
        newTokenList.append(noNumbers)
        
    return newTokenList

### Remove punctuation

In [8]:
def punctuation(list_of_tokenLists):
    newTokenList = [] #this will store the new list
    
    for tokens in list_of_tokenLists: #for each list in the big list
        
        depunctuated = [] #this will store a list of tokens from a single doc
        for word in tokens: #for each word in the list
            depunctuated.append(word.translate(string.punctuation)) # remove punctuation
        newTokenList.append(depunctuated)
    return newTokenList

### Expand Contractions
* Removing punctuation needs to be applied before expanding contractions. Otherwise contractions with hyphens and apostrophes won't be found.
* Contractions are shortened version of words or syllables. 
* Converting each contraction to its expanded, original form helps with text standardization.

In [9]:
def expandContractions(inputList):
    contractions = { 
    "ain't": "am not / are not / is not / has not / have not",
    "aren't": "are not / am not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he had / he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how has / how is / how does",
    "I'd": "I had / I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "isn't": "is not",
    "it'd": "it had / it would",
    "it'd've": "it would have",
    "it'll": "it shall / it will",
    "it'll've": "it will have",
    "it's": "it has / it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she had / she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she has / she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as / so is",
    "that'd": "that would / that had",
    "that'd've": "that would have",
    "that's": "that has / that is",
    "there'd": "there had / there would",
    "there'd've": "there would have",
    "there's": "there has / there is",
    "they'd": "they had / they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we had / we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": " what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what has / what is",
    "what've": "what have",
    "when's": "when has / when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where has / where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who has / who is",
    "who've": "who have",
    "why's": "why has / why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you had / you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
    }
    #for every word in the input text
    for list_of_tokens in inputList:
        for word in list_of_tokens:
            # if the word is in our contractions dictionary replace it with the expanded version.
            if (word.lower() in contractions):
                inputList = inputList.replace(word, contractions[word.lower()])
            #if the word contains a hyphen, replace the hyphen with a space leaving two words
            if ("-" in word):
                inputList = inputList.replace(word, word.replace("-", " "))
    return (inputList)

## Lemmatization

In [10]:
def lemmatize(list_of_tokenLists):
    newTokenList = [] #this will store the new list
    wordnet_lemmatizer = WordNetLemmatizer()
    
    for tokens in list_of_tokenLists:
        tempLemmatized = []
        for word in tokens:
            tempLemmatized.append(wordnet_lemmatizer.lemmatize(word))
        newTokenList.append(tempLemmatized)
    return newTokenList

### Stemming

In [11]:
def stemming(list_of_tokenLists):
    newTokenList = [] #this will store the new list
    stemmer = PorterStemmer()
    
    for tokens in list_of_tokenLists:
        tempStemmed = []
        for word in tokens:
            tempStemmed.append(stemmer.stem(word))
        newTokenList.append(tempStemmed)
    return newTokenList

## Apply above functions to create "cleaned" tokens

In [12]:
def clean(list_of_tokenLists):
    list_of_tokenLists = decapitalise(list_of_tokenLists) #decapitalise
    list_of_tokenLists = remove_stopwords(list_of_tokenLists) #remove stopwords
    list_of_tokenLists = remove_numbers(list_of_tokenLists) #remove numbers
    list_of_tokenLists = punctuation(list_of_tokenLists) #remove punctuation
    list_of_tokenLists = expandContractions(list_of_tokenLists) #expand contractions
    list_of_tokenLists = stemming(list_of_tokenLists) # stemming
    return list_of_tokenLists

In [13]:
print("Tokens Before: ")
print(tokensList[0][:50])
print("------------")

tokensList = clean(tokensList) # clean
 
print("Tokens After: ")
print(tokensList[0][:50])

Tokens Before: 
['The', 'Minister', 's', 'focus', 'is', 'on', 'improving', 'the', 'health', 'services', 'In', 'respect', 'of', 'Deputy', 'Mathews', 'I', 'might', 'say', 'that', 'the', 'President', 'of', 'Ireland', 'Ã', 'achtarÃ', 'n', 'na', 'hÃ', 'ireann', 'represented', 'all', 'our', 'people', 'in', 'the', 'Mansion', 'House', 'on', 'Sunday', 'He', 'represented', 'everybody']
------------
Tokens After: 
['minist', 'focu', 'improv', 'health', 'servic', 'respect', 'deputi', 'mathew', 'I', 'might', 'say', 'presid', 'ireland', 'Ã', 'achtarã', 'n', 'na', 'hã', 'ireann', 'repres', 'peopl', 'mansion', 'hous', 'sunday', 'repres', 'everybodi']


In [14]:
df_raw['Tokens'] = tokensList
df_raw.head()

Unnamed: 0,Speech,Speaker,Text,Tokens
0,1960-06-29_8-102,The Taoiseach,The Minister's focus is on improving the healt...,"[minist, focu, improv, health, servic, respect..."
1,1960-06-29_8-116,Mr. T.F. O'Higgins,That must be a very infrequent occurrence.\n,"[must, infrequ, occurr]"
2,1971-06-09_23-2,Parliamentary Secretary to the Minister for th...,Is Ã© an cuspÃ³ir atÃ¡ ag an Rialtas nÃ¡ lÃ¡nf...,"[Ã, cuspã³ir, atã, ag, rialta, nã, lã, nfhosta..."
3,2015-01-27_20-105,Deputy Peter Mathews,Why do we not remember them here?\n,[rememb]
4,1971-06-09_19-17,The Taoiseach,"Yesterday's ceremony was a moving, fitting and...","[yesterday, ceremoni, move, fit, appropri, rec..."


## Join the cleaned tokens

In [15]:
joinedTokensList = []

for i in tokensList:
    joinedTokensList.append(" ".join(i))
df_raw['clean_Text'] = joinedTokensList
new_df = df_raw
new_df.head()

Unnamed: 0,Speech,Speaker,Text,Tokens,clean_Text
0,1960-06-29_8-102,The Taoiseach,The Minister's focus is on improving the healt...,"[minist, focu, improv, health, servic, respect...",minist focu improv health servic respect deput...
1,1960-06-29_8-116,Mr. T.F. O'Higgins,That must be a very infrequent occurrence.\n,"[must, infrequ, occurr]",must infrequ occurr
2,1971-06-09_23-2,Parliamentary Secretary to the Minister for th...,Is Ã© an cuspÃ³ir atÃ¡ ag an Rialtas nÃ¡ lÃ¡nf...,"[Ã, cuspã³ir, atã, ag, rialta, nã, lã, nfhosta...",Ã cuspã³ir atã ag rialta nã lã nfhostaã ocht c...
3,2015-01-27_20-105,Deputy Peter Mathews,Why do we not remember them here?\n,[rememb],rememb
4,1971-06-09_19-17,The Taoiseach,"Yesterday's ceremony was a moving, fitting and...","[yesterday, ceremoni, move, fit, appropri, rec...",yesterday ceremoni move fit appropri recognit ...


## Group by Year, tfidf, NMF

In [16]:
# Create new column to contain the Year
[n,d] = new_df.shape
new_df['Year'] = ['']*n

for index, row in new_df.iterrows():
    new_df['Year'].iloc[index] = (row['Speech'][:4])
    
new_df.head()

Unnamed: 0,Speech,Speaker,Text,Tokens,clean_Text,Year
0,1960-06-29_8-102,The Taoiseach,The Minister's focus is on improving the healt...,"[minist, focu, improv, health, servic, respect...",minist focu improv health servic respect deput...,1960
1,1960-06-29_8-116,Mr. T.F. O'Higgins,That must be a very infrequent occurrence.\n,"[must, infrequ, occurr]",must infrequ occurr,1960
2,1971-06-09_23-2,Parliamentary Secretary to the Minister for th...,Is Ã© an cuspÃ³ir atÃ¡ ag an Rialtas nÃ¡ lÃ¡nf...,"[Ã, cuspã³ir, atã, ag, rialta, nã, lã, nfhosta...",Ã cuspã³ir atã ag rialta nã lã nfhostaã ocht c...,1971
3,2015-01-27_20-105,Deputy Peter Mathews,Why do we not remember them here?\n,[rememb],rememb,2015
4,1971-06-09_19-17,The Taoiseach,"Yesterday's ceremony was a moving, fitting and...","[yesterday, ceremoni, move, fit, appropri, rec...",yesterday ceremoni move fit appropri recognit ...,1971


In [17]:
from sklearn.externals import joblib
groups_by_year = new_df.groupby('Year')
groupsList = groups_by_year.groups.keys()

for group in groupsList: #for each Year
    # get documents
    documents = groups_by_year.get_group(group)['clean_Text']
    
    tfidf_vectorizer = TfidfVectorizer() # declare vectorizer object
    
    # Create sparse NumPy array where the entries are all TF-IDF normalised
    tfidf = tfidf_vectorizer.fit_transform(documents) 
    print( "Created %d X %d TF-IDF-normalized document-term matrix" % (tfidf.shape[0], tfidf.shape[1]) )    
    terms = tfidf_vectorizer.get_feature_names()
    print("Vocabulary has %d distinct terms" % len(terms))
    
    # Save each tfidf
    filepath = "./tfidf/tfidf-{}.txt".format(group)
    print("Checking for %s" % filepath)
    # Check if file has already been downloaded
    if not os.path.exists( filepath ):
        print("Writing %s" % filepath)
        joblib.dump((tfidf,terms), filepath)
print("Process Complete")

Created 126 X 1573 TF-IDF-normalized document-term matrix
Vocabulary has 1573 distinct terms
Checking for ./tfidf/tfidf-1960.txt
Created 142 X 1597 TF-IDF-normalized document-term matrix
Vocabulary has 1597 distinct terms
Checking for ./tfidf/tfidf-1971.txt
Created 112 X 972 TF-IDF-normalized document-term matrix
Vocabulary has 972 distinct terms
Checking for ./tfidf/tfidf-2015.txt


ValueError: empty vocabulary; perhaps the documents only contain stop words

In [18]:
# Function to display the document topics
import numpy as np
def get_topics( terms, H, topic_index, top ):
    # Reverse sort the values to sort the indices
    top_indices = np.argsort( H[topic_index,:] )[::-1]
    # Terms for top-ranked indices
    top_terms = []
    for term_index in top_indices[0:top]:
        # Append terms to top_terms
        top_terms.append( terms[term_index] )
    return top_terms

In [19]:
num_topics = 5

In [20]:
import glob
from sklearn import decomposition
allfiles = glob.glob('./tfidf/tfidf-*.txt')
for file in allfiles:
    print("Processing %s" % file)
    from sklearn.externals import joblib
    (tfidf,terms) = joblib.load( file )
    print( "Loaded %d X %d document-term matrix" % (tfidf.shape[0], tfidf.shape[1]) )
    
    nmf_model = decomposition.NMF(n_components=num_topics, init="nndsvd") # use randominitialization
    W = nmf_model.fit_transform( tfidf ) # W = matrix that contains the topics discovered from the documents
    H = nmf_model.components_ # H = coefficient matrix containing the membership weights for the topics in each document
    
    # To display the document topics
    print(file[14:-4])# To print (e.g.): 1960
    topic_words = []
    for topic_index in range(num_topics):
        topic_words.append( get_topics( terms, H, topic_index, 10 ) )
        str_topic_words = ", ".join( topic_words[topic_index] )
        print("Topic %02d: %s" % ( topic_index+1, str_topic_words ) )
    
    # Save topic model
    num=0
    for i in range(num_topics):
        num+=1
        filepath = "./nmf_models/Speech-nmf-model-{}.txt".format(file[14:-4] + "-" + str(num))
        print("Checking for %s" % filepath)
        # Check if file has already been downloaded
        if not os.path.exists( filepath ):
            print("Writing %s" % filepath)
            joblib.dump((W,H,terms), filepath)
print("Process Complete")

Processing ./tfidf/tfidf-1960.txt
Loaded 126 X 1573 document-term matrix
1960
Topic 01: prison, minist, govern, peopl, would, state, hospit, book, governor, water
Topic 02: pleas, order, quiet, deputi, ask, legisl, busi, paper, cheann, lea
Topic 03: bill, taoiseach, could, deal, reform, respond, health, matter, vote, call
Topic 04: debat, tomorrow, inquiri, someth, state, vote, whip, minist, comhairl, ceann
Topic 05: deputi, troy, seat, mathew, resum, sorri, question, floor, rais, must
Checking for ./nmf_models/Speech-nmf-model-1960-1.txt
Checking for ./nmf_models/Speech-nmf-model-1960-2.txt
Checking for ./nmf_models/Speech-nmf-model-1960-3.txt
Checking for ./nmf_models/Speech-nmf-model-1960-4.txt
Checking for ./nmf_models/Speech-nmf-model-1960-5.txt
Processing ./tfidf/tfidf-1971.txt
Loaded 142 X 1597 document-term matrix
1971
Topic 01: bill, taoiseach, could, health, prison, govern, reform, minist, deputi, deal
Topic 02: ghaorthaidh, tha, bhfuil, aon, al, leith, gcã³ir, bhã, faoi, tio