In [59]:
import re
import string
from nltk.stem.porter import *
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

porterstemmer = PorterStemmer()

def debate_results():
    file = open('debate.txt','r')
    debate_corpus = file.read()
    text = ' '.join(debate_corpus.splitlines())
    
    #stripping text of crosstalk and audience background noise
    without_crosstalk = text.replace('(CROSSTALK)','')
    refined_statement = without_crosstalk.replace('(inaudible)','')
    
    #regex patterns for extracting conversations of individual speakers
    lehrer_re = re.compile('(?<=LEHRER:).*?((?=OBAMA:)|(?=ROMNEY:)|(?=LEHRER:))')
    obama_re = re.compile('(?<=OBAMA:).*?((?=OBAMA:)|(?=ROMNEY:)|(?=LEHRER:))')
    romney_re = re.compile('(?<=ROMNEY:).*?((?=OBAMA:)|(?=ROMNEY:)|(?=LEHRER:))')
        
    lehrer_text = re.finditer(lehrer_re, refined_statement)    
    stem_output(lehrer_text, 'LEHRER')
    
    obama_text = re.finditer(obama_re, refined_statement)
    stem_output(obama_text, 'OBAMA')
    
    romney_text = re.finditer(romney_re, refined_statement)
    stem_output(romney_text, 'ROMNEY')
    
def stem_output(text, speaker):
    
    #code for removing punctuation, capitalization and stop words
    statement = ''
    for t in text:
        statement += t.group()    
    keys = str.maketrans({key: None for key in string.punctuation})
    out = statement.translate(keys) 
    lower_output = out.lower()
    stop_words = set(stopwords.words('english'))
    
    #tokenizing the text
    tokens = word_tokenize(lower_output)
    refined_sentence = [t for t in tokens if t not in stop_words]
    
    print('\nOutput for speaker '+ speaker +':')
    
    #Porter Stemmer
    porter_stemmed  = [porterstemmer.stem(s) for s in refined_sentence]
    output_frequency(porter_stemmed, 'Porter Stemmer')
    
    #Snowball Stemmer
    snowball_stemmed = [SnowballStemmer("english").stem(ps) for ps in refined_sentence]
    output_frequency(snowball_stemmed, 'Snowball Stemmer')
    
    #Lancaster Stemmer
    lancaster_stemmed = [LancasterStemmer().stem(ss) for ss in refined_sentence]
    output_frequency(lancaster_stemmed, 'Lancaster Stemmer')
    
    positive_word_frequency(porter_stemmed)
    
    
def output_frequency(text, stemmer_used):
    
    #Code for calculating frequency of words
    counter = Counter(text)
    print('\n' + stemmer_used + '(10 most frequent words):')
    print(counter.most_common(10))
    
def positive_word_frequency(text):
    
    #Logic of extracting common words from speech and 
    #positive dictionary and calculating their frequency
    
    file = open('positive.txt','r')
    dictionary = file.read()
    
    stemmed_dictionary = porterstemmer.stem(dictionary)
    counter = collections.Counter(text) 
    common = set(text).intersection( set(stemmed_dictionary.split('\n')) )
    positive_counter = Counter()
    
    for c in common:
        positive_counter[c] = counter[c]
        
    print('\n10 most frequent positive words: ')
    print(positive_counter.most_common(10))
     
if __name__ == '__main__':
    debate_results()


Output for speaker LEHRER:

Porter Stemmer(10 most frequent words):
[('governor', 23), ('two', 23), ('minut', 22), ('presid', 21), ('go', 21), ('right', 20), ('let', 18), ('mr', 15), ('first', 14), ('govern', 14)]

Snowball Stemmer(10 most frequent words):
[('governor', 23), ('two', 23), ('minut', 22), ('presid', 21), ('go', 21), ('right', 20), ('let', 18), ('mr', 15), ('first', 14), ('govern', 14)]

Lancaster Stemmer(10 most frequent words):
[('govern', 37), ('presid', 25), ('two', 23), ('minut', 22), ('right', 20), ('let', 18), ('mr', 15), ('first', 14), ('go', 13), ('seg', 12)]

10 most frequent positive words: 
[('right', 20), ('well', 7), ('support', 4), ('candid', 3), ('great', 1), ('even', 1), ('open', 1), ('good', 1), ('cheer', 1)]

Output for speaker OBAMA:

Porter Stemmer(10 most frequent words):
[('make', 57), ('governor', 50), ('go', 49), ('romney', 44), ('weve', 35), ('got', 34), ('tax', 34), ('that', 34), ('say', 30), ('know', 29)]

Snowball Stemmer(10 most frequent word

In [49]:
#Regular expression for extracting email pairs of given input text 
str = """

        ... austen-emma.txt:hart@vmd.cso.uiuc.edu (internet) hart@uiucvmd (bitnet)

       ... austen-emma.txt:Internet (72600.2026@compuserve.com); TEL: (212-254-5093) .

      .. austen-persuasion.txt:Editing by Martin Ward (Martin.Ward@uk.ac.durham)

      ... blake-songs.txt:Prepared by David Price, email ccx074@coventry.ac.uk... """

output = re.findall(r'[\w.]+@[\w.]+\b', str, flags=0)

print(output)

['hart@vmd.cso.uiuc.edu', 'hart@uiucvmd', '72600.2026@compuserve.com', 'Martin.Ward@uk.ac.durham', 'ccx074@coventry.ac.uk']


What differences do you observe between the different stemmer outputs?

Even though Snowball stemmer is considered as an improvement over Porter stemmer, at least for the above output both stemmers give identical results, whereas Lancaster stemmer has a few noticable differences. For example, in Lehrer's speech results we can see that Porter and Snowball have stemmed a word as 'governor'. But Lancaster has aggressively stemmed it even further to 'govern'. This has increased the number of matches since Lancaster's output also included words like 'government' and not just 'governor'. Similarly, words like 'president' and 'presidential' both got stemmed to 'presid' by Lancaster Stemmer (unlike Porter or Snowball). Hence, Lancaster stemmer is more aggressive in its stemming and produces more matches compared to other two stemmers.