## Week 1 HW


1. This assignment is a group effort.
2. Submission to be uploaded into your group repositories in the folder week1
3. Deadline is 27th of April 5:00 PM.
4. Please follow Google's [Python style guide](https://google.github.io/styleguide/pyguide.html) for your code. Pay attention to the guidelines for naming conventions, comments and main.
5. Code will be checked for plagiarism. Compelling signs of a duplicated effort will lead to a rejection of submission and will attract a 100\% grade penalty.

Use the template provided as a starting point. Extend the classes as you see fit. Be careful to place new attributes and methods in the appropriate class.

### 1

Extend the classes to include the following methods

1. document_term_matrix - which returns a D by V array of frequency counts.
2. tf_idf - returns a D by V array of tf-idf scores
3. dict_rank - returns the top `n` documents based on a given dictionary and representation of tokens (e.g. doc-term matrix or tf-idf matrix)  

Include subroutines as and when necessary


In [1]:
# In Matthew's environment, a harmless exception is thrown from the following import.
# Actually, it seems to be the first import, whatever it is.  Anyway, just ignore it.
try:
    from nltk.tokenize import wordpunct_tokenize
except Exception:
    pass

import numpy as np
import codecs
import nltk
import re
import csv
import json
import sys
import operator
from nltk import PorterStemmer
from math import log
from collections import Counter

In [2]:
class Document(object):
    """
    The Document class represents an individual document (one speech).

    Attributes:
        year:   the year of the speech, stored as passed to the constructor
        pres:   the president of the speech, stored as passed to the constructor
        text:   the speech text converted to lower case
        tokens: numpy array of the tokens currently kept for this document;
                starts as the wordpunct tokenisation of `text` and is
                rewritten by token_clean, stopword_remove and stem
    """
    def __init__(self, speech_year, speech_pres, speech_text):
        self.year = speech_year
        self.pres = speech_pres
        self.text = speech_text.lower()
        self.tokens = np.array(wordpunct_tokenize(self.text))

    def friendly_string(self):
        """
        Description: generate a friendly string to describe the document
        output: "<year> <pres> <excerpt of the text>"
        """
        # NOTE(review): the slice starts at index 1, so the first character of
        # the text is never shown -- confirm this is intended.
        return "{0} {1} {2}".format(self.year, self.pres, self.text[1:20])

    def token_clean(self, length):
        """
        Description: keep only purely alphabetic tokens that are strictly
                     longer than 'length'; every other token is dropped
        input: length: cut off length (tokens of this length or shorter are
                       removed)
        """
        self.tokens = np.array([t for t in self.tokens
                                if t.isalpha() and len(t) > length])

    def stopword_remove(self, stopwords):
        """
        Description: remove stopwords from tokens.
        input: stopwords: a suitable collection of stopwords
        """
        # Build a set once so each token is checked in constant time instead
        # of being compared against every stopword.
        stopword_set = set(stopwords)
        self.tokens = np.array([t for t in self.tokens
                                if t not in stopword_set])

    def stem(self):
        """
        Description: stem tokens with Porter Stemmer.
        """
        # One stemmer instance is enough for the whole document.
        stemmer = PorterStemmer()
        self.tokens = np.array([stemmer.stem(t) for t in self.tokens])

    def term_vector(self, corpus_token_list):
        """
        Description: generate a term-vector for this document.  The result
                     corresponds with a single row of the document-term-matrix
                     of the corpus
        input: corpus_token_list: a list of tokens from the corpus, a subset
                                  of which will be found in this document.
        output: a list with one count per entry of corpus_token_list, in the
                same order; tokens absent from the document count as 0
        """
        counts = Counter(self.tokens)
        return [counts[token] for token in corpus_token_list]

In [3]:
class Corpus(object):
    """
    The Corpus class represents a document collection.

    Attributes:
        docs:         list of Document objects, one per record of doc_data
        N:            number of documents
        clean_length: the token cut-off length passed to the constructor
        stopwords:    numpy array of stemmed stopwords
        token_set:    set of all tokens found in the cleaned documents
    """
    def __init__(self, doc_data, stopword_file, clean_length):
        """
        Description: build the documents, load the stopwords, clean the
                     documents and collect the vocabulary.
        input: doc_data: sequence of records; items 0, 1 and 2 of each record
                         are passed to Document as year, president and text
               stopword_file: path of the stopword file to parse
               clean_length: cutoff length applied to both stopwords and
                             document tokens
        """
        # Initialise documents by invoking the appropriate class
        self.docs = [Document(doc[0], doc[1], doc[2]) for doc in doc_data]
        self.N = len(self.docs)
        self.clean_length = clean_length

        # Get a list of stopwords
        self.create_stopwords(stopword_file, clean_length)

        # Stopword removal, token cleaning and stemming to docs.  The cutoff
        # is the caller's clean_length rather than a hard-coded value, so the
        # documents and the stopwords are filtered with the same length.
        self.clean_docs(clean_length)

        # Create vocabulary
        self.corpus_tokens()

    def clean_docs(self, length):
        """
        Description: applies token cleaning, stopword removal and stemming
                     (in that order) to every document.
        input: length: cutoff length handed to Document.token_clean
        """
        # NOTE(review): self.stopwords holds *stemmed* words (see
        # create_stopwords) but is compared here against tokens that have not
        # been stemmed yet -- confirm this ordering is intended.
        for doc in self.docs:
            doc.token_clean(length)
            doc.stopword_remove(self.stopwords)
            doc.stem()

    def create_stopwords(self, stopword_file, length):
        """
        Description: parses a file of stopwords (one per line), keeps the
                     words strictly longer than 'length' and stems them.
                     The result is stored in self.stopwords.
        input: length: cutoff length for words
               stopword_file: stopwords file to parse (read as UTF-8)
        """
        with codecs.open(stopword_file, 'r', 'utf-8') as f:
            raw = f.read()

        # One stemmer instance serves the whole list.
        stemmer = PorterStemmer()
        self.stopwords = np.array([stemmer.stem(word)
                                   for word in raw.splitlines()
                                   if len(word) > length])

    def corpus_tokens(self):
        """
        Description: create a set of all tokens or in other words a
                     vocabulary.  The result is stored in self.token_set.
        """
        self.token_set = set()
        for doc in self.docs:
            # In-place update avoids building a new set per document.
            self.token_set.update(doc.tokens)

    def _vocabulary(self):
        """
        Description: return the vocabulary as a sorted list, so that matrix
                     columns have one reproducible order shared by every
                     method of this class.
        """
        return sorted(self.token_set)

    def document_term_matrix(self):
        """
        Description: generate the document-term matrix for the corpus
        output: a list with one row per document; each row is the list of
                counts returned by Document.term_vector for the sorted
                vocabulary
        """
        vocabulary = self._vocabulary()
        return [doc.term_vector(vocabulary) for doc in self.docs]

    def tf_idf(self):
        """
        Description: generate the TF-IDF matrix for this corpus
        output: a list with one numpy array per document, with the same
                column order as document_term_matrix
        """
        dt_matrix = self.document_term_matrix()
        if not dt_matrix:
            return []

        counts = np.asarray(dt_matrix, dtype=float)
        present = counts > 0

        # Term frequency:
        # tf(d,v) = { 0 if x(d,v) = 0, 1 + log(x(d,v)) otherwise }
        tf_matrix = np.zeros(counts.shape)
        tf_matrix[present] = 1.0 + np.log(counts[present])

        # Document frequency: the number of documents containing each term.
        df_vector = present.sum(axis=0)

        # Inverse document frequency: idf(v) = log(D / df(v)).
        idf_vector = np.log(float(len(self.docs)) / df_vector)

        # Weight every row of the term-frequency matrix by the idf vector.
        return list(tf_matrix * idf_vector)

    def dict_rank(self, dictionary, use_tf_idf, n):
        """
        Description: rank the documents in this corpus against the provided
        dictionary.  Return the top n documents.
        input: dictionary: the dictionary against which to rank the documents
               use_tf_idf: True if the TF-IDF matrix is to be used; False if
                           the document-term matrix is to be used.
               n: the number of top-ranked documents to return
        output: a list of (document, score) pairs, highest score first; the
                score of a document is the sum of its matrix entries over the
                vocabulary terms that occur in the dictionary
        """
        if use_tf_idf:
            dtm = self.tf_idf()
        else:
            dtm = self.document_term_matrix()

        # Columns of the matrix whose term is in the dictionary.
        # NOTE(review): vocabulary tokens have been stemmed by clean_docs,
        # dictionary entries are compared exactly as given -- confirm the
        # caller supplies them in a matching form.
        dict_tokens_set = set(dictionary)
        columns = [index for index, token in enumerate(self._vocabulary())
                   if token in dict_tokens_set]

        # Get the score of each document
        sums = np.array([sum(row[index] for index in columns) for row in dtm],
                        dtype=float)

        # Order them and return the n top documents.  A real list is returned
        # so callers can take its length and index it.
        order = sorted(range(len(sums)), key=lambda k: sums[k], reverse=True)
        return [(self.docs[k], sums[k]) for k in order[0:n]]

In [4]:
def parse_text(textraw, regex):
    """
    Extracts speech records from a raw string:
      1. Finds every match of `regex` (compiled with MULTILINE and DOTALL);
         each match yields one record, converted to a mutable list
      2. Removes all newline characters from item 2 of every record
      3. Returns the records sorted in ascending order

    NOTE(review): newlines are removed without inserting a space, so words
    separated only by a line break end up joined -- confirm against the data.
    """
    pattern = re.compile(regex, re.MULTILINE | re.DOTALL)

    records = []
    for match in pattern.findall(textraw):
        # findall returns immutable tuples; copy into a list so the text
        # field can be rewritten.
        record = list(match)
        record[2] = record[2].replace('\n', '')
        records.append(record)

    return sorted(records)

In [5]:
# Read the raw corpus file; the context manager closes the handle as soon as
# the contents are in memory.
with open('./../data/pres_speech/sou_all.txt', 'r') as speech_file:
    text = speech_file.read()

# Raw string so that the regex escapes (\d, \*, \n) reach the `re` module
# unchanged.  The pattern has three capture groups, which parse_text turns
# into one [group 1, group 2, group 3] record per match.
regex = r'_(\d{4}).*?_[a-zA-Z]+.*?_[a-zA-Z]+.*?_([a-zA-Z]+)_\*+(\n{2}.*?)\n{3}'
pres_speech_list = parse_text(text, regex)

# Instantiate the corpus class
corpus = Corpus(pres_speech_list, './../data/stopwords/stopwords.txt', 2)

### 2

Pick a dictionary (or dictionaries) of your choice from the Harvard IV set, the Loughran-McDonald set, or some other of your choosing that you think may be relevant for the data you collected. Then conduct the following exercise:
1. Use the two methods above to score each document in your data.
2. Explore whether the scores differ according to the meta data fields you gathered: for example, do different speakers/sources/etc tend to receive a higher score than others?
3. Do the answers to the previous question depend on whether tf-idf weighting is applied or not? Why do you think there is (or is not) a difference in your answers?


In [6]:
# Harvard IV set
file_handler = './../data/dictionary/inquirerbasic2.csv'
# The context manager closes the file once numpy has parsed it.
with open(file_handler, 'rb') as dictionary_file:
    dictionary = np.loadtxt(dictionary_file, dtype = 'str',
                            delimiter = ';', skiprows = 1, comments = None)
# Take the first column of every row, strip any trailing '#' and digit
# characters, lower-case it and de-duplicate.
our_dictionary = sorted(set(elem[0].rstrip('#01234256789').lower() for elem in dictionary))

# We'll look at the top 10 documents.
n = 10

# Document term matrix
scored_docs = corpus.dict_rank(our_dictionary, False, n)
print("The highest ranked documents using DTM are:")
for doc, score in scored_docs:
    print("{0} {1} {2}".format(doc.year, doc.pres, score))
print("")

# TF-IDF
scored_docs = corpus.dict_rank(our_dictionary, True, n)
print("The highest ranked documents using TF-IDF are:")
for doc, score in scored_docs:
    print("{0} {1} {2}".format(doc.year, doc.pres, score))

The highest ranked documents using DTM are:
1980 Carter 10862.0
1981 Carter 10790.0
1946 Truman 8777.0
1907 Roosevelt 7780.0
1910 Taft 7743.0
1912 Taft 7248.0
1979 Carter 7091.0
1905 Roosevelt 6980.0
1974 Nixon 6976.0
1906 Roosevelt 6820.0

The highest ranked documents using TF-IDF are:
1981 Carter 3126.28385887
1980 Carter 2783.87538596
1907 Roosevelt 2644.91021908
1906 Roosevelt 2517.71438185
1905 Roosevelt 2372.89968469
1908 Roosevelt 2276.13355816
1901 Roosevelt 2202.12924387
1898 McKinley 2153.85100093
1912 Taft 2150.15922904
1910 Taft 2143.43236803


In [7]:
# Score every document in the corpus with the TF-IDF weighting.
scored_docs = corpus.dict_rank(our_dictionary, True, len(corpus.docs))

# Group the scores by president in a single pass over the scored documents.
scores_by_president = {}
for doc, score in scored_docs:
    scores_by_president.setdefault(doc.pres, []).append(score)
presidents = set(scores_by_president)

# Average score per president.
president_dictionary = {}
for president, scores in scores_by_president.items():
    president_dictionary[president] = sum(scores)/len(scores)

print("Average scores by president, from lowest to highest:")
print("")
# One row per president: right-aligned name, a bar with one '*' per 32 points
# of average score, and the average truncated to an integer.
for pres_score in sorted(president_dictionary.items(), key=operator.itemgetter(1)):
    print("{0} {1} {2}".format(pres_score[0].rjust(15), '*' * int(pres_score[1]/32), int(pres_score[1])))

Average scores by president, from lowest to highest:

     Washington ******* 249
      Jefferson ******** 285
        Madison ********** 327
         Monroe *************** 480
          Adams *************** 481
         Wilson **************** 518
           Ford ******************** 657
         Hoover ********************* 673
     Eisenhower ********************** 733
         Taylor *********************** 757
        Harding *********************** 760
         Reagan ************************ 777
        Lincoln ************************ 784
        Johnson ************************ 792
          Tyler ************************** 832
           Bush ************************** 847
          Nixon ************************** 863
         Truman *************************** 874
       Coolidge *************************** 885
          Grant **************************** 924
        Jackson ***************************** 935
          Buren ****************************** 966
        Kenne

### 3

We will now do a sentiment analysis using the AFINN list of words. AFINN is a list of English words rated for valence with an integer between minus five (negative) and plus five (positive). The words have been manually labeled by Finn Årup Nielsen in 2009-2011. A positive valence score can be interpreted as the word conveying a positive emotion and vice versa. 

Load _AFINN-111.txt_ from ./data/AFINN. Inspect the contents of the file and write a function that converts it into a dictionary where the keys are words and values are the valence scores attributed to them. You may use the readme file for hints. 

In [8]:
def load_sentiment_dictionary(path):
    """
    description: load a sentiment dictionary from a tab-separated file in
                 which every line holds a word (or phrase) followed by an
                 integer score
    input: path: the path to the dictionary (read as UTF-8)
    output: a dict mapping each word to its integer score
    """
    d = {}
    # Decode as text (as Corpus.create_stopwords does) so the keys are
    # unicode strings.
    with codecs.open(path, 'r', 'utf-8') as sentiment_file:
        for line in sentiment_file:
            line = line.rstrip('\r\n')
            # Skip blank lines instead of failing on them.
            if not line.strip():
                continue
            fields = line.split('\t')
            d[fields[0]] = int(fields[1])

    return d

# Load the dictionary
sentiment_dictionary = load_sentiment_dictionary('../data/AFINN/AFINN-111.txt')

# Inspect sentiment dictionary: spot-check a few entries, then report its size.
for example_word in ('limited', 'badly', 'fabulous'):
    print('Example word "{0}": {1}'.format(example_word, sentiment_dictionary[example_word]))
print("Sentiment dictionary loaded with length {0}.".format(len(sentiment_dictionary)))

Example word "limited": -1
Example word "badly": -3
Example word "fabulous": 4
Sentiment dictionary loaded with length 2477.


### 4
Now, use the presidential speeches from last week's HW to calculate their sentiment scores. Match every word against the dictionary and come up with a metric that captures the sentiment value. If a word is not present mark its score as 0. Write a function that takes in a list of words and returns their sentiment score. What is the score of the speech you have been assigned? Which year and president gave the least and the most positive speech?

In [9]:
def load_words_from_file(path):
    """
    description: load a speech file and split its text into lower-case words
    input: path: path of a JSON file whose top-level value is a sequence;
                 item 2 of that sequence is used as the speech text
    output: a list of lower-case words with no empty strings
    """
    # Parse the file as JSON and take the speech text; the context manager
    # closes the file even if parsing fails.
    with open(path) as file_handle:
        speech = json.load(file_handle)[2]
    stripped_text = speech

    # Replace each listed punctuation character with a space.  This is
    # safer than replacing with an empty string because some punctuation
    # separates words without a space, i.e. '--'.
    for char in ',.:;[]"?$:-':
        stripped_text = stripped_text.replace(char, ' ')

    # Split on any run of whitespace.  This separates words divided by
    # newlines or tabs as well as spaces, and never yields empty strings,
    # so runs of adjacent spaces need no extra filtering.
    return [word.lower() for word in stripped_text.split()]

def calculate_sentiment_for_speech(sentiment_dictionary, words):
    """
    description: score a list of words against a sentiment dictionary
    input: sentiment_dictionary: dict mapping words to numeric scores
           words: the words to score; words missing from the dictionary
                  contribute nothing
    output: a tuple (average, total) where total is the sum of the scores of
            the matched words and average is total divided by the number of
            matched words (0.0 when no word matches)
    """
    matched_scores = [sentiment_dictionary[word] for word in words
                      if word in sentiment_dictionary]
    sentiment = sum(matched_scores)

    # No matched word: report a neutral average instead of dividing by zero.
    if not matched_scores:
        return 0.0, sentiment

    return float(sentiment)/float(len(matched_scores)), sentiment

In [10]:
def print_results(sentiment_dictionary, path, friendly_string):
    """
    description: print the sentiment of the speech stored in one file
    input: sentiment_dictionary: dict mapping words to sentiment scores
           path: path of the speech file, read with load_words_from_file
           friendly_string: label printed in front of the results
    """
    words = load_words_from_file(path)
    sentiment, cumul_sent_score = calculate_sentiment_for_speech(sentiment_dictionary, words)
    display_str = "{0} : sentiment = {1} ; cumulative sentiment score = {2}"
    # Single-argument call form, which is valid with and without the print
    # statement.
    print(display_str.format(friendly_string, sentiment, cumul_sent_score))

# Report the sentiment of each speech file, in this order.
for speech_path, speech_label in (
        ("../data/pres_speech/1977_Ford_Matthew.txt", "1977 Ford"),
        ("../data/pres_speech/1967_Johnson_Roger.txt", "1967 Johnson"),
        ("../data/pres_speech/1897_McKinley_miquel.txt", "1897 McKinley")):
    print_results(sentiment_dictionary, speech_path, speech_label)

1977 Ford : sentiment = 0.768802228412 ; cumulative sentiment score = 276
1967 Johnson : sentiment = 0.484955752212 ; cumulative sentiment score = 274
1897 McKinley : sentiment = 0.261044176707 ; cumulative sentiment score = 195


As is evident from the data above, of the three speeches we have analyzed, McKinley in 1897 had the lowest sentiment, while Ford in 1977 had the highest sentiment.