# homework 4 notebook:
creating a logistic regression classifier 

In [11]:
# In the chapter on logistic regression, the book suggests a very small number
# of features for classifying movie reviews
# x1: count of positive words in the document
# x2: count of negative words in the document
# x3: 1 if "no" is in document, 0 otherwise
# x4 count of first and second person pronouns
# x5 1 if "!" is in document, 0 otherwise
# x6 log(word count of document)

# let's see if it works with Stochastic Gradient Descent
import string

from nltk.corpus import movie_reviews
import random
import nltk
import math
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression


In [21]:
#the following block of code creates a dictionary of sentiment words whose value is 1 if it is a positive word
# and 0 if it is a negative word.
sentimentWordDictionary = {}
f = open('positive-words.txt', 'r', encoding="latin-1")
for line in f:
    line = line.strip()
    if len(line) == 0: # ignore this line
        continue
    if line[0] == ';': # ignore this line
        continue
    sentimentWordDictionary[line.lower()] = 1
f.close()
f = open('negative-words.txt', 'r', encoding="latin-1")
for line in f:
    line = line.strip()
    if len(line) == 0: # ignore this line
        continue
    if line[0] == ';': # ignore this line
        continue
    sentimentWordDictionary[line.lower()] = 0
f.close()

# for debugging purposes
print("There are", len(sentimentWordDictionary), "sentiment words.")

# Grab all the documents and shuffle them
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)
print("There are", len(documents), "documents.") # good for debugging


There are 6786 sentiment words.
There are 2000 documents.


In [22]:
# this function builds feature x1 and x2 by finding the num of positive and negative words in a certain document
def countPositiveAndNegativeWords(d):
    '''
    Counts the number of positive and negative words in the document
    :param d: A list containing the words in the document
    :return: A tuple (positive, negative) with two integer values representing
            the number of positive and negative words
    '''
    countPositive = 0
    countNegative = 0
    # iterate through every word in the document (bag of words)
    for word in d:
        # reference the dictionary containing the seniment for each word
        # if the word is associated with a postive sentiment increment the count by 1
        if word in sentimentWordDictionary:
            if sentimentWordDictionary[word]== 1:
                countPositive+=1
            else:
                countNegative+=1
        
    # you have to do this
    return countPositive, countNegative


In [23]:
# this function builds feature x3 for each document, checks if no is in the document 
def noInDocument(d):
    '''
    Returns 1 if the word "no" is in the document.   You may
    want to contemplate whether you need to make this case sensitive or not.
    :param d: A list of words in the document.
    :return: 1 if no is in document; 0, otherwise.
    '''
    # convert all words to lowercase in the document first , will catch No and no
    words = [word.lower() for word in d]

    # check if the word no is in the list 
    if "no" in words:
        return 1
    # otherwise return 0 
    return 0

In [24]:
def countFirstSecondPersonPronouns(d):
    '''
    Returns a count of the number of first and second person pronouns
    within the document.  You might want to look up what is a first or second
    person pronoun.
    :param d: A list of words in the document.
    :return: The count of personal pronouns
    '''
    # create a list of first and second pronouns
    count = 0
    firstandsecondpronouns = ["i","we","you","me","us","my","our","your","mine","ours","yours","myself","ourselves","yourself","yourself"]
     # convert all words to lowercase in the document first
    words = [word.lower() for word in d]
    for word in words:
        if word in firstandsecondpronouns:
            count+=1
    return count

In [25]:

def exclamationInDocument(d):
    '''
    Returns 1 if the word "!" is in the document.
    :param d: A list of words in the document.
    :return: 1 if ! is in document; 0, otherwise.
    '''
    for word in d:
        if word == "!":
            return 1

    # complete this
    return 0


In [26]:
def logOfLength(d):
    '''
    Computes and returns the log of the number of tokens in the document.
    :param d: A list of words in the document.
    :return: log(number of words)
    '''
    # complete this
    # find the num of tokens 
    numtokens = len(d)
    
    return math.log(numtokens)

In [30]:
def document_features(document):
    '''
    Builds the set of features for each document.
    You don't need to modify this unless
    you want to add another feature.
    :param document: A list of words in the document.
    :return: A dictionary containing the features for that document.
    '''
    document_words = list(document) # do not turn into a set!!
    features = {}
    positive, negative = countPositiveAndNegativeWords(document_words)

    features['positiveCount']  = positive
    features['negativeCount'] = negative
    features['noInDoc'] = noInDocument(document_words)
    features['personalPronounCount'] = countFirstSecondPersonPronouns(document_words)
    features['exclamation'] = exclamationInDocument(document_words)
    features['logLength'] = logOfLength(document_words)

    return features

In [31]:
# for each document, extract its features
featuresets = [(document_features(d), c) for (d,c) in documents]

# build the training and test sets
trainingSize = int(0.8*len(featuresets))
train_set, test_set = featuresets[0:trainingSize], featuresets[trainingSize:]

# use stochastic gradient descent with log loss function
classifier = LogisticRegression(max_iter=1000, verbose=0)
x_Train = [list(a.values()) for (a,b) in train_set]
y_Train = [b for (a,b) in train_set]
classifier.fit(x_Train, y_Train)

# print(classifier.coef_)  # if you want to see the coefficients, unsorted

x_Test = [list(a.values()) for (a,b) in test_set]
y_Test = [b for (a,b) in test_set]

print("LR Fit", classifier.score(x_Test, y_Test))

LR Fit 0.72


In [32]:
#here is a block of code that sorts the features by absolute value
# and prints them out
featureNames = ['positiveCount', 'negativeCount', 'noInDoc', 'personalPronounCount',  'exclamation', 'logLength']
featuresPlusImportance = [ (featureNames[i], classifier.coef_[0][i]) for i in range(len(classifier.coef_[0]))]
featuresPlusImportance.sort(key = lambda x: abs(x[1]), reverse=True)
for x in range(len(featuresPlusImportance)):
    print(featuresPlusImportance[x])


('logLength', np.float64(0.6169818069093244))
('exclamation', np.float64(-0.42969835122407063))
('noInDoc', np.float64(-0.33406294331447434))
('positiveCount', np.float64(0.10237283510207737))
('negativeCount', np.float64(-0.08100240801434008))
('personalPronounCount', np.float64(-0.025524583773895115))


In [35]:
correct_tags = [c for (w, c) in test_set]
test_tags = list(classifier.predict(x_Test))

# how about its precision and recall per category
mtrx = nltk.ConfusionMatrix(correct_tags, test_tags)
print()
print(mtrx)
print()
print(mtrx.evaluate())



    |   n   p |
    |   e   o |
    |   g   s |
----+---------+
neg |<137> 58 |
pos |  54<151>|
----+---------+
(row = reference; col = test)


Tag | Prec.  | Recall | F-measure
----+--------+--------+-----------
neg | 0.7173 | 0.7026 | 0.7098
pos | 0.7225 | 0.7366 | 0.7295



# Results 

The logistic regression classifier returned an accuracy score of 72%, meaning it used the fearures extracted to predict the sentiment correctly about 72% of the time. \mThe classifier returned a precision score of about 72% for both negative and postive sentiment movies. 

This means when a movie was classified as neg, how many times was it actually negative , the true positives out of the all the classified positives. a movie was classified 137 times as a negative movie, with 54 false negatives. 

It was similar for positive movies with 151 correctly classfied as positive with 58 false positives. this is not ideal but also it does not seem like the model favor negative or positive classifactions over the other, meaning it is a relativley "fair" classifier. 

The recall score was 70% for negative senitment and ~74% for positive sentiment. so of all postivley sentimented movie reviews, the classifer caught 74% percent of them, and for all negative sentiment, the classifer caught 70% of them.


The weights seem to make sense, I would have expected the positive and negative weights to be more important, but it could be that the words themselves individual may not be as important as the overall meaning, for example not and bad seperatley vs not bad, or not good. It is also interesting that lengthier reviews tend to be weighted more positivley by the model.

In [81]:
# rerunning the experiment with additional feauture: is a question mark present ? could be sarcasm or questioning a bad descision 
# novel feature x7: 
def questionMark(d):
    '''
    '''
    return d.count("?")    
    # check if question mark is present 
    for w in d:
        if w == "?":
            return 1
    return 0
    
    

In [82]:
# add the feature to the feature extractor
def document_features(document):
    '''
    Builds the set of features for each document.
    You don't need to modify this unless
    you want to add another feature.
    :param document: A list of words in the document.
    :return: A dictionary containing the features for that document.
    '''
    document_words = list(document) # do not turn into a set!!
    features = {}
    positive, negative = countPositiveAndNegativeWords(document_words)

    features['positiveCount']  = positive
    features['negativeCount'] = negative
    features['noInDoc'] = noInDocument(document_words)
    features['personalPronounCount'] = countFirstSecondPersonPronouns(document_words)
    features['exclamation'] = exclamationInDocument(document_words)
    features['logLength'] = logOfLength(document_words)
    features['Questionmark'] = questionMark(document_words)
    

    return features

In [83]:
# for each document, extract its features
featuresets = [(document_features(d), c) for (d,c) in documents]

# build the training and test sets
trainingSize = int(0.8*len(featuresets))
train_set, test_set = featuresets[0:trainingSize], featuresets[trainingSize:]

# use stochastic gradient descent with log loss function
classifier = LogisticRegression(max_iter=1000, verbose=0)
x_Train = [list(a.values()) for (a,b) in train_set]
y_Train = [b for (a,b) in train_set]
classifier.fit(x_Train, y_Train)

# print(classifier.coef_)  # if you want to see the coefficients, unsorted

x_Test = [list(a.values()) for (a,b) in test_set]
y_Test = [b for (a,b) in test_set]

print("LR Fit", classifier.score(x_Test, y_Test))

LR Fit 0.7175


In [84]:
#here is a block of code that sorts the features by absolute value
# and prints them out
featureNames = ['positiveCount', 'negativeCount', 'noInDoc', 'personalPronounCount',  'exclamation', 'logLength','Questionmark']
featuresPlusImportance = [ (featureNames[i], classifier.coef_[0][i]) for i in range(len(classifier.coef_[0]))]
featuresPlusImportance.sort(key = lambda x: abs(x[1]), reverse=True)
for x in range(len(featuresPlusImportance)):
    print(featuresPlusImportance[x])

('logLength', np.float64(0.6255510802576381))
('exclamation', np.float64(-0.40268747809956323))
('noInDoc', np.float64(-0.33105361857492815))
('positiveCount', np.float64(0.10220807934485596))
('negativeCount', np.float64(-0.08026400217535848))
('Questionmark', np.float64(-0.03929857442172685))
('personalPronounCount', np.float64(-0.02222245467601263))


In [85]:
correct_tags = [c for (w, c) in test_set]
test_tags = list(classifier.predict(x_Test))

# how about its precision and recall per category
mtrx = nltk.ConfusionMatrix(correct_tags, test_tags)
print()
print(mtrx)
print()
print(mtrx.evaluate())



    |   n   p |
    |   e   o |
    |   g   s |
----+---------+
neg |<137> 58 |
pos |  55<150>|
----+---------+
(row = reference; col = test)


Tag | Prec.  | Recall | F-measure
----+--------+--------+-----------
neg | 0.7135 | 0.7026 | 0.7080
pos | 0.7212 | 0.7317 | 0.7264



# New feature: question mark
new feature (x8) checks whether a question mark is in the document. it returns 1 if it does, 0 otherwise.
I did this to see if it would capture rhetorical questions and sarcasm, or a critical tone.


After rerunning the model training and testing i got a confusion matrix

    |   n   p |
    |   e   o |
    |   g   s |
----+---------+
neg |<137> 58 |
pos |  55<150>|
----+---------+
(row = reference; col = test)

Compared to the previous model the matrix did not change significantly but they did shift a bit. The feature was weighted with a small negative number: -0.039, this means it only had a minor influence on classification, and reviews with question marks had a small tendancy to be negative. Positive classifaction was reduced by 1 . Precision, recall and f1 score did not change significantly. This may because question marks only occur in rare instances and so the feature matrix for it is sparse. 