------------------------------
------------------------------
### MARKKLICK206
Mark Klick
4/17/2018
------------------------------
------------------------------

In [1]:
import json
#import urllib2
import sys
import time
#import cPickle
import urllib
import re
import random
import itertools
import collections
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.metrics import precision
from nltk.metrics import recall
from nltk.metrics import f_measure

### Follow this tutorial to set up twitter credentials for the Twitter API
http://www.nltk.org/howto/twitter.html

In [2]:
from nltk.twitter import Query, Streamer, Twitter, TweetViewer, TweetWriter, credsfromfile
from nltk.corpus import twitter_samples

In [None]:
oauth = credsfromfile()

In [None]:
nltk.download()

### This is where I'll begin the twitter 'sentiment' classification i.e. use twitter handles - starting with 1 handle vs another handle - we can group several handles together and call that a group then compare the two groups as well...

### Hopefully there will be some interesting/informative results when we try and classify sentences in our test set and correctly/incorrectly identify which twitter-handle/person those sentences came from...

### Let's start with a 70-30 split and see how well the nltk.NaiveBayesClassifier will perform when trying to distinguish/classify tweets from two different twitter handles...

### First we need to use our nltk twitter streamer to grab our twitter corpi for our two handles we are interested in (trump and obama in this first toy example)... and conveniently write those to 2 seperate json files... to make life easy when we convert them into tuples i.e. (sentences, twitter_handle)...

In [None]:
#https://tweeterid.com/
#trump-25073877, obama-813286

# create our streamer client
client = Streamer(**oauth)

### create our trump-corpus with 500 tweets 

In [None]:
trump_id =['25073877']

client.register(TweetWriter(limit=500))
client.statuses.filter(follow=trump_id)

### look at the raw tweet strings

In [3]:
t_strings = twitter_samples.strings('trump_tweets.json')
#for string in t_strings[:50]:
    #print(string)

### look at the tokenzied tweets

In [5]:
t_tokenized = twitter_samples.tokenized('trump_tweets.json')
#for toks in t_tokenized[:50]:
    #print(toks)

In [6]:
print len(t_strings)
print len(t_tokenized)

500
500


### create our obama-corpus with 500 tweets 

In [None]:
obama_id =['813286']

client.register(TweetWriter(limit=500))
client.statuses.filter(follow=obama_id)

In [4]:
o_strings = twitter_samples.strings('obama_tweets.json')
#for string in o_strings[:50]:
    #print(string)

In [8]:
o_tokenized = twitter_samples.tokenized('obama_tweets.json')
#for toks in o_tokenized[:50]:
    #print(toks)

In [5]:
len(o_strings)

236

### nltk twitter streamer is getting stuck when trying to grab 500 obama tweets - we'll just stick to 230 for now - the number of tweets is arbitrary once we get the pipeline up and running i.e. you can plug in any twitter handle and the amount of tweets you wish to grab for the corpus...

### start pre-processing our data into the format we like i.e. tuples (sentence,twitter-handle). 2 ways we can do this, I'll try both:

>use the nltk.twitter_samples

>use my pre-processing workflow

### create the string_class_tuple i.e. a tuple with (tweet, handle)

In [6]:
trump_list = [(''.join(t_tweets), 'trump') for t_tweets in t_strings]

obama_list = [(''.join(t_tweets), 'obama') for o_tweets in o_strings]

pres_corpus = trump_list[:230] + obama_list[:230]

random.shuffle(pres_corpus)

### define our prepocessing pipepline and make it into a function
> first we tokenize each tweet and then we stem each token

In [7]:
def pre_proc(string_class_tuple):
    #preprocess the tweets with our existing token/stem pipeline?
    #hold each processed sentence strings 
    train_set_proc = []
    #bag of stemmed words 
    all_words = []
    #iterate over each tweet string - tokenize sentences - tok. words - stem
    for i in string_class_tuple:
        doc = []
        for sentence in sent_tokenize(i[0]):
            words = [word.lower() for word in TreebankWordTokenizer().tokenize(sentence) if word.lower() not in stop_word_tuple and re.search("^[a-zA-Z]+$", word)]
            #words = ' '.join([SnowballStemmer("english").stem(word).lower() for word in words])
            wordstems = [SnowballStemmer("english").stem(word).lower() for word in words]
            doc.append(' '.join(wordstems))
            for word in wordstems:
                all_words.append(word)
        #print '-'
        train_set_proc.append(doc)
    #tuples dont support assignment so we have to re-create trainset with processed
    string_class_tuple = [ (train_set_proc[idx],i[1]) for idx,i in enumerate(string_class_tuple)]
    return all_words, string_class_tuple

In [8]:
#list of common stop words
stop_word_tuple = tuple(stopwords.words('english'))

### create our bag of words and pre-processed data set...

In [9]:
all_words, proc_train_set = pre_proc(pres_corpus)

### Implement our classify: defining features we will use to determine which class a tweet is from. We have several features that have been chosen for various reasons which can be seen here (link to readme)...

In [15]:
#define a feature extractor 
def document_features(document,word_features):
    document_words = set(' '.join(document).split())
    #document_words = set(document))
    #print document_words
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

#Use significant bigrams i.e. measure the chi-square collocation correlation 
#between words in the bigram. Here we use the top N bigrams as features for 
#our classifier.
def document_bigram_features(document, N):
    #Use significant bigrams i.e. measure the chi-square collocation correlation 
    #between words in the bigram. Here we use the top N bigrams as features for 
    #our classifier
    #followed this example http://streamhacker.com/2010/05/24/text-classification-sentiment-analysis-stopwords-collocations/
    def bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=N):
        bigram_finder = BigramCollocationFinder.from_words(words)
        bigrams = bigram_finder.nbest(score_fn, n)
        return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])
    document_words = set(' '.join(document).split())
    return bigram_word_feats(document_words)

#Try a different strategy seperating positive and negative labled docs wordlist
#using a scoring method for feature selection based on a chi-square test that 
#eliminates low-information word features by using information gain for each word
#followed this example http://streamhacker.com/2010/06/16/text-classification-sentiment-analysis-eliminate-low-information-features/
def eliminate_low_noninfo_fs(train, N):
    #creates lists of all positive and negative words
    pos_list = []
    neg_list = []
    for doc in train:
        if doc[1] == 'trump':
            for sent in doc[0]:
                pos_list.append(sent.split())
        elif doc[1] == 'obama':
            for sent in doc[0]:
                neg_list.append(sent.split())
    #create itertool chain objects with the word lists	
    pos_list = list(itertools.chain(*pos_list))
    neg_list = list(itertools.chain(*neg_list))
    #build frequency distibutions for pos, neg, and all words
    word_fd = nltk.probability.FreqDist()
    cond_word_fd = nltk.probability.ConditionalFreqDist()
    for word in pos_list:
        word_fd[word] += 1
        cond_word_fd['trump'][word] += 1
    for word in neg_list:
        word_fd[word] += 1
        cond_word_fd['obama'][word] += 1
    #counts for pos, neg, and all words
    pos_word_count = cond_word_fd['trump'].N()
    neg_word_count = cond_word_fd['obama'].N()
    total_word_count = pos_word_count + neg_word_count
    #populate score dictionary with pos, neg, and all words chi-square scores
    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['trump'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['obama'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    #pick the top most informative words based on the calculated scores
    #NOTE: vary this parameter to see how our classifier performs
    #EDIT: made the function take N as an input
    best_vals = sorted(word_scores.iteritems(), key=lambda (w, s): s, reverse=True)[:N]
    best_words = set([w for w, s in best_vals])
    return best_words
    
def evaluate_clssifier(classifier,eval_set):
    print('classifier accuracy: ' + str(nltk.classify.accuracy(classifier, eval_set)))
    classifier.show_most_informative_features(20)
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(eval_set):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)
    print 'pos precision:', precision(refsets['trump'], testsets['trump']) 
    print 'pos recall:', recall(refsets['trump'], testsets['trump'])
    print 'pos F-measure:', f_measure(refsets['trump'], testsets['trump'])
    print 'neg precision:', precision(refsets['obama'], testsets['obama'])
    print 'neg recall:', recall(refsets['obama'], testsets['obama'])
    print 'neg F-measure:', f_measure(refsets['obama'], testsets['obama'])

#Here we define our function to run and test the Naive Bayes classifier
#with different feature sets
def word_feature_model(train, all_words, N, BI=False):
    if BI==True:
        #split the data into train and dev
        train, dev = train[:367], train[368:459]
        train = [(document_bigram_features(d,N), c) for (d,c) in train]
        dev = [(document_bigram_features(d,N), c) for (d,c) in dev]
        #use nltks prepackaged Naive Bayes classifier
        classifier = nltk.NaiveBayesClassifier.train(train)
        #Finall, we evaluate the classifier's performance metrics!
        evaluate_clssifier(classifier,dev)
    else:
        #define our first feature: calculate N most frequent words in our corpus,
        #then check if N words are present in each review
        #followed the example in nltk book ch.6
        all_words = nltk.FreqDist(w for w in all_words)
        #create our feature list
        #NOTE: we are using whatver word features are input for all_words parameter
        word_features = list(all_words) 
        
        #split the data into train and dev
        train, dev = train[:367], train[368:459]
        
        #invoke the feature extractor on each processed movie reviews
        train = [(document_features(d,word_features), c) for (d,c) in train]
        dev = [(document_features(d,word_features), c) for (d,c) in dev]

        #use nltks prepackaged Naive Bayes classifier
        classifier = nltk.NaiveBayesClassifier.train(train)
        #Finall, we evaluate the classifier's performance metrics!
        evaluate_clssifier(classifier,dev)

### look at what words (stems/tokens) our feature extractor determines as the top 2000 most informatiove words in the `pres_corpus`

In [11]:
eliminate_low_noninfo_fs(proc_train_set,2000)

{u'abus',
 u'accept',
 u'accus',
 u'accusrd',
 u'action',
 u'actual',
 u'administr',
 u'admit',
 u'advis',
 u'afraid',
 u'ag',
 u'agenc',
 u'ago',
 u'aircraft',
 u'alien',
 u'aliv',
 u'alreadi',
 u'also',
 u'alway',
 u'amazon',
 u'amen',
 u'america',
 u'american',
 u'americano',
 u'amount',
 u'amp',
 u'angrili',
 u'anonym',
 u'anoth',
 u'answer',
 u'anyon',
 u'appoint',
 u'approv',
 u'asham',
 u'ask',
 u'ass',
 u'asshol',
 u'attack',
 u'attent',
 u'attorney',
 u'away',
 u'awhil',
 u'back',
 u'bad',
 u'ball',
 u'bankrupt',
 u'barrier',
 u'bbcworld',
 u'becom',
 u'bed',
 u'behind',
 u'believ',
 u'bet',
 u'big',
 u'biggest',
 u'bitch',
 u'bitter',
 u'black',
 u'bless',
 u'blind',
 u'book',
 u'border',
 u'boy',
 u'brain',
 u'break',
 u'bring',
 u'broken',
 u'build',
 u'bulli',
 u'bunni',
 u'bus',
 u'busi',
 u'buy',
 u'cabal',
 u'cabinet',
 u'call',
 u'came',
 u'canada',
 u'caravan',
 u'carri',
 u'chanc',
 u'chang',
 u'chief',
 u'china',
 u'chronicpain',
 u'civilian',
 u'classifi',
 u'clear

### use the word_feature_model wrapper to run our classification task on our pre-processed data using various methods including using the 2000 most informative words as predictive features!
> what kind of features are used by the classifier?

### Let's try some of our methods out!

In [17]:
#1: Naive method: use all the words as features to classify movie reviews
word_feature_model(proc_train_set,all_words,None)

#2: Method using the top N frequent occuring words to classify movie revs
word_feature_model(proc_train_set,all_words[:2000],None)

#3: Method using the top N 'most informative' words to classify each movie rev
word_feature_model(proc_train_set,eliminate_low_noninfo_fs(proc_train_set,2000),None)

#4: Method using the bigram collocations of the top N bigrams
word_feature_model(proc_train_set,None,200,True)

classifier accuracy: 1.0
Most Informative Features
           contains(get) = True            obama : trump  =    119.7 : 1.0
           contains(use) = True            obama : trump  =     71.8 : 1.0
           contains(see) = True            obama : trump  =     71.8 : 1.0
       contains(corrupt) = True            obama : trump  =     71.8 : 1.0
         contains(https) = True            obama : trump  =      2.8 : 1.0
            contains(rt) = False           obama : trump  =      1.7 : 1.0
         contains(comey) = False           obama : trump  =      1.2 : 1.0
         contains(crook) = False           obama : trump  =      1.2 : 1.0
           contains(lie) = False           obama : trump  =      1.2 : 1.0
       contains(hillari) = False           obama : trump  =      1.1 : 1.0
      contains(congress) = False           obama : trump  =      1.1 : 1.0
          contains(long) = False           obama : trump  =      1.1 : 1.0
             contains(g) = False           obama 