In [2]:
import nltk
import json
from nltk.corpus import wordnet as wn
import re

syns = wn.synsets("anger")
print(syns)

[Synset('anger.n.01'), Synset('anger.n.02'), Synset('wrath.n.02'), Synset('anger.v.01'), Synset('anger.v.02')]


In [6]:
from textblob import TextBlob

example = TextBlob("angry")
print(example.words)
print(example.sentences)
print(example.tags)
print(example.sentiment)
print(example.detect_language())

['angry']
[Sentence("angry")]
[('angry', 'JJ')]
Sentiment(polarity=-0.5, subjectivity=1.0)
en


In [8]:
lineNumber = 1
with open('tinyTwitter.json', "r") as f:
    # read metadata from first row
    # {"total_rows":3877777,"offset":805584,"rows":[
    firstRow = json.loads(f.readline().rstrip()[:-1] + '0}')
    numRows = firstRow['total_rows']
    
    twitLine = f.readline()
    while twitLine:  # while not end of file
        #lineNumber += 1
        # truncate the end of line (\n)
        twitLine = twitLine.rstrip()
        if twitLine[-1] == ',':
            # truncate the last character
            # print('"," detected')
            twitLine = twitLine[:-1]
        if twitLine[0] == ']':
            # ignore the last line ']}'
            break

        # print ("process {} is processing ...".format(rank))
        twit = json.loads(twitLine)
        twit_text = re.sub(r"http\S+","",twit['value']['properties']['text'])
        print (twit_text)
        blob = TextBlob(twit_text)
        print (blob.sentiment)
        break

For the Oscars, Lady Gaga trained with a vocal coach DAILY for 6 months  #melbourne 
Sentiment(polarity=0.0, subjectivity=0.0)


In [12]:
import json
with open('sa2.json') as sa2:
    data = json.load(sa2)
    code_dict = {}
    for feature in data['features']:
        print (feature)
        code_dict[feature['properties']['SA2_Code_2011']] = feature
        break

{'geometry': {'coordinates': [[[[143.817818144, -37.56100644], [143.8174912, -37.562841825], [143.817114528, -37.564763975], [143.81697561599998, -37.56545293350001], [143.81694867200002, -37.56565093900001], [143.81670928, -37.566769874500004], [143.81621244800002, -37.56861833900001], [143.81617311999997, -37.568888217], [143.815870752, -37.570438887], [143.810498528, -37.5698114965], [143.81040195200003, -37.5698978175], [143.801001568, -37.5687896675], [143.79427222399994, -37.56800291800001], [143.78298697600002, -37.5666879565], [143.78281488000002, -37.5666710475], [143.75557145599998, -37.56348046349999], [143.748011008, -37.562609002500004], [143.738198016, -37.5614719925], [143.71563801599999, -37.558854002], [143.707238272, -37.5578780345], [143.69367420799995, -37.5582220235], [143.692926464, -37.558273953], [143.69230656000002, -37.558404710999994], [143.68592915200003, -37.56013037250001], [143.68494601600003, -37.551535920000006], [143.68694112, -37.5407070375], [143.686

In [3]:
import nltk
positive_tweets = nltk.corpus.twitter_samples.tokenized("positive_tweets.json")
negative_tweets = nltk.corpus.twitter_samples.tokenized("negative_tweets.json")

from sklearn.model_selection import train_test_split

from nltk.corpus import stopwords
stop_word = list(stopwords.words())

# remove nonalphabetic by regular expression
def remove_nonalphabet(tweet_corpus):
    for tweet in tweet_corpus:
        for i in range(len(tweet)):
            token = re.sub('[^a-zA-Z]',"",tweet[i])
            tweet[i] = token

        while '' in tweet:     # remove tokens contain only non-alphabet words
            tweet.remove('')
    return tweet_corpus

# remove stopword 
def remove_stopword(tweet_corpus):
    new_corpus = []
    for tweet in tweet_corpus:
        token = []
        for word in tweet:
            if word.lower() not in stop_word:
                token.append(word)
        new_corpus.append(token)
    return new_corpus


positive_tweets = remove_nonalphabet(positive_tweets)      
negative_tweets = remove_nonalphabet(negative_tweets)


positive_tweets = remove_stopword(positive_tweets)
negative_tweets = remove_stopword(negative_tweets)

# randomly split train/test set for positive tweets and negative tweets
positive_train,positive_test,negative_train,negative_test = train_test_split(positive_tweets,negative_tweets,test_size = 0.1, train_size = 0.8)

# develop data = dataset - traindata - testdata
positive_set = positive_train + positive_test
positive_develop = [x for x in positive_tweets if x not in positive_set]

negative_set = negative_train + negative_test
negative_develop = [x for x in negative_tweets if x not in negative_set]

tweets_train = positive_train + negative_train
tweets_test = positive_test + negative_test
tweets_develop = positive_develop + negative_develop

In [11]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import GridSearchCV
from collections import Counter

# build frequency dictionary
train_dict = []
c = Counter()
for tweet in tweets_train:
    train_dict.append({i:tweet.count(i) for i in set(tweet)})

test_dict = []
c = Counter()
for tweet in tweets_test:
    test_dict.append({i:tweet.count(i) for i in set(tweet)})

develop_dict = []
c = Counter()
for tweet in tweets_develop:
    develop_dict.append({i:tweet.count(i) for i in set(tweet)})

# prepare data for classifier
vectorizer = DictVectorizer()

train_ = vectorizer.fit_transform(train_dict)
test_ = vectorizer.transform(test_dict)
develop_ = vectorizer.transform(develop_dict)


# get label for dataset
def get_target(positive_list,negative_list):
    result = []
    for i in range(len(positive_list)):
        result.append("positive")
    for i in range(len(negative_list)):
        result.append("negative")
    return result
      
train_target = get_target(positive_train,negative_train)
test_target = get_target(positive_test,negative_test)
develop_target = get_target(positive_develop,negative_develop)


# tuning parameters for NB
alphas = np.array([0.0001,0.001,0.01,0.1,1,10])      # alternative parameter for Naive Bayes alpha
clf_NB = MultinomialNB()

grid = GridSearchCV(estimator = clf_NB, param_grid = dict(alpha = alphas),scoring = 'accuracy') # tuning for best alpha
grid.fit(develop_,develop_target)

print("For Naive Bayes tuning parameter alpha:")
best_alpha = grid.best_estimator_.alpha
print("best alpha is %f"%(best_alpha))


# prove optimal alpha is found
for i in alphas:
    clf_NB = MultinomialNB(alpha = i)
    clf_NB.fit(train_,train_target)
    result = clf_NB.score(develop_, develop_target)
    print("alpha = %9.4f has accuracy %f"%(i,result))


    

#tuning parameters for LR:
parameter_c = np.array([0.0001,0.001,0.01,0.1,1,10])   # alternative parameter for Logistic Regression
Penalty = ['l1','l2']
clf_LR = LogisticRegression()
grid2 = GridSearchCV(estimator = clf_LR, param_grid = dict(C = parameter_c,penalty = Penalty),scoring = 'accuracy')
grid2.fit(develop_,develop_target)

print("\n")
print("For Logistic Regresstion tuning parameter C and Penalty:")
best_c = grid2.best_estimator_.C
best_pen = grid2.best_estimator_.penalty
print("best C is %f, best Penalty is %s"%(best_c,best_pen))

# prove optimal parameters are found
for i in parameter_c:
    for pen in Penalty:
        clf_LR = LogisticRegression(C= i,penalty = pen)
        clf_LR.fit(train_,train_target)
        result = clf_LR.score(develop_, develop_target)
        print("C = %9.4f and penalty = %s show accuracy %f"%(i,pen,result))

For Naive Bayes tuning parameter alpha:
best alpha is 1.000000
alpha =    0.0001 has accuracy 0.722966
alpha =    0.0010 has accuracy 0.723996
alpha =    0.0100 has accuracy 0.726056
alpha =    0.1000 has accuracy 0.733265
alpha =    1.0000 has accuracy 0.742533
alpha =   10.0000 has accuracy 0.737384


For Logistic Regresstion tuning parameter C and Penalty:
best C is 0.010000, best Penalty is l2
C =    0.0001 and penalty = l1 show accuracy 0.501545
C =    0.0001 and penalty = l2 show accuracy 0.714727
C =    0.0010 and penalty = l1 show accuracy 0.501545
C =    0.0010 and penalty = l2 show accuracy 0.703399
C =    0.0100 and penalty = l1 show accuracy 0.501545
C =    0.0100 and penalty = l2 show accuracy 0.707518
C =    0.1000 and penalty = l1 show accuracy 0.650875
C =    0.1000 and penalty = l2 show accuracy 0.721936
C =    1.0000 and penalty = l1 show accuracy 0.707518
C =    1.0000 and penalty = l2 show accuracy 0.730175
C =   10.0000 and penalty = l1 show accuracy 0.706488
C =  

In [10]:
from sklearn.metrics import accuracy_score,f1_score

clf_NB = MultinomialNB(alpha = best_alpha)
clf_NB.fit(train_,train_target)
NB_result = clf_NB.predict(test_)
print("Naive Bayes:\n")
print("accuracy       :  %f"%(accuracy_score(test_target,NB_result)))
print("macro f-score  :  %f\n"%(f1_score(test_target,NB_result,average = 'macro')))

clf_LR = LogisticRegression(C = best_c,penalty = best_pen)
clf_LR.fit(train_,train_target)
LR_result = clf_LR.predict(test_)

print("Logistic Regression:\n")
print("accuracy       :  %f"%(accuracy_score(test_target,LR_result)))
print("macro f-score  :  %f"%(f1_score(test_target,LR_result,average = 'macro')))

Naive Bayes:

accuracy       :  0.755000
macro f-score  :  0.754410

Logistic Regression:

accuracy       :  0.720000
macro f-score  :  0.715627
