In [52]:
import random
import numpy as np
import nltk
import nltk.classify.util
from nltk.metrics import *
import collections
from nltk.classify import NaiveBayesClassifier, MaxentClassifier, SklearnClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

In [53]:
def read_file(uri):
    """Read a cp1252-encoded text file and return its lines as a list of str.

    The file is opened in binary mode; each line has leading and trailing
    ASCII whitespace (including the line terminator) stripped before it is
    decoded.

    Args:
        uri: Path of the file to read.

    Returns:
        A list with one decoded, stripped string per line of the file.
    """
    with open(uri, "rb") as handle:
        raw_lines = list(handle)
    return [raw.strip().decode('cp1252') for raw in raw_lines]

In [54]:
def train_test_split(neg_sents, pos_sents, seed, train_frac=0.75):
    """Shuffle each class with a fixed seed and split it into train/test parts.

    The inputs are left untouched: shuffling is done on copies, so callers can
    keep using the original lists in their original order.

    Args:
        neg_sents: Sequence of negative examples.
        pos_sents: Sequence of positive examples.
        seed: Value passed to ``random.seed`` so the split is reproducible.
        train_frac: Fraction of each class placed in the training part
            (default 0.75). Must lie in [0, 1].

    Returns:
        A dict with the keys ``'negtrain'``, ``'negtest'``, ``'postrain'`` and
        ``'postest'``, each mapping to a list.

    Raises:
        ValueError: If ``train_frac`` is outside [0, 1].
    """
    if not 0 <= train_frac <= 1:
        raise ValueError("train_frac must be between 0 and 1, got %r" % (train_frac,))

    # Seed the module-level generator, then shuffle negatives before positives;
    # this order determines which permutation each class receives.
    random.seed(seed)
    neg_shuffled = list(neg_sents)
    random.shuffle(neg_shuffled)
    negcutoff = int(len(neg_shuffled) * train_frac)
    pos_shuffled = list(pos_sents)
    random.shuffle(pos_shuffled)
    poscutoff = int(len(pos_shuffled) * train_frac)

    return {
        'negtrain': neg_shuffled[:negcutoff],
        'negtest': neg_shuffled[negcutoff:],
        'postrain': pos_shuffled[:poscutoff],
        'postest': pos_shuffled[poscutoff:],
    }

In [55]:
# Lazily filled cache so the stop-word set and stemmer are built only once,
# instead of once per sentence.
_PRE_PROCESS_RESOURCES = {}


def _get_pre_process_resources():
    """Return the (stop-word set, stemmer) pair, building it on first use."""
    if not _PRE_PROCESS_RESOURCES:
        stop_words = set(stopwords.words('english'))
        stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])
        _PRE_PROCESS_RESOURCES['stop_words'] = frozenset(stop_words)
        _PRE_PROCESS_RESOURCES['stemmer'] = SnowballStemmer('english')
    return _PRE_PROCESS_RESOURCES['stop_words'], _PRE_PROCESS_RESOURCES['stemmer']


def pre_process(sent):
    """Drop stop words and listed punctuation tokens, then stem what remains.

    Args:
        sent: Iterable of token strings (the callers in this notebook pass the
            output of ``word_tokenize``).

    Returns:
        A list of stemmed tokens, in their original order.
    """
    stop_words, stemmer = _get_pre_process_resources()
    # Tokens are compared exactly as given (no case folding).
    # NOTE(review): if the NLTK stop-word list is all lowercase, capitalised
    # tokens such as "The" are not removed here - confirm that is intended.
    return [stemmer.stem(word) for word in sent if word not in stop_words]

In [56]:
def feature_unigram_bow(sent, pre_flag = False):
    """Build a unigram presence-feature dict for one sentence.

    Args:
        sent: Raw sentence string; it is tokenized with ``word_tokenize``.
        pre_flag: When equal to ``True``, tokens are passed through
            ``pre_process`` before the features are built.

    Returns:
        A dict mapping every distinct token to ``True``.
    """
    words = word_tokenize(sent)
    if pre_flag == True:
        words = pre_process(words)
    return {word: True for word in words}

In [57]:
def feature_bigram_bow(sent, pre_flag = False):
    """Build a bigram presence-feature dict for one sentence.

    Args:
        sent: Raw sentence string; it is tokenized with ``word_tokenize``.
        pre_flag: When equal to ``True``, tokens are passed through
            ``pre_process`` before the bigrams are formed.

    Returns:
        A dict mapping every distinct pair of adjacent tokens to ``True``.
    """
    words = word_tokenize(sent)
    if pre_flag == True:
        words = pre_process(words)
    return {pair: True for pair in ngrams(words, 2)}

In [58]:
def feature_trigram_bow(sent, pre_flag = False):
    """Build a trigram presence-feature dict for one sentence.

    Args:
        sent: Raw sentence string; it is tokenized with ``word_tokenize``.
        pre_flag: When equal to ``True``, tokens are passed through
            ``pre_process`` before the trigrams are formed.

    Returns:
        A dict mapping every distinct run of three adjacent tokens to ``True``.
    """
    words = word_tokenize(sent)
    if pre_flag == True:
        words = pre_process(words)
    return {triple: True for triple in ngrams(words, 3)}

In [59]:
def feature_uni_bigram_bow(sent, pre_flag = False):
    """Combine unigram and bigram presence features for one sentence.

    Args:
        sent: Raw sentence string.
        pre_flag: Forwarded unchanged to the unigram and bigram extractors.

    Returns:
        A single dict holding the unigram features followed by the bigram
        features.
    """
    features = feature_unigram_bow(sent, pre_flag)
    for key, value in feature_bigram_bow(sent, pre_flag).items():
        features[key] = value
    return features

In [60]:
def feature_uni_bi_trigram_bow(sent, pre_flag = False):
    """Combine unigram, bigram and trigram presence features for one sentence.

    Args:
        sent: Raw sentence string.
        pre_flag: Forwarded unchanged to each of the three extractors.

    Returns:
        A single dict holding the unigram, then bigram, then trigram features.
    """
    features = feature_unigram_bow(sent, pre_flag)
    higher_order = (
        feature_bigram_bow(sent, pre_flag),
        feature_trigram_bow(sent, pre_flag),
    )
    for extra in higher_order:
        features.update(extra)
    return features

In [61]:
def feature_tfidf(sents, min_df, max_df):
    """Fit a TF-IDF vectorizer on ``sents`` and return the transformed matrix.

    The vectorizer uses sublinear term frequency, IDF weighting and the
    built-in English stop-word list. Only the matrix is returned; the fitted
    vectorizer itself is discarded.

    Args:
        sents: Iterable of raw document strings.
        min_df: Passed to ``TfidfVectorizer`` as ``min_df``.
        max_df: Passed to ``TfidfVectorizer`` as ``max_df``.

    Returns:
        The result of ``fit_transform`` on ``sents``.
    """
    options = {
        'min_df': min_df,
        'max_df': max_df,
        'sublinear_tf': True,
        'use_idf': True,
        'stop_words': 'english',
    }
    return TfidfVectorizer(**options).fit_transform(sents)

In [62]:
def NB_CV_classifier(corpus, labels , fold, feature, pre_flag = False):
    """Cross-validate an NLTK Naive Bayes classifier and print averaged metrics.

    For every stratified fold the classifier is trained on the training part
    and scored on the held-out part. Precision, recall and F-score treat
    ``'pos'`` as the positive class and ``'neg'`` as the negative class.

    Args:
        corpus: Indexable collection of raw sentences.
        labels: Indexable collection of labels (``'neg'`` / ``'pos'``), aligned
            with ``corpus``.
        fold: Number of stratified folds.
        feature: Callable ``feature(sent, pre_flag)`` returning a feature dict.
        pre_flag: Forwarded to ``feature``.

    Returns:
        None. Prints ``[accuracy, precision, recall, fscore]``, each averaged
        over the folds. A metric whose denominator is zero in a fold counts as
        0.0 for that fold instead of raising ``ZeroDivisionError``.
    """
    kf = StratifiedKFold(n_splits=fold)
    fold_accuracy = []
    fold_precision = []
    fold_recall = []
    fold_fscore = []
    for train_index, test_index in kf.split(corpus,labels):
        X_train = [corpus[i] for i in train_index]
        X_test = [corpus[i] for i in test_index]
        y_train = [labels[i] for i in train_index]
        y_test = [labels[i] for i in test_index]
        train_feats = [feature(sent,pre_flag) for sent in X_train]
        test_feats = [feature(sent,pre_flag) for sent in X_test]
        classifier = NaiveBayesClassifier.train(list(zip(train_feats, y_train)))
        pred = classifier.classify_many(test_feats)

        # Count (actual, predicted) pairs once; missing pairs read as 0.
        outcomes = collections.Counter(zip(y_test, pred))
        TP = outcomes[('pos', 'pos')]
        FP = outcomes[('neg', 'pos')]  # actual neg, predicted pos
        FN = outcomes[('pos', 'neg')]  # actual pos, predicted neg

        # Accuracy comes from the predictions already made, so the test fold
        # is not classified a second time.
        correct = sum(count for (actual, guess), count in outcomes.items() if actual == guess)
        fold_accuracy.append(correct / len(y_test))

        fold_p = TP / (TP + FP) if (TP + FP) else 0.0
        fold_r = TP / (TP + FN) if (TP + FN) else 0.0
        fold_precision.append(fold_p)
        fold_recall.append(fold_r)
        fold_fscore.append(2 * fold_r * fold_p / (fold_p + fold_r) if (fold_p + fold_r) else 0.0)

    accuracy = sum(fold_accuracy)/len(fold_accuracy)
    precision = sum(fold_precision)/len(fold_precision)
    recall = sum(fold_recall)/len(fold_recall)
    fscore = sum(fold_fscore)/len(fold_fscore)
    print([accuracy,precision,recall,fscore])

In [63]:
def SVM_CV_classifier(corpus, labels , fold, feature, pre_flag = False):
    """Cross-validate a linear SVM (via NLTK's sklearn wrapper) and print averaged metrics.

    For every stratified fold a ``LinearSVC`` wrapped in ``SklearnClassifier``
    is trained on the training part and scored on the held-out part.
    Precision, recall and F-score treat ``'pos'`` as the positive class and
    ``'neg'`` as the negative class.

    Args:
        corpus: Indexable collection of raw sentences.
        labels: Indexable collection of labels (``'neg'`` / ``'pos'``), aligned
            with ``corpus``.
        fold: Number of stratified folds.
        feature: Callable ``feature(sent, pre_flag)`` returning a feature dict.
        pre_flag: Forwarded to ``feature``.

    Returns:
        None. Prints ``[accuracy, precision, recall, fscore]``, each averaged
        over the folds. A metric whose denominator is zero in a fold counts as
        0.0 for that fold instead of raising ``ZeroDivisionError``.
    """
    kf = StratifiedKFold(n_splits=fold)
    fold_accuracy = []
    fold_precision = []
    fold_recall = []
    fold_fscore = []
    for train_index, test_index in kf.split(corpus,labels):
        X_train = [corpus[i] for i in train_index]
        X_test = [corpus[i] for i in test_index]
        y_train = [labels[i] for i in train_index]
        y_test = [labels[i] for i in test_index]
        train_feats = [feature(sent,pre_flag) for sent in X_train]
        test_feats = [feature(sent,pre_flag) for sent in X_test]
        # NOTE(review): sparse=False is kept from the original; it presumably
        # makes the wrapper build a dense feature matrix, which may be
        # memory-hungry for large vocabularies - confirm it is intended.
        classifier = SklearnClassifier(LinearSVC(), sparse=False)
        classifier.train(list(zip(train_feats, y_train)))
        pred = classifier.classify_many(test_feats)

        # Count (actual, predicted) pairs once; missing pairs read as 0.
        outcomes = collections.Counter(zip(y_test, pred))
        TP = outcomes[('pos', 'pos')]
        FP = outcomes[('neg', 'pos')]  # actual neg, predicted pos
        FN = outcomes[('pos', 'neg')]  # actual pos, predicted neg

        # Accuracy comes from the predictions already made, so the test fold
        # is not classified a second time.
        correct = sum(count for (actual, guess), count in outcomes.items() if actual == guess)
        fold_accuracy.append(correct / len(y_test))

        fold_p = TP / (TP + FP) if (TP + FP) else 0.0
        fold_r = TP / (TP + FN) if (TP + FN) else 0.0
        fold_precision.append(fold_p)
        fold_recall.append(fold_r)
        fold_fscore.append(2 * fold_r * fold_p / (fold_p + fold_r) if (fold_p + fold_r) else 0.0)

    accuracy = sum(fold_accuracy)/len(fold_accuracy)
    precision = sum(fold_precision)/len(fold_precision)
    recall = sum(fold_recall)/len(fold_recall)
    fscore = sum(fold_fscore)/len(fold_fscore)
    print([accuracy,precision,recall,fscore])

In [64]:
# Load the negative and positive sentence files, one stripped string per line.
# NOTE(review): these are absolute, user-specific paths; they must be edited
# before the notebook can run on another machine.
neg_file = read_file('/home/lingwei_shu/sentiment_analysis/rt-polarity.neg')
pos_file = read_file('/home/lingwei_shu/sentiment_analysis/rt-polarity.pos')
# Fixed seed so the shuffle inside train_test_split is reproducible.
seed = 1234
train_test = train_test_split(neg_file,pos_file,seed )
negtrain = train_test['negtrain']
postrain = train_test['postrain']
# Training corpus: all negative training sentences followed by all positive ones.
corpus = negtrain + postrain
# Labels aligned with corpus: 'neg' for the first block, 'pos' for the second.
labels = np.array(['neg'] * len(negtrain) + ['pos'] * len(postrain))

In [65]:
# 10-fold CV, Naive Bayes, unigram features, pre-processing on.
# Prints [accuracy, precision, recall, fscore].
NB_CV_classifier(corpus, labels , 10, feature_unigram_bow, True)

[0.7618869047619048, 0.7468986556484418, 0.7926522556390978, 0.7689807718992656]


In [66]:
# 10-fold CV, Naive Bayes, unigram features, pre-processing off.
# Prints [accuracy, precision, recall, fscore].
NB_CV_classifier(corpus, labels , 10, feature_unigram_bow, False)

[0.7687575187969925, 0.7600870825673958, 0.7868959899749373, 0.7729349159270598]


In [67]:
# 10-fold CV, Naive Bayes, bigram features, pre-processing on.
# Prints [accuracy, precision, recall, fscore].
NB_CV_classifier(corpus, labels , 10, feature_bigram_bow, True)

[0.6265642230576441, 0.6002244843593164, 0.7586309523809524, 0.6701561333021071]


In [68]:
# 10-fold CV, Naive Bayes, bigram features, pre-processing off.
# Prints [accuracy, precision, recall, fscore].
NB_CV_classifier(corpus, labels , 10, feature_bigram_bow, False)

[0.7253671679197995, 0.7174089800270735, 0.7436290726817042, 0.7302207685854503]


In [69]:
# 10-fold CV, Naive Bayes, trigram features, pre-processing on.
# Prints [accuracy, precision, recall, fscore].
NB_CV_classifier(corpus, labels , 10, feature_trigram_bow,True)

[0.5238881578947369, 0.5127577532822329, 0.9607337092731829, 0.6686346826476888]


In [70]:
# 10-fold CV, Naive Bayes, trigram features, pre-processing off.
# Prints [accuracy, precision, recall, fscore].
NB_CV_classifier(corpus, labels , 10, feature_trigram_bow,False)

[0.6365682957393484, 0.6147625971225285, 0.7321253132832081, 0.6682365562486183]


In [71]:
# 10-fold CV, Naive Bayes, unigram + bigram features, pre-processing on.
# Prints [accuracy, precision, recall, fscore].
NB_CV_classifier(corpus, labels , 10, feature_uni_bigram_bow, True)

[0.7656340852130324, 0.7539507224087441, 0.7886466165413533, 0.77083119569254]


In [72]:
# 10-fold CV, Naive Bayes, unigram + bigram features, pre-processing off.
# Prints [accuracy, precision, recall, fscore].
NB_CV_classifier(corpus, labels , 10, feature_uni_bigram_bow, False)

[0.777264097744361, 0.768550873004539, 0.7938984962406015, 0.7808512677099858]


In [73]:
# 10-fold CV, Naive Bayes, unigram + bigram + trigram features, pre-processing on.
# Prints [accuracy, precision, recall, fscore].
NB_CV_classifier(corpus, labels , 10, feature_uni_bi_trigram_bow,True)

[0.7641343984962405, 0.7526613084052188, 0.7868953634085212, 0.769333406649152]


In [74]:
# 10-fold CV, Naive Bayes, unigram + bigram + trigram features, pre-processing off.
# Prints [accuracy, precision, recall, fscore].
NB_CV_classifier(corpus, labels , 10, feature_uni_bi_trigram_bow,False)

[0.773639097744361, 0.7663868512570801, 0.7878991228070176, 0.7768045648838927]


In [75]:
# Grid search over TfidfVectorizer min_df / max_df with 10-fold stratified CV.
# For every (min_df, max_df) pair a LinearSVC and a MultinomialNB are trained
# per fold; the confusion matrices are summed over the folds and the overall
# accuracy is stored in accurMatSVM / accurMatNB (rows follow min_array,
# columns follow max_array).
kf = StratifiedKFold(n_splits=10)
# NOTE(review): the grid starts at min_df=0; some scikit-learn releases may
# reject an integer min_df below 1 - verify against the installed version.
min_array = list(range(10))
max_array = np.arange(start = 0.4, stop = 1.05, step = 0.05)
accurMatNB = np.zeros((len(min_array),len(max_array)))
accurMatSVM = np.zeros((len(min_array),len(max_array)))
# Every sample lands in exactly one test fold, so the accuracy denominator is
# the corpus size (previously hard-coded as 7996).
n_samples = len(corpus)
for i, min_df in enumerate(min_array):
    for j, max_df in enumerate(max_array):
        print(min_df, max_df)
        totalsvm = 0
        totalNB = 0
        totalMatSvm = np.zeros((2,2))
        totalMatNB = np.zeros((2,2))
        for train_index, test_index in kf.split(corpus,labels):
            # Use a distinct index name so the outer grid index i is not shadowed.
            X_train = [corpus[k] for k in train_index]
            X_test = [corpus[k] for k in test_index]
            y_train = [labels[k] for k in train_index]
            y_test = np.asarray([labels[k] for k in test_index])
            # Fit the vocabulary and IDF weights on the training fold only.
            vectorizer = TfidfVectorizer(min_df = min_df,max_df = max_df,sublinear_tf=True, use_idf=True,stop_words='english')
            train_corpus_tf_idf = vectorizer.fit_transform(X_train)
            test_corpus_tf_idf = vectorizer.transform(X_test)

            model1 = LinearSVC()
            model2 = MultinomialNB()
            model1.fit(train_corpus_tf_idf,y_train)
            model2.fit(train_corpus_tf_idf,y_train)
            result1 = model1.predict(test_corpus_tf_idf)
            result2 = model2.predict(test_corpus_tf_idf)

            totalMatSvm = totalMatSvm + confusion_matrix(y_test, result1)
            totalMatNB = totalMatNB + confusion_matrix(y_test, result2)
            # Explicit array comparison instead of relying on list == ndarray.
            totalsvm = totalsvm + np.sum(y_test == result1)
            totalNB = totalNB + np.sum(y_test == result2)

        print(totalMatSvm)
        accurMatSVM[i,j] = totalsvm/n_samples
        print(totalMatNB)
        accurMatNB[i,j] = totalNB/n_samples

0 0.4
[[ 2944.  1054.]
 [ 1013.  2985.]]
[[ 2980.  1018.]
 [  879.  3119.]]
0 0.45
[[ 2944.  1054.]
 [ 1013.  2985.]]
[[ 2980.  1018.]
 [  879.  3119.]]
0 0.5
[[ 2944.  1054.]
 [ 1013.  2985.]]
[[ 2980.  1018.]
 [  879.  3119.]]
0 0.55
[[ 2944.  1054.]
 [ 1013.  2985.]]
[[ 2980.  1018.]
 [  879.  3119.]]
0 0.6
[[ 2944.  1054.]
 [ 1013.  2985.]]
[[ 2980.  1018.]
 [  879.  3119.]]
0 0.65
[[ 2944.  1054.]
 [ 1013.  2985.]]
[[ 2980.  1018.]
 [  879.  3119.]]
0 0.7
[[ 2944.  1054.]
 [ 1013.  2985.]]
[[ 2980.  1018.]
 [  879.  3119.]]
0 0.75
[[ 2944.  1054.]
 [ 1013.  2985.]]
[[ 2980.  1018.]
 [  879.  3119.]]
0 0.8
[[ 2944.  1054.]
 [ 1013.  2985.]]
[[ 2980.  1018.]
 [  879.  3119.]]
0 0.85
[[ 2944.  1054.]
 [ 1013.  2985.]]
[[ 2980.  1018.]
 [  879.  3119.]]
0 0.9
[[ 2944.  1054.]
 [ 1013.  2985.]]
[[ 2980.  1018.]
 [  879.  3119.]]
0 0.95
[[ 2944.  1054.]
 [ 1013.  2985.]]
[[ 2980.  1018.]
 [  879.  3119.]]
0 1.0
[[ 2944.  1054.]
 [ 1013.  2985.]]
[[ 2980.  1018.]
 [  879.  3119.]]
1 0.4


[[ 2815.  1183.]
 [ 1153.  2845.]]
[[ 2892.  1106.]
 [ 1073.  2925.]]
8 0.65
[[ 2815.  1183.]
 [ 1153.  2845.]]
[[ 2892.  1106.]
 [ 1073.  2925.]]
8 0.7
[[ 2815.  1183.]
 [ 1153.  2845.]]
[[ 2892.  1106.]
 [ 1073.  2925.]]
8 0.75
[[ 2815.  1183.]
 [ 1153.  2845.]]
[[ 2892.  1106.]
 [ 1073.  2925.]]
8 0.8
[[ 2815.  1183.]
 [ 1153.  2845.]]
[[ 2892.  1106.]
 [ 1073.  2925.]]
8 0.85
[[ 2815.  1183.]
 [ 1153.  2845.]]
[[ 2892.  1106.]
 [ 1073.  2925.]]
8 0.9
[[ 2815.  1183.]
 [ 1153.  2845.]]
[[ 2892.  1106.]
 [ 1073.  2925.]]
8 0.95
[[ 2815.  1183.]
 [ 1153.  2845.]]
[[ 2892.  1106.]
 [ 1073.  2925.]]
8 1.0
[[ 2815.  1183.]
 [ 1153.  2845.]]
[[ 2892.  1106.]
 [ 1073.  2925.]]
9 0.4
[[ 2797.  1201.]
 [ 1180.  2818.]]
[[ 2896.  1102.]
 [ 1116.  2882.]]
9 0.45
[[ 2797.  1201.]
 [ 1180.  2818.]]
[[ 2896.  1102.]
 [ 1116.  2882.]]
9 0.5
[[ 2797.  1201.]
 [ 1180.  2818.]]
[[ 2896.  1102.]
 [ 1116.  2882.]]
9 0.55
[[ 2797.  1201.]
 [ 1180.  2818.]]
[[ 2896.  1102.]
 [ 1116.  2882.]]
9 0.6
[[ 279

In [76]:
# Display the MultinomialNB accuracy grid: one row per min_array entry,
# one column per max_array entry.
accurMatNB

array([[ 0.76275638,  0.76275638,  0.76275638,  0.76275638,  0.76275638,
         0.76275638,  0.76275638,  0.76275638,  0.76275638,  0.76275638,
         0.76275638,  0.76275638,  0.76275638],
       [ 0.76275638,  0.76275638,  0.76275638,  0.76275638,  0.76275638,
         0.76275638,  0.76275638,  0.76275638,  0.76275638,  0.76275638,
         0.76275638,  0.76275638,  0.76275638],
       [ 0.76250625,  0.76250625,  0.76250625,  0.76250625,  0.76250625,
         0.76250625,  0.76250625,  0.76250625,  0.76250625,  0.76250625,
         0.76250625,  0.76250625,  0.76250625],
       [ 0.758004  ,  0.758004  ,  0.758004  ,  0.758004  ,  0.758004  ,
         0.758004  ,  0.758004  ,  0.758004  ,  0.758004  ,  0.758004  ,
         0.758004  ,  0.758004  ,  0.758004  ],
       [ 0.75062531,  0.75062531,  0.75062531,  0.75062531,  0.75062531,
         0.75062531,  0.75062531,  0.75062531,  0.75062531,  0.75062531,
         0.75062531,  0.75062531,  0.75062531],
       [ 0.74424712,  0.744247

In [77]:
# Display the LinearSVC accuracy grid: one row per min_array entry,
# one column per max_array entry.
accurMatSVM

array([[ 0.74149575,  0.74149575,  0.74149575,  0.74149575,  0.74149575,
         0.74149575,  0.74149575,  0.74149575,  0.74149575,  0.74149575,
         0.74149575,  0.74149575,  0.74149575],
       [ 0.74149575,  0.74149575,  0.74149575,  0.74149575,  0.74149575,
         0.74149575,  0.74149575,  0.74149575,  0.74149575,  0.74149575,
         0.74149575,  0.74149575,  0.74149575],
       [ 0.73174087,  0.73174087,  0.73174087,  0.73174087,  0.73174087,
         0.73174087,  0.73174087,  0.73174087,  0.73174087,  0.73174087,
         0.73174087,  0.73174087,  0.73174087],
       [ 0.72723862,  0.72723862,  0.72723862,  0.72723862,  0.72723862,
         0.72723862,  0.72723862,  0.72723862,  0.72723862,  0.72723862,
         0.72723862,  0.72723862,  0.72723862],
       [ 0.72386193,  0.72386193,  0.72386193,  0.72386193,  0.72386193,
         0.72386193,  0.72386193,  0.72386193,  0.72386193,  0.72386193,
         0.72386193,  0.72386193,  0.72386193],
       [ 0.71985993,  0.719859

In [79]:
# Test set: the 25% of each class held out by train_test_split.
# NOTE(review): this cell rebinds `corpus` and `labels` to the test split, so
# re-running earlier cells afterwards would use the test data instead of the
# training data.
negtest = train_test['negtest']
postest = train_test['postest']
corpus = negtest + postest
labels = np.array(['neg'] * len(negtest) + ['pos'] * len(postest))
# NOTE(review): this runs 10-fold cross-validation *within* the test split
# (training and scoring on parts of it); no model fitted on the training split
# is evaluated here - confirm this is the intended hold-out evaluation.
NB_CV_classifier(corpus, labels , 10, feature_uni_bigram_bow, False)

[0.7212377959824934, 0.7432364007529033, 0.6781506003815508, 0.7087010915547691]
