## Building a classifier

Here we will use UCI Machine Learning's Sentiment Dataset to make a new classifier using Naive Bayes

The dataset contains 1000+ examples of negative and positive reviews from yelp, imdb, and amazon.

In [4]:
from pandas import DataFrame as df

folder = 'sentiment labelled sentences/'
imdb_sent = df.from_csv(folder+'imdb_labelled.txt',sep='\t',index_col=False, encoding = 'utf-8')
imdb_sent.columns = ['sentence','label']

print imdb_sent.head()
print imdb_sent.tail()

                                            sentence  label
0  Not sure who was more lost - the flat characte...      0
1  Attempting artiness with black & white and cle...      0
2       Very little music or anything to speak of.        0
3  The best scene in the movie was when Gerardo i...      1
4  The rest of the movie lacks art, charm, meanin...      0
                                              sentence  label
742  I just got bored watching Jessice Lange take h...      0
743  Unfortunately, any virtue in this film's produ...      0
744                   In a word, it is embarrassing.        0
745                               Exceptionally bad!        0
746  All in all its an insult to one's intelligence...      0


In [5]:
from nltk.corpus import stopwords

filtered_sent = []
for sentence in imdb_sent['sentence']:
    sentence = str(sentence.encode('utf-8'))
    filt_sent = [word for word in sentence.split() if word not in stopwords.words('english')]
    filtered_sent.append(filt_sent)
    
imdb_sent['sentence'] = filtered_sent



In [6]:
from nltk import FreqDist

def get_words_in_reviews(sentences):
    all_words = []
    for index, words in imdb_sent.iterrows():
        all_words.extend(words['sentence'])
    return all_words

def get_word_features(wordlist):
    wordlist = FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features

word_features = get_word_features(get_words_in_reviews(imdb_sent['sentence']))

print word_features



In [11]:
import nltk

imdb_reviews= []
for index, review in imdb_sent.iterrows():
    imdb_reviews.append((review['sentence'],review['label']))

def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

training_set = nltk.classify.apply_features(extract_features, imdb_reviews)

print training_set



In [12]:
classifier = nltk.NaiveBayesClassifier.train(training_set)

print classifier.show_most_informative_features(10)

Most Informative Features
     contains(wonderful) = True                1 : 0      =      9.7 : 1.0
         contains(waste) = True                0 : 1      =      8.2 : 1.0
          contains(bad.) = True                0 : 1      =      7.5 : 1.0
          contains(show) = True                0 : 1      =      6.8 : 1.0
           contains(bad) = True                0 : 1      =      5.7 : 1.0
         contains(can't) = True                0 : 1      =      5.3 : 1.0
         contains(seen.) = True                0 : 1      =      5.3 : 1.0
      contains(actually) = True                1 : 0      =      5.3 : 1.0
         contains(liked) = True                1 : 0      =      5.3 : 1.0
         contains(would) = True                0 : 1      =      4.7 : 1.0
None


In [15]:
comments = [ 'steph curry is amazing',
             'trump is going to run this country down',
             'frank underwood is a smart snake',
              'pizza is bad' ]

for comment in comments:    
    print classifier.classify(extract_features(comment.split()))

1
1
1
0
