# A Vector Space Model, with scikit-learn Naive Bayes

In [1]:
%matplotlib inline
import csv
import pandas
import sklearn
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import StratifiedKFold, cross_val_score 



In [5]:
# Read the data
reviews = [line.rstrip() for line in open("/Users/mam/CORE/RESEARCH/DEEPLEARNING/Doc2Vec/data/aclImdb/alldata_2column.txt")]
print(len(reviews))

50001


In [6]:
# The data have a header and we print it
print(reviews[0])
# print first data point.
# data format is each review as a line, csv
# clomun one is the sentiment tag --> 1=positive sentiment, 0=negative sentiment
# column 2 is the review
print(reviews[1])

"label","message"
"1","bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as "" teachers "" . my 35 years in the teaching profession lead me to believe that bromwell high's satire is much closer to reality than is "" teachers "" . the scramble to survive financially  the insightful students who can see right through their pathetic teachers' pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line : inspector : i'm here to sack one of your teachers . student : welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn't !"


In [8]:
# Let's actually read the file again with pandas
import csv
import pandas as pd
reviews = pd.read_csv("/Users/mam/CORE/RESEARCH/DEEPLEARNING/Doc2Vec/data/aclImdb/alldata_2column.txt",\
                      sep=',', quoting=csv.QUOTE_NONE,  names=["label", "message"])

# Let's print a preview with the "head" command
reviews.head(n=5)

Unnamed: 0,label,message
0,"""label""","""message"""
1,"""1""","""bromwell high is a cartoon comedy . it ran at..."
2,"""1""","""homelessness ( or houselessness as george car..."
3,"""1""","""brilliant over-acting by lesley ann warren . ..."
4,"""1""","""this is easily the most underrated film inn t..."


In [9]:
reviews_data=reviews["message"]
reviews_tags=reviews["label"]

pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer='word')),  # get counts of tokens
    ('tfidf', TfidfTransformer()),  # get tf-idf scores
    ('classifier', MultinomialNB()),  # train on tf-idf vectors  with the Naive Bayes classifier
])

# Do 10-fold cross validation
scores = cross_val_score(pipeline,  
                         reviews_data,  
                         reviews_tags,  
                         cv=10, 
                         scoring='accuracy',
                         n_jobs=-1, # use all machine cores
                         )
print(scores)



[ 0.82083583  0.8122      0.8234      0.8028      0.8204      0.834       0.8218
  0.8372      0.8034      0.8082    ]


In [11]:
avg= sum(scores/10.0)
print(avg)

0.818423583283


# Code fr 

In [8]:
import nltk
def gender_features(word):
    return {'last_letter': word[-1]}
gender_features('Alex')

{'last_letter': 'x'}

In [9]:
gender_features('Nicole')

{'last_letter': 'e'}

In [1]:
from nltk.corpus import names
labeled_names = ([(name, 'male') for name in names.words('male.txt')] +\
                 [(name, 'female') for name in names.words('female.txt')])
import random
random.shuffle(labeled_names)

In [20]:
featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [21]:
featuresets[0]

({'last_letter': u'i'}, 'female')

In [14]:
train_set[0:10]

[({'last_letter': u'i'}, 'female'),
 ({'last_letter': u'b'}, 'male'),
 ({'last_letter': u't'}, 'male'),
 ({'last_letter': u'e'}, 'female'),
 ({'last_letter': u'n'}, 'male'),
 ({'last_letter': u'y'}, 'female'),
 ({'last_letter': u'e'}, 'female'),
 ({'last_letter': u'a'}, 'female'),
 ({'last_letter': u'e'}, 'female'),
 ({'last_letter': u'a'}, 'female')]

In [10]:
classifier.classify(gender_features('Rebecca'))

'female'

In [11]:
classifier.classify(gender_features('Jordon'))

'male'

In [23]:
classifier.classify(gender_features('Vivienne'))

'female'

In [15]:
print(nltk.classify.accuracy(classifier, test_set))

0.746


In [16]:
classifier.show_most_informative_features(5)

Most Informative Features
             last_letter = u'a'           female : male   =     34.4 : 1.0
             last_letter = u'k'             male : female =     32.7 : 1.0
             last_letter = u'f'             male : female =     16.6 : 1.0
             last_letter = u'p'             male : female =     11.9 : 1.0
             last_letter = u'v'             male : female =     11.2 : 1.0


In [17]:
from nltk.classify import apply_features
train_set = apply_features(gender_features, labeled_names[500:])
test_set = apply_features(gender_features, labeled_names[:500])

In [18]:
type(train_set)

nltk.util.LazyMap

In [19]:
print(train_set[0])

({'last_letter': u'i'}, 'female')
