In [1]:
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [2]:
# Load the Yelp reviews and summarise the numeric columns.
yelp_csv_path = "../mltext2/data/yelp.csv"
yelp_df = pd.read_csv(yelp_csv_path)
yelp_df.describe()

Unnamed: 0,stars,cool,useful,funny
count,10000.0,10000.0,10000.0,10000.0
mean,3.7775,0.8768,1.4093,0.7013
std,1.214636,2.067861,2.336647,1.907942
min,1.0,0.0,0.0,0.0
25%,3.0,0.0,0.0,0.0
50%,4.0,0.0,1.0,0.0
75%,5.0,1.0,2.0,1.0
max,5.0,77.0,76.0,57.0


In [3]:
# Keep only the extreme ratings (1-star and 5-star) to get a two-class problem.
yelp_df = yelp_df[yelp_df["stars"].isin([1, 5])]

In [5]:
# Features are the raw review text; the target is the star rating.
X, y = yelp_df["text"], yelp_df["stars"]

In [6]:
# Split text and labels into train and test portions. No test_size is given,
# so the library's default split is used; random_state=1 pins the shuffle so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [7]:
# Bag-of-words vectorizer with all-default settings.
vect = CountVectorizer()

In [8]:
# Learn the vocabulary from the training text only, so no test-set tokens
# leak into the feature space.
vect.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [9]:
# Show the learned vocabulary.
# NOTE(review): this displays every token in the vocabulary; consider slicing
# (e.g. the first 50 entries) to keep the notebook output small.
vect.get_feature_names()

['00',
 '000',
 '00a',
 '00am',
 '00pm',
 '01',
 '02',
 '03',
 '03342',
 '04',
 '05',
 '06',
 '07',
 '09',
 '0buxoc0crqjpvkezo3bqog',
 '0l',
 '10',
 '100',
 '1000',
 '1000x',
 '1001',
 '100th',
 '101',
 '102',
 '105',
 '1070',
 '108',
 '10am',
 '10ish',
 '10min',
 '10mins',
 '10minutes',
 '10pm',
 '10th',
 '10x',
 '11',
 '110',
 '1100',
 '111',
 '111th',
 '112',
 '115th',
 '118',
 '11a',
 '11am',
 '11p',
 '11pm',
 '12',
 '120',
 '128i',
 '129',
 '12am',
 '12oz',
 '12pm',
 '12th',
 '13',
 '14',
 '140',
 '147',
 '14lbs',
 '15',
 '150',
 '1500',
 '150mm',
 '15am',
 '15mins',
 '15pm',
 '15th',
 '16',
 '160',
 '165',
 '169',
 '16th',
 '17',
 '17p',
 '18',
 '180',
 '18th',
 '19',
 '1900',
 '1913',
 '1928',
 '1929',
 '1930s',
 '1940',
 '1952',
 '1955',
 '1956',
 '1960',
 '1961',
 '1969',
 '1970',
 '1980',
 '1980s',
 '1987',
 '1990s',
 '1992',
 '1995',
 '1996',
 '1998',
 '1999',
 '19th',
 '1cent',
 '1k',
 '1p',
 '1pm',
 '1st',
 '20',
 '200',
 '2002',
 '2003',
 '2004',
 '2005',
 '2006',
 '2007'

In [10]:
# Turn both text splits into document-term matrices using the vocabulary
# already fitted on the training data (train first, then test).
X_train_dtm, X_test_dtm = (vect.transform(docs) for docs in (X_train, X_test))

In [11]:
# Inspect the training document-term matrix (shape and sparsity summary).
X_train_dtm

<3064x16825 sparse matrix of type '<class 'numpy.int64'>'
	with 237720 stored elements in Compressed Sparse Row format>

In [12]:
# Dense, column-labelled copy of the training document-term matrix.
# NOTE(review): .toarray() materialises the full matrix; at 3064 x 16825
# int64 values that is roughly 412 MB. The model below is trained on the
# sparse X_train_dtm, so this frame looks like it is for inspection only --
# confirm it is needed before keeping it.
X_train_df = pd.DataFrame(X_train_dtm.toarray(), columns=vect.get_feature_names())

In [13]:
# Multinomial Naive Bayes classifier with default hyperparameters.
nb = MultinomialNB()

In [14]:
# Train on the sparse training document-term matrix and its star ratings.
nb.fit(X_train_dtm, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [15]:
# Predicted star rating for each held-out test review.
y_pred_class = nb.predict(X_test_dtm)

In [16]:
# Classification accuracy on the test set: the fraction of test reviews whose
# predicted rating equals the true rating. (Despite the name, y_score holds
# this single accuracy value, not per-sample scores.)
y_score = metrics.accuracy_score(y_test, y_pred_class)
y_score

0.91878669275929548

In [17]:
# Confusion matrix for the test set: rows are the true ratings, columns the
# predicted ratings, both in sorted label order (1-star, then 5-star).
# Compute it once and print the stored result rather than recomputing it.
confusion_matrix = metrics.confusion_matrix(y_test, y_pred_class)
print(confusion_matrix)

[[126  58]
 [ 25 813]]


In [18]:
# Null accuracy: the accuracy obtained by always predicting the most frequent
# class in the test set. This is the baseline the classifier has to beat.
class_counts = y_test.value_counts()
most_common = class_counts.max()
total = class_counts.sum()
accuracy = most_common / total
print(accuracy)


0.819960861057


In [19]:
# Treating 5-star as the positive class, and with rows = true rating and
# columns = predicted rating in (1-star, 5-star) order:
#   [0, 1] -> true 1-star predicted 5-star (false positive)
#   [1, 0] -> true 5-star predicted 1-star (false negative)
FP, FN = confusion_matrix[0, 1], confusion_matrix[1, 0]

In [20]:
# Show the false-positive and false-negative counts side by side.
FP, FN

(58, 25)

In [21]:
# Shape of the per-class token counts: one row per class, one column per
# vocabulary token.
nb.feature_count_.shape

(2, 16825)

In [22]:
# Token counts accumulated per class during fitting.
nb.feature_count_

array([[ 26.,   4.,   1., ...,   0.,   0.,   0.],
       [ 39.,   5.,   0., ...,   1.,   1.,   1.]])

In [38]:
# The data contains only 1-star and 5-star labels, and 5-star is treated as
# the positive class.
# False negatives: true rating 5, predicted 1 (true label > predicted label).
fn = X_test[y_test > y_pred_class]
# False positives: true rating 1, predicted 5 (true label < predicted label).
fp = X_test[y_test < y_pred_class]
# Observation: the misclassified reviews are all very long.

In [37]:
# First false negative: a 5-star review that the model predicted as 1-star.
fn.iloc[0]

"I now consider myself an Arizonian. If you drive a lot on the 101 or 51 like I do, you'll get your fair share of chips on your windshield. You'll also have to replace a windshield like I had to do just recently. Apparently, chips and cracking windshields  is common in Arizona. In fact, I seem to recall my insurance agent telling me that insurance companies must provide this coverage in Arizona.\n\nI had a chip repaired about a year ago near the very bottom of the windshield. Just recently a small, very fine crack started traveling north on the windshield from the repaired chip (a different vendor repaired the chip). I called these guys over to my house and they said it was too long to fix, so they replaced the whole windshield the next day.\n\nWhat great service, they come out to your residence or place of business to repair or replace your windshield."

In [34]:
# First false positive: a 1-star review that the model predicted as 5-star.
fp.iloc[0]

'This has to be the worst restaurant in terms of hygiene. Two of my friends had food -poisoning after having dinner here. The food is just unhealthy with tons of oil floating on the top of curries, and I am not sure if any health/hygiene code is followed here. \nThe service is poor and the information on its website is incorrect, the owner does not allow dine-in after 9 or 10 even though it says that the restaurant is open till 11. \n\nOne night I saw the owner cleaning the place without gloves and she was nice enough to give us a to-go parcel without cleaning her hands (great example to the servers!). I had a peek inside the kitchen when the door was ajar, and it definitely looked dirty.\n\nI have been a lot of hole-in-the-wall places around this restaurant, including Haji Baba, the Vietnamese place and others, but neither any of my friends nor I have fallen sick coz of the food. If you need a spicy-food fix, i strongly recommend you do not try this place, lest you want a visit to the

In [42]:
# Build a table of how often each vocabulary token was seen in each class
# while fitting the Naive Bayes model.
X_train_tokens = vect.get_feature_names()
print(X_train_tokens[:10])

# The rows of nb.feature_count_ follow nb.classes_, which is in sorted label
# order: row 0 is the 1-star class and row 1 is the 5-star class. The assert
# guards that assumption so the columns below cannot be silently mislabelled.
assert list(nb.classes_) == [1, 5], "expected exactly the 1-star and 5-star classes"
one_star_token_count = nb.feature_count_[0, :]
five_star_token_count = nb.feature_count_[1, :]

# Previously this referenced ham_token_count / spam_token_count, which are
# never defined in this notebook and raise NameError on a fresh kernel.
tokens = pd.DataFrame({'token':X_train_tokens, '5-star':five_star_token_count, '1-star':one_star_token_count}).set_index('token')

['00', '000', '00a', '00am', '00pm', '01', '02', '03', '03342', '04']
