In [2]:
import nltk
import numpy as np
import pandas as pd
import connect_aws_db as cadb

In [42]:
import string
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
engine = cadb.connect_aws_db(write_unicode=True)

In [7]:
cmd = "SELECT review_rating, review_text FROM bf_reviews"

In [8]:
bfdf = pd.read_sql_query(cmd, engine)

In [9]:
bfdf

Unnamed: 0,review_rating,review_text
0,5,Really nice property. Great walking areas for ...
1,5,We've stayed at this La Quinta several times w...
2,5,This place was awesome!! The entire staff was ...
3,5,We've stayed at this hotel two times with our ...
4,4,My room while a bit small was very clean and t...
5,3,The big gentleman who manages the front desk i...
6,1,"I was driving from So Cal to Houston, TX for t..."
7,2,the hotel was fine but it is all hard scape ar...
8,4,Hotel has a grass area and dog potty station. ...
9,3,My room in general was in need of a lot of gen...


In [45]:
bfreviews = ('').join(bfdf['review_text'].values)

In [46]:
bfreviews[:500]

u"Really nice property. Great walking areas for the pets. Centrally located. A lot of swimming pools to use. A water park for the kids. Pets are not allowed in the water park of course. The casita room was nice, but there was no table, making it hard to eat anything with the dogs there. Casita rooms are small. Overall, very nice!We've stayed at this La Quinta several times with our two mini schnauzers. The staff is all very friendly. Rooms were clean but not overly fancy - we felt like we didn't n"

In [47]:
bftokens = nltk.wordpunct_tokenize(bfreviews)

In [48]:
bftext = nltk.Text(bftokens)

In [50]:
bfwords = [w.lower() for w in bftext]

In [51]:
bfvocab = sorted(set(bfwords))

In [52]:
len(bfvocab)

412

### TFIDF on BF Reviews

The number of words in the bringfido sample corpus:

In [54]:
len(bfwords)

1069

In [55]:
count_vect = CountVectorizer()
bf_train_counts = count_vect.fit_transform(bfwords)
bf_train_counts.shape


(1069, 394)

In [56]:
tfidf_transformer = TfidfTransformer()
bf_train_tfidf = tfidf_transformer.fit_transform(bf_train_counts)
bf_train_tfidf.shape

(1069, 394)

### TFIDF on Yelp Reviews

First I want to import all of the Yelp review text, and perform a TFIDF on those reviews:

In [57]:
cmd = "SELECT review_rating, review_text FROM yelp_reviews"

In [58]:
yelpdf = pd.read_sql_query(cmd, engine)

In [59]:
len(yelpdf)

6263

In [60]:
yelpreviews = ('').join(yelpdf['review_text'].values)
yelptokens = nltk.wordpunct_tokenize(yelpreviews)
yelptext = nltk.Text(yelptokens)
yelpwords = [w.lower() for w in yelptext]

In [61]:
yelp_count_vect = CountVectorizer()
yelp_train_counts = yelp_count_vect.fit_transform(yelpwords)
yelp_train_counts.shape


(1244957, 22909)

In [62]:
yelp_tfidf_transformer = TfidfTransformer()
yelp_train_tfidf = yelp_tfidf_transformer.fit_transform(yelp_train_counts)
yelp_train_tfidf.shape

(1244957, 22909)

In [64]:
from sklearn.metrics import jaccard_similarity_score

In [66]:
#jaccard_similarity_score(bf_train_tfidf, yelp_train_tfidf)

### Examples

In [22]:
path = './tf-idf'
token_dict = {}


def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = []
    for item in tokens:
        stems.append(PorterStemmer().stem(item))
    return stems

In [24]:
categories = ['alt.atheism', 'soc.religion.christian',
              'comp.graphics', 'sci.med']

from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train',
    categories=categories, shuffle=True, random_state=42)




In [None]:
for dirpath, dirs, files in os.walk(path):
    for f in files:
        fname = os.path.join(dirpath, f)
        print "fname=", fname
        with open(fname) as pearl:
            text = pearl.read()
            token_dict[f] = text.lower().translate(None, string.punctuation)

tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
tfs = tfidf.fit_transform(token_dict.values())

str = 'all great and precious things are lonely.'
response = tfidf.transform([str])
print response

feature_names = tfidf.get_feature_names()
for col in response.nonzero()[1]:
    print feature_names[col], ' - ', response[0, col]

In [43]:
#twenty_train.data

In [None]:
yelp_count_vect = CountVectorizer()
yelp_train_counts = yelp_count_vect.fit_transform(twenty_train.data)
X_train_counts.shape


In [26]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape


(2257, 35788)

In [28]:
X_train_counts[0:3, 0:3]

<3x3 sparse matrix of type '<type 'numpy.int64'>'
	with 0 stored elements in Compressed Sparse Row format>