In [1]:
import os
import sys
# Put the parent directory on the import path. Presumably this is what lets
# `from credible import connectors` resolve — TODO confirm the package location.
sys.path.append(os.pardir)

In [2]:
import datetime
import pandas as pd
from sklearn import preprocessing
from credible import connectors

In [3]:
# Obtain the database handle from the project's `connectors` helper; it is the
# connection argument handed to pandas when the reviews table is read.
engine = connectors.connect_to_sqlite()

In [4]:
%%time
# Read the whole `reviews` table into a DataFrame. The recorded run took about
# 1min 45s of wall time, so avoid re-running this cell unnecessarily.
reviews = pd.read_sql_table('reviews', engine)

CPU times: user 50.8 s, sys: 19.5 s, total: 1min 10s
Wall time: 1min 45s


In [5]:
# Peek at two randomly chosen rows to check the columns and value formats.
reviews.sample(2)

Unnamed: 0,_id,review_id,business_id,user_id,stars,date,text,useful,funny,cool
3709390,3709391,p9VaEUU2AHuvOImSP4HxMg,omKLqaC3C8u_-UNf-oaxUg,zl4T9WtZi0TiIo2RFvc4Zw,5,2017-03-25 20:48:11,These guys are great to work with. They do su...,1,0,0
1335044,1335045,b4dRgBXwuJ32nLRsXidTdg,nHejIvfCoRylcrr6JXakIg,c7tkuOoLCrBzfkdC1bvwLQ,5,2017-07-08 18:12:20,Love this place. Had my daughter's bday here. ...,1,0,0


In [6]:
# Work on a 10,000-review subsample of the raw review text.
# The fixed random_state keeps the subsample (and therefore everything derived
# from X, e.g. X.iloc[1]) identical across kernel restarts.
X = reviews.text.sample(10000, random_state=42)

## scikit-learn

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer

In [None]:
# Bag-of-words counts over unigrams to trigrams with English stop words removed.
# The float max_df / min_df values are document-frequency proportions: terms in
# more than 95% or in fewer than 10% of the documents are dropped.
vectorizer = CountVectorizer(ngram_range=(1, 3), max_df=0.95, min_df=0.1, stop_words='english')
# Fit on the sampled reviews and display the resulting sparse count matrix.
vectorizer.fit_transform(X)

In [None]:
# Hash tokens into a fixed 2**10-column space: raw non-negative counts, with no
# sign alternation, no normalisation and no binarisation.
hasher = HashingVectorizer(
    n_features=2**10,
    stop_words='english',
    alternate_sign=False,
    norm=None,
    binary=False,
)
# Display the hashed sparse matrix for the sampled reviews.
hasher.fit_transform(X)

In [None]:
# Get the callable the hasher uses to turn a raw document into tokens, so its
# output can be inspected directly.
analyzer = hasher.build_analyzer()

In [None]:
# Show the tokens the analyzer produces for one randomly drawn review.
print(analyzer(reviews.text.sample(1).iloc[0]))

In [None]:
# TF-IDF weights over unigrams to trigrams, with the same document-frequency
# cut-offs and stop-word list as the count vectorizer.
tfidf = TfidfVectorizer(
    ngram_range=(1, 3),
    max_df=0.95,
    min_df=0.1,
    max_features=None,
    stop_words='english',
    use_idf=True,
)
# Fit on the sampled reviews and display the resulting sparse matrix.
tfidf.fit_transform(X)

In [None]:
# Inspect the fitted term -> column-index mapping to see which n-grams survived
# the document-frequency cut-offs.
print(tfidf.vocabulary_)

## spaCy

In [None]:
sample = nlp(reviews.text.sample(1).iloc[0])

In [None]:
import spacy

In [None]:
nlp = spacy.load("en_core_web_sm")

In [None]:
# One row of token attributes per token in `sample`.
# Records are collected in a list and the frame is built once: DataFrame.append
# returns a new frame rather than mutating in place (so discarding its result
# leaves the frame empty) and it no longer exists in pandas >= 2.0.
token_records = []
for token in sample:
    token_records.append({
        'text': token.text, 'lemma': token.lemma_,
        'pos': token.pos_, 'tag': token.tag_,
        'dep': token.dep_, 'shape': token.shape_,
        'is_alpha': token.is_alpha, 'is_stop': token.is_stop,
    })
aa = pd.DataFrame(token_records)

In [None]:
# Preview the first rows of the per-token attribute table.
aa.head()

In [None]:
# Scratch view of a single token's attributes. `token` is whatever the
# preceding `for token in sample` loop left bound (its final token).
{
    'text': token.text,
    'lemma': token.lemma_,
    'pos': token.pos_,
    'tag': token.tag_,
    'dep': token.dep_,
    'shape': token.shape_,
    'is_alpha': token.is_alpha,
    'is_stop': token.is_stop,
}

In [None]:
# For every noun chunk print: the chunk text, its root token, the root's
# dependency label, and the text of the root's syntactic head.
for chunk in sample.noun_chunks:
    root = chunk.root
    print(chunk.text, root.text, root.dep_, root.head.text)

In [None]:
# List each named entity with its character offsets and entity label.
for ent in sample.ents:
    entity_fields = (ent.text, ent.start_char, ent.end_char, ent.label_)
    print(*entity_fields)

In [None]:
# Render the named entities of `sample` inline in the notebook output.
from spacy import displacy
displacy.render(sample, style="ent", jupyter=True)

In [None]:
#displacy.render(sample, style='dep', jupyter=True)

## Feature Creation

In [19]:
from textblob import TextBlob
import spacy

from textblob.np_extractors import ConllExtractor
# NOTE(review): `extractor` is not passed to any TextBlob in the cells visible
# here — confirm whether it is still needed.
extractor = ConllExtractor()

# Rebinds `nlp` to the large English pipeline, replacing the "en_core_web_sm"
# pipeline loaded earlier in the notebook.
nlp = spacy.load("en_core_web_lg")
nlp

<spacy.lang.en.English at 0x1da1314d0>

In [22]:
# Wrap the second sampled review (X.iloc[1]) in a TextBlob; the following cells
# query this object.
blob = TextBlob(X.iloc[1])
blob

TextBlob("Awful. First off our server seemed like she hates her life. Terrible service. Second their prices are very over priced, for food our server confirmed they don't make in house! The lobster she said is already in the ravioli that they don't make, she said their bread isn't even made in house so she couldn't tell me if there was dairy in it! An Italian restaurant who doesn't even make their own bread!! What a joke. We had to send back the first pasta they brought out because it looked like a frozen microwave dinner on a plate. No wonder they are on Groupon. Stay away. Bad food, bad service.")

In [39]:
# Spell-check the word at index 10 of the blob; the result is a list of
# (candidate, confidence) pairs.
blob.words[10].spellcheck()

[('life', 1.0)]

In [40]:
# Sentiment polarity of the whole review (TextBlob reports it in [-1.0, 1.0]).
blob.polarity

-0.09479166666666665

In [33]:
# Subjectivity of the whole review (TextBlob reports it in [0.0, 1.0]).
blob.subjectivity

0.525

In [26]:
# `blob` is built without an `np_extractor`, so this uses TextBlob's default
# noun-phrase extractor (not the ConllExtractor instantiated in the first
# Feature Creation cell).
blob.noun_phrases

WordList(['awful', 'terrible', 'italian restaurant', 'own bread', 'frozen microwave dinner', 'groupon', 'bad service'])

In [13]:
# Run the same review (X.iloc[1]) through the spaCy pipeline bound to `nlp`.
doc = nlp(X.iloc[1])
doc

Awful. First off our server seemed like she hates her life. Terrible service. Second their prices are very over priced, for food our server confirmed they don't make in house! The lobster she said is already in the ravioli that they don't make, she said their bread isn't even made in house so she couldn't tell me if there was dairy in it! An Italian restaurant who doesn't even make their own bread!! What a joke. We had to send back the first pasta they brought out because it looked like a frozen microwave dinner on a plate. No wonder they are on Groupon. Stay away. Bad food, bad service.

In [16]:
# Language code reported for the parsed document.
doc.lang_

'en'

In [18]:
# NOTE(review): the recorded value is 0.0 — presumably no component in the
# loaded pipeline sets a sentiment score; confirm before using this as a feature.
doc.sentiment

0.0