In [1]:
import os
import sys
# Put the parent directory (os.pardir, i.e. "..") on the module search path.
# NOTE(review): presumably this is what makes the `credible` package imported
# in the next cell resolvable when the notebook runs from its own folder --
# confirm against the repository layout.
sys.path.append(os.pardir)

In [2]:
import datetime
import pandas as pd
from sklearn import preprocessing
from credible import connectors
from credible.utils import progressbar2
from credible.text_features import Nlp, nlp

In [3]:
# Database handle used below both to read the `reviews` table and to
# query / append to `review_linguistics`.
engine = connectors.connect_to_sqlite()

In [4]:
%%time
# Read the whole `reviews` table into a DataFrame in one go. This is the slow
# step of the notebook: the recorded run reports a wall time of 1min 43s.
reviews = pd.read_sql_table('reviews', engine)

CPU times: user 53.5 s, sys: 24.5 s, total: 1min 18s
Wall time: 1min 43s


## Sample

In [5]:
# Pick the text of one randomly chosen review. No random_state is passed, so
# a re-run will generally select a different review than the one shown below.
sample = reviews['text'].sample(n=1).iat[0]

In [6]:
# Wrap the sampled text in the project's Nlp helper, passing the shared `nlp`
# object imported from credible.text_features.
rf = Nlp(sample, nlp_object=nlp)
# Display the repr; in the recorded output it summarises lang, sc, wc, rl and
# the beginning of the text.
rf

<Nlp lang=en sc: 10 wc: 46 rl: 0.8913043478260869 text: Great beers--so many...>

In [7]:
# First half of the filter the table-building loop applies before computing
# features; True for this sample.
rf.validate_text()

True

In [8]:
# Language code of the text as a string ('en' here); the table-building loop
# only keeps reviews for which this equals 'en'.
rf.detect_language()

'en'

In [9]:
# Sentiment polarity score of the sample (a float, see output).
rf.sentiment_polarity()

0.3038461538461539

In [10]:
# Sentiment subjectivity score of the sample (a float, see output).
rf.sentiment_subjectivity()

0.6568376068376067

In [11]:
# Number of sentences; equals the `sc` field shown in the repr of `rf`.
rf.count_sentences()

10

In [12]:
# Number of words; equals the `wc` field shown in the repr of `rf`.
rf.count_words()

46

In [13]:
# Number of noun phrases. For this sample it equals the number of entries
# listed by get_noun_phrases() below, repeated phrases included.
rf.count_noun_phrases()

17

In [14]:
# Part-of-speech tag counts. Note the result is a string, not a dict: the
# recorded output looks like a JSON-encoded list of one-key {tag: count}
# objects, which is the form stored in the `count_pos` column later on.
rf.count_pos()

'[{"ADJ": 17}, {"NOUN": 19}, {"PUNCT": 18}, {"ADV": 4}, {"PART": 4}, {"VERB": 3}, {"ADP": 6}, {"CCONJ": 3}, {"AUX": 8}, {"PRON": 4}, {"DET": 8}, {"SCONJ": 1}, {"SPACE": 5}, {"PROPN": 5}, {"INTJ": 2}]'

In [15]:
# Average sentence length.
# NOTE(review): for this sample 10.7 == 107 / 10, where 107 is the sum of the
# count_pos() tallies (punctuation and whitespace tokens included) and 10 is
# count_sentences(); so this looks like tokens per sentence rather than
# count_words() per sentence -- confirm in credible.text_features.
rf.avg_len_sentences()

10.7

In [16]:
# Average word length for the sample (a float, see output).
rf.avg_len_words()

5.565217391304348

In [17]:
# Lexical ratio; same value as the `rl` field shown in the repr of `rf`.
rf.ratio_lexical()

0.8913043478260869

In [18]:
# Content ratio for the sample (a float); the exact definition lives in
# credible.text_features and is not visible from this notebook.
rf.ratio_content()

0.10869565217391304

In [19]:
# The noun phrases themselves. Like count_pos(), the result is a single
# string (it looks like a JSON-encoded list in the recorded output), and
# repeated phrases are kept.
rf.get_noun_phrases()

'["great beers", "helpful servers", "taste test", "friendly", "knowledgeable servers", "huge", "taco sampler", "orange peel chicken", "bit pricey", "chain restaurant", "been", "vegas", "cities ...", "fine ...", "wow", "dandy service", "great beers"]'

## Create Review Linguistics Table

In [None]:
# Build the `review_linguistics` table one review at a time.
#
# Every accepted review is written to the database straight away, and reviews
# that already have a row are skipped, so the cell can be interrupted and
# re-run without duplicating work. `df` ends up holding the rows written
# during this run only.
tot = len(reviews)
records = []  # feature rows written so far in this run

# Parameterised existence check: the review id is bound by the driver instead
# of being formatted into the SQL text, so an id containing a quote can no
# longer break (or alter) the statement.
# NOTE(review): `?` is the sqlite3 "qmark" placeholder; this relies on
# `engine.execute` accepting a raw SQL string plus a positional-parameter
# tuple, as a SQLAlchemy 1.x engine on SQLite does -- confirm against
# `connectors.connect_to_sqlite`.
exists_query = 'select 1 from review_linguistics where review_id = ? limit 1'

try:
    for ind, row in reviews.iterrows():
        # Same "(rows, columns)" text the progress bar printed when `df` was
        # grown in place: (0, 0) before the first row, (n, n_features) after.
        shape = (len(records), len(records[0]) if records else 0)
        progressbar2(tot - 1, ind, f't: {tot} c: {ind} df: {shape}')

        already_stored = engine.execute(
            exists_query, (row.review_id,)).scalar() is not None
        if already_stored:
            continue

        rf = Nlp(text=row.text, nlp_object=nlp)
        # Only texts that pass validation and are detected as English get a row.
        if not (rf.validate_text() and rf.detect_language() == 'en'):
            continue

        record = {
            # assumes `reviews` has the default 0-based integer index, making
            # `_id` a 1-based row number -- TODO confirm
            '_id': ind + 1,
            'review_id': row.review_id,
            'sentiment_polarity': rf.sentiment_polarity(),
            'sentiment_subjectivity': rf.sentiment_subjectivity(),
            'ratio_content': rf.ratio_content(),
            'ratio_lexical': rf.ratio_lexical(),
            'avg_len_sentences': rf.avg_len_sentences(),
            'avg_len_words': rf.avg_len_words(),
            'count_noun_phrases': rf.count_noun_phrases(),
            'count_words': rf.count_words(),
            'count_sentences': rf.count_sentences(),
            'count_pos': rf.count_pos(),
            'noun_phrases': rf.get_noun_phrases(),
        }
        # Persist first, then remember the row: `records` never contains a
        # row that failed to reach the database.
        pd.DataFrame([record]).to_sql(
            'review_linguistics', con=engine, index=False, if_exists='append')
        records.append(record)
finally:
    # Build the frame once instead of calling DataFrame.append per row, which
    # copies the whole frame each time and no longer exists in pandas 2.x.
    # Doing it in `finally` keeps `df` usable after a manual interrupt.
    df = pd.DataFrame(records)

t: 6685900 c: 20165 df: (1941, 13) [--------------------] 0% 

In [None]:
# (rows, columns) of the feature rows collected by the loop in this session;
# reviews that were already in `review_linguistics` are not included.
df.shape

In [None]:
# Spot-check two randomly chosen feature rows (needs at least two rows in `df`).
df.sample(2)