In [1]:
import os
import sys
# os.pardir is '..': add the parent of the working directory to the import
# path (presumably so the `credible` package imported below resolves - confirm).
sys.path.append(os.pardir)

In [2]:
import datetime
import pandas as pd
from sklearn import preprocessing
from credible import connectors
from credible.utils import progressbar2
from credible.text_features import Nlp, nlp

In [3]:
# Database handle shared by the rest of the notebook: it is used to read the
# `reviews` table and to query/append rows in `review_linguistics`.
engine = connectors.connect_to_sqlite()

In [4]:
%%time
reviews = pd.read_sql_table('reviews', engine)

CPU times: user 49.8 s, sys: 20.2 s, total: 1min 10s
Wall time: 1min 35s


## Sample

In [5]:
# Pick one review text to demonstrate the Nlp feature extractors on.
# A fixed random_state makes Restart & Run All select the same review every
# time; without it each run shows different values in the cells that follow.
sample = reviews.text.sample(n=1, random_state=42).iloc[0]

In [6]:
# Wrap the sample text in the project's Nlp helper, passing the imported `nlp`
# object; the bare `rf` on the last line displays the helper's repr.
rf = Nlp(text=sample, nlp_object=nlp)
rf

<Nlp lang=en sc: 3 wc: 31 rl: 0.8387096774193549 text: great price checked ...>

In [7]:
# Validity gate: the table-building loop further down only keeps a review when
# this call is truthy.
rf.validate_text()

True

In [8]:
# Language code detected for the text; the table-building loop further down
# only keeps reviews for which this equals 'en'.
rf.detect_language()

'en'

In [9]:
# Numeric polarity score; written to the `sentiment_polarity` column of
# review_linguistics by the loop further down.
rf.sentiment_polarity()

0.5966666666666667

In [10]:
# Numeric subjectivity score; written to the `sentiment_subjectivity` column
# of review_linguistics by the loop further down.
rf.sentiment_subjectivity()

0.5666666666666667

In [11]:
# Sentence count (presumably the `sc` value shown in the Nlp repr - confirm);
# stored as `count_sentences`.
rf.count_sentences()

3

In [12]:
# Word count (presumably the `wc` value shown in the Nlp repr - confirm);
# stored as `count_words`.
rf.count_words()

31

In [13]:
# Number of noun phrases; stored as `count_noun_phrases`. The phrases
# themselves come from get_noun_phrases().
rf.count_noun_phrases()

9

In [14]:
# Per part-of-speech tag counts; stored as `count_pos`. The recorded output is
# a string (a JSON-style list of {tag: count} objects), not a Python list.
# NOTE(review): in the recorded run the tag counts add up to 61 while
# count_words() reports 31 - presumably different tokenizations; confirm.
rf.count_pos()

'[{"ADJ": 7}, {"NOUN": 10}, {"VERB": 10}, {"PRON": 8}, {"ADP": 5}, {"PROPN": 3}, {"AUX": 8}, {"ADV": 3}, {"PART": 2}, {"DET": 3}, {"SPACE": 1}, {"CCONJ": 1}]'

In [15]:
# Average sentence length; stored as `avg_len_sentences`.
# NOTE(review): the unit (words vs. characters) is not visible here - confirm.
rf.avg_len_sentences()

20.333333333333332

In [16]:
# Average word length; stored as `avg_len_words`.
rf.avg_len_words()

5.451612903225806

In [17]:
# Lexical ratio (presumably the `rl` value shown in the Nlp repr - confirm);
# stored as `ratio_lexical`.
rf.ratio_lexical()

0.8387096774193549

In [18]:
# Content ratio; stored as `ratio_content`.
rf.ratio_content()

0.03225806451612903

In [19]:
# The noun phrases themselves; stored as `noun_phrases`. The recorded output is
# a single string (a JSON-style list), not a Python list.
rf.get_noun_phrases()

'["great price", "online dealer", "lg", "online shipping", "fast tv", "white glove installation service", "n\'t pay", "tv", "online dealer"]'

## Create Review Linguistics Table

In [None]:
# Build / extend the `review_linguistics` table: one row of Nlp features for
# every valid English review that is not stored there yet.
tot = len(reviews)

# Fetch the ids that are already stored ONCE, instead of issuing one
# string-formatted SELECT per review (millions of round trips, and a query that
# breaks on any id containing a quote character).
existing_ids = set(
    pd.read_sql_query(
        'select review_id from review_linguistics', engine)['review_id'])

# Rows written during this run. They are collected as plain dicts and turned
# into a DataFrame once at the end: growing a frame with DataFrame.append in
# the loop copies it on every iteration, and that method was removed in
# pandas 2.0.
records = []

try:
    # Only the index and two columns are needed, so iterate over those directly
    # rather than materialising a full Series per row with iterrows().
    for ind, review_id, text in zip(
            reviews.index, reviews['review_id'], reviews['text']):
        # Progress text reports the shape of the rows collected so far.
        shape = (len(records), len(records[0]) if records else 0)
        progressbar2(tot - 1, ind, f't: {tot} c: {ind} df: {shape}')

        if review_id in existing_ids:
            # Already processed, either by an earlier run or as a duplicate id.
            continue

        rf = Nlp(text=text, nlp_object=nlp)
        if not (rf.validate_text() and rf.detect_language() == 'en'):
            # Invalid or non-English text is not stored.
            continue

        record = {
            '_id': ind + 1,
            'review_id': review_id,
            'sentiment_polarity': rf.sentiment_polarity(),
            'sentiment_subjectivity': rf.sentiment_subjectivity(),
            'ratio_content': rf.ratio_content(),
            'ratio_lexical': rf.ratio_lexical(),
            'avg_len_sentences': rf.avg_len_sentences(),
            'avg_len_words': rf.avg_len_words(),
            'count_noun_phrases': rf.count_noun_phrases(),
            'count_words': rf.count_words(),
            'count_sentences': rf.count_sentences(),
            'count_pos': rf.count_pos(),
            'noun_phrases': rf.get_noun_phrases(),
        }

        # Persist each row immediately so an interrupted run can be resumed
        # from where it stopped.
        pd.DataFrame([record]).to_sql(
            'review_linguistics', con=engine, index=False, if_exists='append')
        records.append(record)
        existing_ids.add(review_id)
finally:
    # Runs even when the loop is interrupted, so `df` always holds the rows
    # that were written during this session.
    df = pd.DataFrame(records)

t: 6685900 c: 204598 df: (177300, 13) [#-------------------] 3% 

In [None]:
# Size of the rows collected by the loop above during this session. Reviews
# that were already in `review_linguistics` are skipped, so this is not the
# size of the whole table.
df.shape

In [None]:
# Spot-check two randomly chosen rows from this session's results.
df.sample(n=2)