This notebook extracts primary features from the review text. These features are simple summary statistics of the review text, or aspects of the documents that will be removed during tokenisation, such as:
- Raw character count
- Raw word count
- Number of punctuation marks
- Number of capital letters
- Number of numeric characters

In [48]:
# Load NLTK's English stop word list; `_stop_count` reads this module-level name.
from nltk.corpus import stopwords
stops = stopwords.words('english')
# Display the list for inspection.
stops

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [49]:
# `string.punctuation` is a single str of ASCII punctuation characters, not a list;
# `_punc_count` reads this module-level name.
from string import punctuation
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [45]:
def _word_count(text):
    return len(text.split())

def _char_count(text):
    return len(text)

def _avg_word(text):
    words = text.split() 
    return sum(len(word) for word in words)/len(words)

def _stop_count(text):
    return len([word for word in text.split() if word in stops])

def _num_count(text):
    return len([word for word in text.split() if word.isdigit()])

def _upper_count(text):
    return len([word for word in text.split() if word.isupper()])

def _punc_count(text):
    return len([word for word in text.split() if word in punctuation])
    

In [70]:
# Load the pre-processed reviews and derive the summary features from the
# raw review text, one column per extractor.
reviews = io.load('reviews_preproc')
review_text = reviews['review_text']

# Columns are added in this order; the stop word frequency is inserted right
# after the two counts it is derived from.
for column, extractor in [
    ('text_word_count', _word_count),
    ('text_char_count', _char_count),
    ('text_avg_word', _avg_word),
    ('text_stop_count', _stop_count),
]:
    reviews[column] = review_text.apply(extractor)

# Share of tokens that are stop words.
reviews['text_stop_freq'] = reviews['text_stop_count'] / reviews['text_word_count']

for column, extractor in [
    ('text_num_count', _num_count),
    ('text_upper_count', _upper_count),
    ('text_punc_count', _punc_count),
]:
    reviews[column] = review_text.apply(extractor)

reviews.head()

2019-06-15 11:00:06,542 - kedro.io.data_catalog - INFO - Loading data from `reviews_preproc` (PickleLocalDataSet)...


Unnamed: 0_level_0,product_id,author_age,review_title,review_text,star_rating,recommend_flag,upvotes,product_category_division,product_category_department,product_category_class,text_word_count,text_char_count,text_avg_word,text_stop_count,text_stop_freq,text_num_count,text_upper_count,text_punc_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,intimates,intimate,intimates,8,53,5.75,2,0.25,0,0,1
1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,general,dresses,dresses,62,303,3.822581,30,0.483871,0,0,0
2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,general,dresses,dresses,98,500,4.112245,45,0.459184,0,1,1
3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,general petite,bottoms,pants,22,124,4.681818,6,0.272727,0,1,0
4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,general,tops,blouses,36,192,4.361111,19,0.527778,0,0,0


In [72]:
# Summary statistics for the engineered features: `filter(like='text')` keeps
# every column whose name contains 'text'. The string column `review_text`
# also matches but does not appear in the summary shown.
reviews.filter(like = 'text').describe()

Unnamed: 0,text_word_count,text_char_count,text_avg_word,text_stop_count,text_stop_freq,text_num_count,text_upper_count,text_punc_count
count,22641.0,22641.0,22641.0,22641.0,22641.0,22641.0,22641.0,22641.0
mean,60.196679,308.687911,4.194436,28.999867,0.467805,0.318449,0.391105,0.197739
std,28.534612,143.940048,0.391952,15.052407,0.072077,0.728459,0.488009,0.611713
min,2.0,9.0,3.0,0.0,0.0,0.0,0.0,0.0
25%,36.0,186.0,3.945946,16.0,0.434783,0.0,0.0,0.0
50%,59.0,301.0,4.151515,28.0,0.477612,0.0,0.0,0.0
75%,88.0,459.0,4.385965,42.0,0.514563,0.0,1.0,0.0
max,115.0,508.0,8.75,68.0,0.666667,8.0,1.0,8.0


In [1]:
# Load the `text_summary` dataset by name.
# NOTE(review): `io` is not defined in this notebook — presumably the
# Kedro-injected data catalog (the log output names kedro.io.data_catalog); confirm.
text_summary = io.load('text_summary')

2019-06-15 21:28:24,493 - kedro.io.data_catalog - INFO - Loading data from `text_summary` (PickleLocalDataSet)...


In [2]:
# Preview the first rows of the loaded summary.
text_summary.head()

Unnamed: 0_level_0,review_text,text_word_count,text_char_count,text_avg_word,text_stop_count,text_stop_freq,text_num_count,text_upper_count,text_punc_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,Absolutely wonderful - silky and sexy and comf...,8,53,5.75,2,0.25,0,0,1
1,Love this dress! it's sooo pretty. i happene...,62,303,3.822581,30,0.483871,0,0,0
2,I had such high hopes for this dress and reall...,98,500,4.112245,45,0.459184,0,1,1
3,"I love, love, love this jumpsuit. it's fun, fl...",22,124,4.681818,6,0.272727,0,1,0
4,This shirt is very flattering to all due to th...,36,192,4.361111,19,0.527778,0,0,0
