# Dataset Overview (for Amazon Reviews, Essays)

In [4]:
import pandas as pd
import numpy as np
import re

# Directories that hold the raw dataset files. The trailing slash matters:
# the loaders below build file paths by plain string concatenation.
reviews_parent_dir = 'datasets/reviews/'
essays_parent_dir = 'datasets/asap-aes/'

def load_reviews(files=None, nrows_per_type=None):
    """Read the review files and stack them into a single DataFrame.

    Parameters
    ----------
    files : iterable of str, optional
        File names relative to ``reviews_parent_dir``. When empty or None,
        the four bundled 5-core category files are used.
    nrows_per_type : int, optional
        Forwarded to ``pd.read_json`` as ``nrows``, i.e. the maximum number
        of lines read from each file. None reads every line.

    Returns
    -------
    pandas.DataFrame
        All files concatenated, with a ``type`` column holding the part of
        the file name before the first ``-``. The final ``reset_index()``
        keeps each row's previous per-file position as an ``index`` column.
    """
    default_files = [
        'books-5core.json',
        'cellphones_and_accessories-5core.json',
        'office_products-5core.json',
        'tools_and_home_improvement-5core.json'
    ]
    selected = files if files else default_files

    def _read_one(filename):
        # One JSON-lines file -> one frame tagged with its category name.
        frame = pd.read_json(reviews_parent_dir + filename, lines=True, nrows=nrows_per_type)
        frame['type'] = filename.split('-')[0]
        return frame

    combined = pd.concat([_read_one(filename) for filename in selected])
    return combined.reset_index()

def load_essays(train=True, valid=True, test=True):
    """Lazily load the requested essay splits.

    Each flag selects one split file under ``essays_parent_dir``. The return
    value is a generator that yields one DataFrame per selected split, always
    in train, valid, test order; a file is only read when the generator
    reaches it.
    """
    split_files = (
        (train, 'train_set.json'),
        (valid, 'valid_set.json'),
        (test, 'test_set.json'),
    )
    wanted = [filename for flag, filename in split_files if flag]
    return (pd.read_json(essays_parent_dir + filename, lines=False) for filename in wanted)

# Amazon Reviews

In [2]:
# Read at most 1000 rows from each review file (nrows_per_type is passed to
# pd.read_json as nrows), giving one combined frame for the cells below.
reviews_data = load_reviews(nrows_per_type=1000)

In [3]:
# Preview the first rows of the combined reviews frame.
reviews_data.head()

Unnamed: 0,index,overall,vote,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,image,type
0,0,5,67.0,True,"09 18, 1999",AAP7PPBU72QFM,151004714,{'Format:': ' Hardcover'},D. C. Carrad,This is the best novel I have read in 2 or 3 y...,A star is born,937612800,,books
1,1,3,5.0,True,"10 23, 2013",A2E168DTVGE6SV,151004714,{'Format:': ' Kindle Edition'},Evy,"Pages and pages of introspection, in the style...",A stream of consciousness novel,1382486400,,books
2,2,5,4.0,False,"09 2, 2008",A1ER5AYS3FQ9O3,151004714,{'Format:': ' Paperback'},Kcorn,This is the kind of novel to read when you hav...,I'm a huge fan of the author and this one did ...,1220313600,,books
3,3,5,13.0,False,"09 4, 2000",A1T17LMQABMBN5,151004714,{'Format:': ' Hardcover'},Caf Girl Writes,What gorgeous language! What an incredible wri...,The most beautiful book I have ever read!,968025600,,books
4,4,3,8.0,True,"02 4, 2000",A3QHJ0FXK33OBE,151004714,{'Format:': ' Hardcover'},W. Shane Schmidt,I was taken in by reviews that compared this b...,A dissenting view--In part.,949622400,,books


### Sample Reviews

In [5]:
def show_reviews(nsample=1):
    """Print ``nsample`` randomly sampled reviews for every review type.

    Reads the module-level ``reviews_data`` frame. For each ``type`` group a
    banner line is printed, followed by each sampled review's ``overall``
    score and ``reviewText`` and a divider line.
    """
    banner = '---------------------------------- ({t}) --------------------------------'
    divider = "---------------------------------------------------------------------------"
    for _, type_rows in reviews_data.groupby('type'):
        # Renumber from 0 so the first row of the group can be addressed as [0].
        type_rows = type_rows.reset_index()
        print(banner.format(t=type_rows['type'][0]))
        picked = type_rows.sample(n=nsample)
        for _, review in picked.iterrows():
            print('\n(Score: {s})  {r}\n'.format(s=review['overall'], r=review['reviewText']))
            print(divider)

show_reviews(2)

---------------------------------- (books) --------------------------------

(Score: 5)  Well constructed wall mount for TV sets up to 37 inches. The mounting  plate has numerous holes for varius sets. Plenty of extra hardware provided. Price was about 1/8th of what it would cost on the open market. I recommend it highly.

---------------------------------------------------------------------------

(Score: 5)  Purchased for my sister to hang her TV on a wall and she and her boyfriend reported back to me that mount works great with the 42" Vizio TV she got for Christmas and was up securely in no time.  I looked around in the retail stores and found that this mount was of similar quality at a much lower price.  What can I say, it does what it is supposed to do, was easy to setup, and is of good value.

---------------------------------------------------------------------------
---------------------------------- (cellphones_and_accessories) --------------------------------

(Score: 5)  My

### Summary Statistics

In [6]:
# Vocabulary and length statistics: all reviews combined first, then each
# product type on its own.
dataset_names = ['all', *reviews_data['type'].unique()]
for dataset in dataset_names:
    if dataset == 'all':
        data = reviews_data
    else:
        data = reviews_data[reviews_data['type'] == dataset]

    # Tokens are collected exactly as written: nothing here lowercases the
    # text, so the counts below are case-sensitive.
    word_tokens = set()  # distinct whitespace-delimited tokens
    char_tokens = set()  # distinct characters
    sum_rev_len = 0      # total characters over the string-valued reviews
    for text in data['reviewText']:
        if type(text) != str:
            # Non-string entries contribute no characters or tokens, but they
            # are still counted in len(data) when the average is computed.
            continue
        sum_rev_len += len(text)
        word_tokens.update(re.split(r'\s{1,}', text))
        char_tokens.update(text)

    review_count = len(data)
    print("------------- ({s}) --------------".format(s=dataset))
    print("# Reviews:  ", review_count)
    print("# Unique words:  ", len(word_tokens))
    print("# Unique characters:  ", len(char_tokens))
    print("Average Review Length:  ", sum_rev_len / review_count, "chars")
    print()

# Score columns: per-type means, then the observed values and range.
print()
print("------------------- Score Summaries -------------------")
mean_scores_by_type = reviews_data.groupby('type')[['overall', 'vote']].mean()
print(mean_scores_by_type)
print()
overall_values = sorted(reviews_data['overall'].unique())
print("Possible values for 'overall':\t", overall_values)
votes = reviews_data['vote']
print("Range of values for 'vote':\t[{mn}, {mx}]".format(mn=votes.min(), mx=votes.max()))

------------- (all) --------------
# Reviews:   4000
# Unique words:   22045
# Unique characters:   92
Average Review Length:   296.8255 chars

------------- (books) --------------
# Reviews:   1000
# Unique words:   8777
# Unique characters:   90
Average Review Length:   311.685 chars

------------- (cellphones_and_accessories) --------------
# Reviews:   1000
# Unique words:   8921
# Unique characters:   91
Average Review Length:   303.886 chars

------------- (office_products) --------------
# Reviews:   1000
# Unique words:   8803
# Unique characters:   91
Average Review Length:   286.033 chars

------------- (tools_and_home_improvement) --------------
# Reviews:   1000
# Unique words:   6721
# Unique characters:   89
Average Review Length:   285.698 chars


------------------- Score Summaries -------------------
                            overall       vote
type                                          
books                         4.421  10.009615
cellphones_and_accessories    

# Essays

This dataset contains 8 different essay sets, each associated with a particular prompt.
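* Personally identifying information has been replaced with NER (Named Entity Recognition) tags for entity types such as "Person" and "Organization"; in the essay text they appear as numbered placeholders, e.g. `@ORGANIZATION1` or `@LOCATION1`.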

## Loading (and cleaning) Data

* As loaded, all of the score columns have dtype `object`, and missing values are stored as empty strings (`''`).

 **Steps for Cleaning Data:**
 
   1) Replace `''` with NaN (NULL)
 
   2) Convert score columns to pandas' nullable `Int64` dtype (a plain `int64` column cannot hold missing values)

In [5]:
# load_essays() yields the splits in (train, valid, test) order.
essays_train, essays_valid, essays_test = load_essays()

# Identifier, set and text columns are left untouched by the numeric conversion.
non_numeric_columns = ['essay_id', 'essay_set', 'essay']
for df in (essays_train, essays_valid, essays_test):
    # Step 1: empty strings stand for missing values -> NaN.
    df.replace('', np.nan, inplace=True)
    # Step 2: every other column becomes nullable Int64; values that cannot be
    # parsed as numbers are coerced to missing.
    for col in df.columns:
        if col in non_numeric_columns:
            continue
        df[col] = pd.to_numeric(df[col], errors='coerce').astype('Int64')

### Train:

In [10]:
# Preview the first rows of the cleaned training split.
essays_train.head()

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,...,rater2_trait3,rater2_trait4,rater2_trait5,rater2_trait6,rater3_trait1,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6
0,1,1,"Dear local newspaper, I think effects computer...",4,4,,8,,,,...,,,,,,,,,,
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,,9,,,,...,,,,,,,,,,
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4,3,,7,,,,...,,,,,,,,,,
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5,5,,10,,,,...,,,,,,,,,,
4,5,1,"Dear @LOCATION1, I know having computers has a...",4,4,,8,,,,...,,,,,,,,,,


### Valid:

In [10]:
# Preview the validation split; the saved preview shows prediction-id columns
# instead of rater score columns.
essays_valid.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_predictionid,domain2_predictionid
0,1788,1,"Dear @ORGANIZATION1, @CAPS1 more and more peop...",1788,
1,1789,1,Dear @LOCATION1 Time @CAPS1 me tell you what I...,1789,
2,1790,1,"Dear Local newspaper, Have you been spending a...",1790,
3,1791,1,"Dear Readers, @CAPS1 you imagine how life woul...",1791,
4,1792,1,"Dear newspaper, I strongly believe that comput...",1792,


### Test:

In [11]:
# Preview the test split.
# NOTE(review): the saved preview shows essay_set and domain1_predictionid as
# floats (1.0, 2383.0) while the validation preview shows integers. The cleaning
# cell skips essay_set but casts the remaining columns to Int64, so this output
# may be stale -- re-run on a fresh kernel to confirm.
essays_test.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_predictionid,domain2_predictionid
0,2383,1.0,I believe that computers have a positive effec...,2383.0,
1,2384,1.0,"Dear @CAPS1, I know some problems have came up...",2384.0,
2,2385,1.0,"Dear to whom it @MONTH1 concern, Computers are...",2385.0,
3,2386,1.0,"Dear @CAPS1 @CAPS2, @CAPS3 has come to my atte...",2386.0,
4,2387,1.0,"Dear Local newspaper, I think that people have...",2387.0,


### Ratings Overview (Train)

In [95]:
# Everything except the identifier, set and text columns is treated as a score column.
non_score_columns = ('essay_id', 'essay_set', 'essay')
score_columns = [name for name in essays_train.columns if name not in non_score_columns]
essays_train[score_columns].describe()

Unnamed: 0,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,rater1_trait1,rater1_trait2,rater1_trait3,...,rater2_trait3,rater2_trait4,rater2_trait5,rater2_trait6,rater3_trait1,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6
count,12977.0,12977.0,128.0,12977.0,1800.0,1800.0,1800.0,2292.0,2292.0,2292.0,...,2292.0,2292.0,723.0,723.0,128.0,128.0,128.0,128.0,128.0,128.0
mean,4.12684,4.137089,37.828125,6.799723,3.333889,3.330556,3.333889,2.444154,2.557592,2.606457,...,2.635689,2.710297,3.777317,3.589212,3.945312,3.890625,4.078125,3.992188,3.84375,3.617188
std,4.212537,4.26432,5.240829,8.970558,0.729103,0.726807,0.729103,1.21173,1.061076,1.098196,...,1.142566,1.045795,0.689401,0.693256,0.643668,0.63039,0.622535,0.509687,0.538845,0.603417
min,0.0,0.0,20.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,2.0,2.0,2.0,3.0,2.0,2.0
25%,2.0,2.0,36.0,2.0,3.0,3.0,3.0,2.0,2.0,2.0,...,2.0,2.0,3.0,3.0,4.0,4.0,4.0,4.0,4.0,3.0
50%,3.0,3.0,40.0,3.0,3.0,3.0,3.0,2.0,2.0,2.0,...,2.0,3.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
75%,4.0,4.0,40.0,8.0,4.0,4.0,4.0,3.0,3.0,4.0,...,4.0,3.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
max,30.0,30.0,50.0,60.0,4.0,4.0,4.0,6.0,6.0,6.0,...,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,5.0,5.0


### Ratings Overview (by Group)

In [53]:
# Per-essay-set summary of every score column. 'essay_set' stays in the
# selection because it is the grouping key.
score_columns = [c for c in essays_train.columns if c not in ['essay_id', 'essay']]
metrics = ['count', 'mean', 'min', 'max']
# How each statistic is rendered: counts/min/max as integers, means rounded to
# two decimals. np.round replaces np.round_, which was removed in NumPy 2.0.
op_dict = {
    'count': lambda x: x.astype(int),
    'min': lambda x: x.astype(int),
    'max': lambda x: x.astype(int),
    'mean': lambda x: np.round(x, 2)
}
def reformat(x, metric):
    """Format one summary value for printing.

    Returns the string "None" when the value is missing; otherwise applies
    the formatter registered for `metric` in `op_dict`.
    """
    x = np.array(x)
    return "None" if pd.isnull(x) else op_dict[metric](x)

# Compute the grouped summary once and keep it under its own name. The loop
# previously indexed a variable `df` that this cell never defined, so it only
# worked with leftover kernel state and failed on Restart & Run All.
set_summary = essays_train[score_columns].groupby('essay_set').describe()

print("\t\t\tSet 1\t Set 2\t Set 3\t Set 4\t Set 5\t Set 6\t Set 7\t Set 8")
print("\t\t\t---------------------------------------------------------------")
current_column = None
# Iterating a DataFrame yields its column labels; for this summary each label
# is a (score column, statistic) tuple.
for entry in set_summary:
    if current_column != entry[0]:
        # Blank line between the rows of different score columns.
        print()
        current_column = entry[0]
    if entry[1] in metrics:
        score_params = {'s'+str(set_id): reformat(set_summary[entry][set_id], entry[1]) for set_id in range(1, 9)}
        print(("{t} ({m}):\t{s1} \t{s2} \t{s3} \t{s4} \t{s5} \t{s6} \t{s7} \t{s8}").format(
            t=entry[0], m=entry[1], **score_params))
        
# NOTE:  'count' is how many non-NULL rows
        
# -------------------- OBSERVATIONS --------------------
#
# -  All Sets Have:   'rater1_domain1', 'rater2_domain1', and 'domain1_score'
# -  Set 8 has all scores except in domain 2
#
#    Given that only these three columns ('rater1_domain1', 'rater2_domain1', 'domain1_score') have values in every
#    set category, it's best to choose the column relating to the overall score among these three ('domain1_score')
#    as the score we'll use for our label.

			Set 1	 Set 2	 Set 3	 Set 4	 Set 5	 Set 6	 Set 7	 Set 8
			---------------------------------------------------------------

rater1_domain1 (count):	1783 	1800 	1726 	1771 	1805 	1800 	1569 	723
rater1_domain1 (mean):	4.26 	3.42 	1.74 	1.32 	2.22 	2.56 	8.02 	18.34
rater1_domain1 (min):	1 	1 	0 	0 	0 	0 	0 	5
rater1_domain1 (max):	6 	6 	3 	3 	4 	4 	12 	30

rater2_domain1 (count):	1783 	1800 	1726 	1771 	1805 	1800 	1569 	723
rater2_domain1 (mean):	4.27 	3.44 	1.7 	1.32 	2.22 	2.55 	8.04 	18.56
rater2_domain1 (min):	1 	1 	0 	0 	0 	0 	0 	5
rater2_domain1 (max):	6 	6 	3 	3 	4 	4 	12 	30

rater3_domain1 (count):	0 	0 	0 	0 	0 	0 	0 	128
rater3_domain1 (mean):	None 	None 	None 	None 	None 	None 	None 	37.83
rater3_domain1 (min):	None 	None 	None 	None 	None 	None 	None 	20
rater3_domain1 (max):	None 	None 	None 	None 	None 	None 	None 	50

domain1_score (count):	1783 	1800 	1726 	1771 	1805 	1800 	1569 	723
domain1_score (mean):	8.53 	3.42 	1.85 	1.43 	2.41 	2.72 	16.06 	36.95
domain1_scor

### Sample Essays:

In [12]:
def show_essays(nsample=1, exclude=None):
    """Print ``nsample`` randomly sampled training essays for each essay set.

    Reads the module-level ``essays_train`` frame. Sets whose id appears in
    ``exclude`` are skipped; for every other set a banner is printed, followed
    by each sampled essay's ``domain1_score`` and text and a divider line.
    """
    skipped_sets = exclude if exclude else []
    banner = '---------------------------------- (Set {t}) --------------------------------'
    divider = "---------------------------------------------------------------------------"
    for _, set_rows in essays_train.groupby('essay_set'):
        # Renumber from 0 so the first row of the group can be addressed as [0].
        set_rows = set_rows.reset_index()
        if set_rows['essay_set'][0] in skipped_sets:
            continue
        print(banner.format(t=set_rows['essay_set'][0]))
        picked = set_rows.sample(n=nsample)
        for _, essay_row in picked.iterrows():
            score = essay_row['domain1_score']
            print('\n(Score: {s})  {r}\n'.format(s=score, r=essay_row['essay']))
            print(divider)

show_essays()

---------------------------------- (Set 1) --------------------------------

(Score: 8)  Guess what! Do you like to use computers well I do and I think it benfits society for a couple of reasons. It is good to use to get information on things, to meet people and just to expose the internet and if you keep on reading @CAPS1 ill tell you why think that. One reason I think it benfits society because how in this lifetime you need to know what is going on in the world. So instead of you always watching the news or reading newspaper. You could go on the internet and see what is going on from the internet. Thats why I think it benfits society today. A second reason is haveing a computers poeple get to talk over the web and meet new poeple. Being able to talk online to people like faimly and your friends. That you havent seen or heard from in a long time. And you don't have thier number you will tot alk to them during they are on the internet. A third reason is haveing a computer and is always

### Summary Statistics

In [55]:
# Vocabulary and length statistics over the training essays: all sets combined
# first, then each essay set on its own.
set_labels = ['all', *essays_train['essay_set'].unique()]
for dataset in set_labels:
    if dataset == 'all':
        data = essays_train
    else:
        data = essays_train[essays_train['essay_set'] == dataset]

    # Tokens are collected exactly as written: nothing here lowercases the
    # text, so the counts below are case-sensitive.
    word_tokens = set()  # distinct whitespace-delimited tokens
    char_tokens = set()  # distinct characters
    sum_rev_len = 0      # total number of characters over all essays
    for text in data['essay']:
        sum_rev_len += len(text)
        word_tokens.update(re.split(r'\s{1,}', text))
        char_tokens.update(text)

    essay_count = len(data)
    print("------------- (Set {s}) --------------".format(s=dataset))
    print("# Essays:  ", essay_count)
    print("# Unique words:  ", len(word_tokens))
    print("# Unique characters:  ", len(char_tokens))
    print("Average Essay Length:  ", sum_rev_len / essay_count, "chars")
    print()

------------- (Set all) --------------
# Essays:   12978
# Unique words:   78395
# Unique characters:   101
Average Essay Length:   1215.6346894744952 chars

------------- (Set 1) --------------
# Essays:   1783
# Unique words:   27282
# Unique characters:   91
Average Essay Length:   2029.3337072349973 chars

------------- (Set 2) --------------
# Essays:   1800
# Unique words:   24810
# Unique characters:   86
Average Essay Length:   2097.027777777778 chars

------------- (Set 3) --------------
# Essays:   1726
# Unique words:   11074
# Unique characters:   80
Average Essay Length:   586.9038238702202 chars

------------- (Set 4) --------------
# Essays:   1772
# Unique words:   8339
# Unique characters:   84
Average Essay Length:   509.47968397291197 chars

------------- (Set 5) --------------
# Essays:   1805
# Unique words:   9118
# Unique characters:   82
Average Essay Length:   685.3468144044322 chars

------------- (Set 6) --------------
# Essays:   1800
# Unique words:   9729
