If you work locally, it might be a good idea to create symbolic link to your data directory:

```ln -s /your/data/directory data```

This will create a link named data in the current dir and you can use the data as it was in the directory `data`

In [1]:
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType
import pandas as pd

In [2]:
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
sql_context = SQLContext(sc)

In [3]:
DATA_DIR = './data/'

`index.txt` describes format of the data. It is tab delimited, each row is a separate review entry.
We define the schema of the data to facilitate reading.

__TODO__ convert date to a suitable format

__TODO__ convert vine and verified purchase to bool Type

__TODO__ category can be categorical value?

In [12]:
def read_tsv_to_pyspark_DF(filename):
    schema = StructType([
        StructField('marketplace', StringType(), True), #2 letter country code
        StructField('customer_id', IntegerType(), True), #author identifier
        StructField('review_id', StringType(), True), #unique review ID
        StructField('product_id', StringType(), True), # unique product ID
        StructField('product_parent', IntegerType(), True), # product identifier to be used to aggregate reviews for a product
        StructField('product_title', StringType(), True),
        StructField('product_category', StringType(), True),
        StructField('star_rating', IntegerType(), True), # 1-5 star rating 
        StructField('helpful_votes', IntegerType(), True), # positive votes for the review
        StructField('total_votes', IntegerType(), True), # total votes for the review
        StructField('vine', StringType(), True), # review is part of Vine Program
        StructField('verfied_purchase', StringType(), True), # Review is on Verified Purchase
        StructField('review_headline', StringType(), True), # title of the review
        StructField('review_body', StringType(), True), # text
        StructField('review_date', DateType(), True)]) # date of review 

    return sql_context.read.option('sep', '\t').csv(filename, schema=schema, header=True)

In [13]:
import numpy as np
def read_tsv_top_pandas(filename):
    columns = ['marketplace', 'customer_id', 'review_id', 'product_id', 'product_parent', 'product_title',
          'product_category', 'star_rating', 'helpful_votes', 'total_votes', 'review_headline']
    dtypes = {'star_rating': np.uint8 }
    df = pd.read_csv(filename, delimiter='\t', error_bad_lines=False, usecols=columns)
    df['star_rating'] = pd.to_numeric(df['star_rating'],errors='coerce')  # 'coerce' results in NaN for entries that can't be converted
    df['helpful_votes'] = pd.to_numeric(df['star_rating'],errors='coerce')
    df['total_votes'] = pd.to_numeric(df['star_rating'],errors='coerce')

    return df.dropna()

def read_tsv_reviews_to_pandas(filename):
    return pd.read_csv(filename, delimiter='\t', error_bad_lines=False, usecols=['review_body'] )

In [14]:
sample_df = read_tsv_to_pyspark_DF('sample_us.tsv')
sample_df.printSchema()

root
 |-- marketplace: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- review_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_parent: integer (nullable = true)
 |-- product_title: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- star_rating: integer (nullable = true)
 |-- helpful_votes: integer (nullable = true)
 |-- total_votes: integer (nullable = true)
 |-- vine: string (nullable = true)
 |-- verfied_purchase: string (nullable = true)
 |-- review_headline: string (nullable = true)
 |-- review_body: string (nullable = true)
 |-- review_date: date (nullable = true)



In [15]:
sample_pd = read_tsv_top_pandas('sample_us.tsv')
sample_pd.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,review_headline
0,US,18778586,RDIJS7QYB6XNR,B00EDBY7X8,122952789,Monopoly Junior Board Game,Toys,5,5,5,Five Stars
1,US,24769659,R36ED1U38IELG8,B00D7JFOPC,952062646,56 Pieces of Wooden Train Track Compatible wit...,Toys,5,5,5,Good quality track at excellent price
2,US,44331596,R1UE3RPRGCOLD,B002LHA74O,818126353,Super Jumbo Playing Cards by S&S Worldwide,Toys,2,2,2,Two Stars
3,US,23310293,R298788GS6I901,B00ARPLCGY,261944918,Barbie Doll and Fashions Barbie Gift Set,Toys,5,5,5,my daughter loved it and i liked the price and...
4,US,38745832,RNX4EXOBBPN5,B00UZOPOFW,717410439,Emazing Lights eLite Flow Glow Sticks - Spinni...,Toys,1,1,1,DONT BUY THESE!


In [16]:
import pyarrow
sample_pd.to_parquet('df.parquet.gzip', compression='gzip')

In [17]:
df_books = read_tsv_to_pyspark_DF(DATA_DIR+'amazon_reviews_us_Books_v1_00.tsv.gz')
df_books1 = read_tsv_to_pyspark_DF(DATA_DIR+'amazon_reviews_us_Books_v1_01.tsv.gz')
df_books2 = read_tsv_to_pyspark_DF(DATA_DIR+'amazon_reviews_us_Books_v1_02.tsv.gz')

In [9]:
df_books.count()

10319090

In [None]:
df_books.filter("product_category == 'Books'").count()

In [35]:
columns = ['customer_id', 'review_id', 'product_parent',
          'product_category', 'star_rating', 'helpful_votes', 'total_votes', 'review_date']
df_filtered = df_books.select(columns).dropna()
df_filtered1 = df_books.select(columns).dropna()
df_filtered2 = df_books.select(columns).dropna()

df = df_filtered2.union(df_filtered1.union(df_filtered))

In [36]:
df.show(10)

+-----------+--------------+--------------+----------------+-----------+-------------+-----------+-----------+
|customer_id|     review_id|product_parent|product_category|star_rating|helpful_votes|total_votes|review_date|
+-----------+--------------+--------------+----------------+-----------+-------------+-----------+-----------+
|   25933450| RJOVP071AVAJO|      84656342|           Books|          5|            0|          0| 2015-08-31|
|    1801372|R1ORGBETCDW3AI|     729938122|           Books|          5|            0|          0| 2015-08-31|
|    5782091| R7TNRFQAOUTX5|     678139048|           Books|          5|            0|          0| 2015-08-31|
|   32715830|R2GANXKDIFZ6OI|     712432151|           Books|          5|            0|          0| 2015-08-31|
|   14005703|R2NYB6C3R8LVN6|     800572372|           Books|          5|            2|          2| 2015-08-31|
|   36205738|R13U5PBJI1H94K|     559876774|           Books|          2|            1|          1| 2015-08-31|
|

In [37]:
df.write.parquet('booksall.parquet.gzip', compression='gzip')

In [27]:
#df_filtered.write.parquet('books.parquet.gzip', compression='gzip')
df_filtered1.write.parquet('books1.parquet.gzip', compression='gzip')
df_filtered2.write.parquet('books2.parquet.gzip', compression='gzip')

In [None]:
books_pd = read_tsv_top_pandas(DATA_DIR+'amazon_reviews_us_Books_v1_00.tsv.gz')

In [None]:
books_pd.head()

In [None]:
books_pd.count()

In [18]:
reviews_pd = read_tsv_reviews_to_pandas(DATA_DIR+'amazon_reviews_us_Books_v1_00.tsv.gz')

In [19]:
reviews_pd.count()

review_body    10236889
dtype: int64

In [22]:
import pyarrow
books_pd.to_parquet('books.parquet.gzip', compression='gzip')

dtype('O')

In [31]:
babys_pd = read_tsv_top_pandas(DATA_DIR+'amazon_reviews_us_Baby_v1_00.tsv.gz')
babys_pd.head()

  if (yield from self.run_code(code, result)):


Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,review_headline
0,US,9970739,R8EWA1OFT84NX,B00GSP5D94,329991347,Summer Infant SwaddleMe Adjustable Infant Wrap...,Baby,5.0,5.0,5.0,Great swaddled blankets
1,US,23538442,R2JWY4YRQD4FOP,B00YYDDZGU,646108902,Pacifier Clip Girl (3 Pack) Ziggy Baby 2-Sided...,Baby,5.0,5.0,5.0,Too cute and really nice
2,US,8273344,RL5ESX231LZ0B,B00BUBNZC8,642922361,Udder Covers - Breast Feeding Nursing Cover,Baby,5.0,5.0,5.0,Five Stars
3,US,24557753,RRMS9ZWJ2KD08,B00AWLZFTS,494272733,Gerber Graduates Fun Pack Utensils,Baby,5.0,5.0,5.0,Cute; wash up nicely in dishwasher.
4,US,46263340,R14I3ZG5E6S7YM,B00KM60D3Q,305813185,Summer Infant Ultra Sight Pan/Scan/Zoom Video ...,Baby,5.0,5.0,5.0,Love it!


In [4]:
import pyarrow
babys_pd.to_parquet('babys.parquet.gzip', compression='gzip')

NameError: name 'babys_pd' is not defined

In [29]:
books_pd_read = pd.read_parquet('books.parquet.gzip')

In [30]:
books_pd_read.head()

Unnamed: 0,customer_id,review_id,product_parent,product_category,star_rating,helpful_votes,total_votes,review_date
0,25933450,RJOVP071AVAJO,84656342,Books,5,0,0,2015-08-31
1,1801372,R1ORGBETCDW3AI,729938122,Books,5,0,0,2015-08-31
2,5782091,R7TNRFQAOUTX5,678139048,Books,5,0,0,2015-08-31
3,32715830,R2GANXKDIFZ6OI,712432151,Books,5,0,0,2015-08-31
4,14005703,R2NYB6C3R8LVN6,800572372,Books,5,2,2,2015-08-31


In [31]:
books_pd_read.dtypes

customer_id                  int32
review_id                   object
product_parent               int32
product_category            object
star_rating                  int32
helpful_votes                int32
total_votes                  int32
review_date         datetime64[ns]
dtype: object

In [28]:
del(books_pd_read)

In [32]:
books_pd_read1 = pd.read_parquet('books1.parquet.gzip')

In [5]:
del(books_pd_read1)

In [33]:
books_pd_read2 = pd.read_parquet('books2.parquet.gzip')

In [34]:
books_pd_read2.count()

customer_id         10316802
review_id           10316802
product_parent      10316802
product_category    10316802
star_rating         10316802
helpful_votes       10316802
total_votes         10316802
review_date         10316802
dtype: int64

In [8]:
books_pd_read2.index = books_pd_read2['product_id']

In [9]:
books_pd_read2.index.is_unique

False

In [5]:
books_all = pd.read_parquet('booksall.parquet.gzip')

In [6]:
books_all.count()

customer_id         30950406
review_id           30950406
product_parent      30950406
product_category    30950406
star_rating         30950406
helpful_votes       30950406
total_votes         30950406
review_date         30950406
dtype: int64