# Apache Pig

In [4]:
from nose.tools import assert_equal, assert_almost_equal

## Raw Data Preview

In [5]:
!head -5 $HOME/book-crossing/BX-Book-Ratings.csv

"User-ID";"ISBN";"Book-Rating"
"276725";"034545104X";"0"
"276726";"0155061224";"5"
"276727";"0446520802";"0"
"276729";"052165615X";"3"


In [6]:
!head -5 $HOME/book-crossing/BX-Books.csv

"ISBN";"Book-Title";"Book-Author";"Year-Of-Publication";"Publisher";"Image-URL-S";"Image-URL-M";"Image-URL-L"
"0195153448";"Classical Mythology";"Mark P. O. Morford";"2002";"Oxford University Press";"http://images.amazon.com/images/P/0195153448.01.THUMBZZZ.jpg";"http://images.amazon.com/images/P/0195153448.01.MZZZZZZZ.jpg";"http://images.amazon.com/images/P/0195153448.01.LZZZZZZZ.jpg"
"0002005018";"Clara Callan";"Richard Bruce Wright";"2001";"HarperFlamingo Canada";"http://images.amazon.com/images/P/0002005018.01.THUMBZZZ.jpg";"http://images.amazon.com/images/P/0002005018.01.MZZZZZZZ.jpg";"http://images.amazon.com/images/P/0002005018.01.LZZZZZZZ.jpg"
"0060973129";"Decision in Normandy";"Carlo D'Este";"1991";"HarperPerennial";"http://images.amazon.com/images/P/0060973129.01.THUMBZZZ.jpg";"http://images.amazon.com/images/P/0060973129.01.MZZZZZZZ.jpg";"http://images.amazon.com/images/P/0060973129.01.LZZZZZZZ.jpg"
"0374157065";"Flu: The Story of the Great Influenza Pandemic of 1918

## Data Preprocessing

In [7]:
%%bash

sed 's/"//g' $HOME/book-crossing/BX-Book-Ratings.csv | sed '1d' > ratings.csv
sed 's/"//g' $HOME/book-crossing/BX-Books.csv | cut -d';' -f -4 | sed '1d' > books.csv

echo
echo '***** Ratings File *****'
head ratings.csv

echo
echo '***** Books File *****'
head books.csv


***** Ratings File *****
276725;034545104X;0
276726;0155061224;5
276727;0446520802;0
276729;052165615X;3
276729;0521795028;6
276733;2080674722;0
276736;3257224281;8
276737;0600570967;6
276744;038550120X;7
276745;342310538;10

***** Books File *****
0195153448;Classical Mythology;Mark P. O. Morford;2002
0002005018;Clara Callan;Richard Bruce Wright;2001
0060973129;Decision in Normandy;Carlo D'Este;1991
0374157065;Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It;Gina Bari Kolata;1999
0393045218;The Mummies of Urumchi;E. J. W. Barber;1999
0399135782;The Kitchen God's Wife;Amy Tan;1991
0425176428;What If?: The World's Foremost Military Historians Imagine What Might Have Been;Robert Cowley;2000
0671870432;PLEADING GUILTY;Scott Turow;1993
0679425608;Under the Black Flag: The Romance and the Reality of Life Among the Pirates;David Cordingly;1996
074322678X;Where You'll Find Me: And Other Stories;Ann Beattie;2002


sed: RE error: illegal byte sequence
sed: RE error: illegal byte sequence


## Pig Latin: Average

In [78]:
%%writefile average.pig

ratings = LOAD 'ratings.csv' USING PigStorage(';') AS (UserID:int, ISBN:chararray, rating:int) ;
books = LOAD 'books.csv' USING PigStorage(';') AS (ISBN:chararray, title:chararray, author:chararray, year:int); 
groups = GROUP ratings BY ISBN ;
avg = FOREACH groups GENERATE group AS GISBN, AVG (ratings.rating) ;
results = JOIN avg by GISBN, books by ISBN ;
ordered_results = ORDER results BY title;
Top_Results = LIMIT ordered_results 10 ;
DUMP Top_Results;

Overwriting average.pig


In [79]:
average_ratings = !pig -x local -f average.pig 2> pig_stderr.log
print('\n'.join(average_ratings))

(1558746218,0.0,1558746218,A Second Chicken Soup for the Woman's Soul (Chicken Soup for the Soul Series),Jack Canfield,1998)
(0345402871,2.0,0345402871,Airframe,Michael Crichton,1997)
(0452264464,4.3076923076923075,0452264464,Beloved (Plume Contemporary Fiction),Toni Morrison,1994)
(0425163091,0.0,0425163091,Chocolate Jesus,Stephan Jaramillo,1998)
(0002005018,4.333333333333333,0002005018,Clara Callan,Richard Bruce Wright,2001)
(0195153448,0.0,0195153448,Classical Mythology,Mark P. O. Morford,2002)
(0060973129,4.0,0060973129,Decision in Normandy,Carlo D'Este,1991)
(0689821166,6.0,0689821166,Flood : Mississippi 1927,Kathleen Duey,1998)
(0374157065,0.0,0374157065,Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It,Gina Bari Kolata,1999)
(1881320189,7.0,1881320189,Goodbye to the Buttermilk Sky,Julia Oliver,1994)


## Cleanup

In [2]:
%%bash
# Remove pig log files
rm -f pig*.log

# Remove our pig scripts
rm -f *.pig

# Remove csv files
rm books.csv ratings.csv