# Data

Reads data downloaded from the [UCSD Book Graph](https://sites.google.com/eng.ucsd.edu/ucsdbookgraph/home), parses it into smaller csv files, and combines the csv files into a single dataframe.

In [1]:
import json, csv, glob, gzip, os, math, joblib
import pandas as pd

# Each directory can be overridden with an environment variable so the notebook
# can run on another machine without editing this cell. When the variable is
# not set, the original location is used.

# directory where data is stored
load_DIR = os.environ.get('INSIGHT_LOAD_DIR', '/media/mthommes/MThommes/Insight/')
# directory where chunks will be stored
chunks_DIR = os.environ.get('INSIGHT_CHUNKS_DIR', '/home/mthommes/Documents/insight/reviews/')
# directory where data will be saved
save_DIR = os.environ.get('INSIGHT_SAVE_DIR', '/home/mthommes/Documents/GitHub/insight/data/')

# Functions

**count_data:** Count the number of lines in a json.gz file

**make_map:** Create a map between \*\_id\_csv and \*\_id, where \* can be book or user.

**parse_data:** Parse a json.gz file into smaller csv files

**csv_to_df:** Convert all csv files in a folder into a single dataframe

In [2]:
def count_data(file_name):
    """
    count_data tallies the lines contained in a json.gz file
    INPUT
    file_name: full path of the gzip-compressed file to read (str)
    OUTPUT
    count: total number of lines in the file (int)
    """
    # iterate line by line instead of reading the whole file at once
    with gzip.open(file_name) as gz_file:
        count = sum(1 for _ in gz_file)
    return count

def make_map(file_name, dtype):
    """
    make_map builds a lookup dict from a two-column csv file
    INPUTS
    file_name: full path of file to read (str)
    dtype: data type of each column, keyed by column name (dict)
    OUTPUT
    mapping: second-column value -> first-column value (dict)
             (presumably *_id -> *_id_csv; confirm against the column
             order of the map file)
    """
    id_table = pd.read_csv(file_name, skipinitialspace=True, dtype=dtype)
    # every row unpacks as (first column, second column); the second column is the key
    mapping = {second: first for first, second in id_table.values}
    return mapping

def parse_data(file_name, chunks_DIR, user_map, book_map, chunkSize=100):
    """
    parse_data splits a line-delimited json file into csv files of chunkSize lines each
    INPUTS
    file_name: full path of file to read (str)
    chunks_DIR: full path to save chunks; created if it does not exist (str)
    user_map: map to rename user_ids (dict)
    book_map: map to rename book_ids (dict)
    chunkSize: number of lines to read per chunk (int)
    OUTPUT
    None; writes chunk_0.csv, chunk_1.csv, ... into chunks_DIR
    """
    # make sure the target folder exists before the first chunk is written
    # (an empty chunks_DIR means the current directory, which needs no creation)
    if chunks_DIR:
        os.makedirs(chunks_DIR, exist_ok=True)
    chunks = pd.read_json(file_name, lines=True, chunksize=chunkSize)
    for i, c in enumerate(chunks):
        print('...chunk',i)
        # replace the ids through the maps; with a plain dict, ids that are
        # missing from the map become NaN
        c['user_id'] = c['user_id'].map(user_map)
        c['book_id'] = c['book_id'].map(book_map)
        # format only the file name, so braces in chunks_DIR cannot break the path
        c.to_csv(os.path.join(chunks_DIR, 'chunk_{}.csv'.format(i)))

def csv_to_df(chunks_DIR, col_names, n_files=1):
    """
    csv_to_df converts csv files to a single DataFrame
    INPUTS
    chunks_DIR: full path to chunk files (str)
    col_names: column names to keep from each file (list)
    n_files: number of chunk files expected in chunks_DIR (int)
    OUTPUT
    df: rows of every chunk file in ascending chunk order, with the chunk
        number stored as a string in an extra 'Chunk' column; the index is
        not reset, so index values repeat across chunks (DataFrame)
    """
    def chunk_label(path):
        # 'some/dir/chunk_12.csv' -> '12'
        return os.path.basename(path).split('.')[0].split('_')[1]

    def chunk_order(path):
        # numeric labels sort by value (2 before 10); anything else goes last
        label = chunk_label(path)
        try:
            return (0, int(label), '')
        except ValueError:
            return (1, 0, label)

    # glob returns matches in arbitrary order, so sort them to make the row
    # order of the result reproducible
    files = sorted(glob.glob(os.path.join(chunks_DIR, 'chunk_*.csv')), key=chunk_order)
    # a mismatch is only reported; the files that were found are still loaded
    if len(files) != n_files:
        print('Error:', len(files), 'out of', n_files, 'found')
    else:
        print('All files found')
    # read each file and tag its rows with the chunk number it came from
    df = pd.concat([pd.read_csv(f, usecols=col_names).assign(Chunk=chunk_label(f)) for f in files])
    return df

# Maps

The user_ids and book_ids need to be converted to numbers instead of strings, because numbers take less memory to store.

In [3]:
# user ids: build the lookup from the user id map file
file_name = 'user_id_map.csv'
user_map = make_map(file_name=os.path.join(load_DIR, file_name),
                    dtype=dict(user_id_csv=int, user_id=str))

# book ids: same procedure with the book id map file
file_name = 'book_id_map.csv'
book_map = make_map(file_name=os.path.join(load_DIR, file_name),
                    dtype=dict(book_id_csv=int, book_id=int))

# Reviews

## Number of Data Points

There are ~15.7M reviews

In [4]:
load_name = 'goodreads_reviews_dedup.json.gz'

# count every line of the compressed reviews file
n_lines = count_data(file_name=os.path.join(load_DIR, load_name))
print('There are {} lines'.format(n_lines))

There are 15739967 lines


## Parse Data into CSV

Because the file is very large (millions of lines of JSON strings), it is parsed into smaller csv files of `chunkSize` lines each.

In [5]:
# number of json lines written to each csv file
chunkSize = 100000

# round up: a final, partially filled file holds the remaining lines
n_files = math.ceil(n_lines / chunkSize)

print('Parsing into {} files...'.format(n_files))
parse_data(file_name=os.path.join(load_DIR, load_name),
           chunks_DIR=chunks_DIR,
           user_map=user_map,
           book_map=book_map,
           chunkSize=chunkSize)

Parsing into 158 files...
...chunk 0
...chunk 1
...chunk 2
...chunk 3
...chunk 4
...chunk 5
...chunk 6
...chunk 7
...chunk 8
...chunk 9
...chunk 10
...chunk 11
...chunk 12
...chunk 13
...chunk 14
...chunk 15
...chunk 16
...chunk 17
...chunk 18
...chunk 19
...chunk 20
...chunk 21
...chunk 22
...chunk 23
...chunk 24
...chunk 25
...chunk 26
...chunk 27
...chunk 28
...chunk 29
...chunk 30
...chunk 31
...chunk 32
...chunk 33
...chunk 34
...chunk 35
...chunk 36
...chunk 37
...chunk 38
...chunk 39
...chunk 40
...chunk 41
...chunk 42
...chunk 43
...chunk 44
...chunk 45
...chunk 46
...chunk 47
...chunk 48
...chunk 49
...chunk 50
...chunk 51
...chunk 52
...chunk 53
...chunk 54
...chunk 55
...chunk 56
...chunk 57
...chunk 58
...chunk 59
...chunk 60
...chunk 61
...chunk 62
...chunk 63
...chunk 64
...chunk 65
...chunk 66
...chunk 67
...chunk 68
...chunk 69
...chunk 70
...chunk 71
...chunk 72
...chunk 73
...chunk 74
...chunk 75
...chunk 76
...chunk 77
...chunk 78
...chunk 79
...chunk 80
...chunk 81


## Convert Data to DataFrame

In [3]:
# columns to keep from every chunk file
col_names = ['user_id','book_id','rating','read_at','started_at']
# expected number of chunk files: ceil(15739967 lines / 100000 lines per file).
# Presumably hard-coded so this cell can run without repeating the count and
# parse steps - confirm before changing the chunk size.
n_files = 158
print('Converting data to dataframe...')
# gather every chunk csv into one DataFrame
df = csv_to_df(chunks_DIR, col_names, n_files)

Converting data to dataframe...
All files found


## Save DataFrame

In [9]:
save_name = 'reviews.gz'
print('Saving data...')
# serialize the combined DataFrame with joblib, using compress=3
joblib.dump(df, filename=os.path.join(save_DIR, save_name), compress=3)
print('...saved!')

Saving data...
...saved!
