In [38]:
## Imports
import pandas as pd
import numpy as np
import time
import os
import json
## Notebook display config: render up to 50 columns when a DataFrame is shown
pd.set_option("display.max_columns", 50)

In [39]:
## Function from Github user Nick-Morgan
def json_to_csv(directory, fileNames, createSample=False):
    """
    json_to_csv: converts each listed JSON file to a csv file in the same directory.

                 Each input file is read line by line and every non-blank line is parsed
                 as one JSON document. pandas' read_json with default arguments returns a
                 'Trailing data error' on these files, which is why they are parsed manually.

                 Optionally also writes '<name>_sample.csv' holding roughly 10% of the rows.
                 The sample is drawn after calling np.random.seed(9001), so it is
                 reproducible (note that this reseeds numpy's global random state).

    Inputs: -directory: folder containing the JSON files ('~' is expanded); the csv files
             are written to this same folder
            -fileNames: list of JSON filenames inside that folder
            -createSample: when True, also write the ~10% sample csv for every file

    Returns: None. Progress and the total run time are printed.

    Raises: OSError if a file cannot be opened or written;
            json.JSONDecodeError if a non-blank line is not valid JSON.
    """

    directory = os.path.expanduser(directory)
    start = time.time()

    for fileName in fileNames:
        # Start from an empty list for every file. Sharing one list across the loop made
        # each csv after the first also contain all rows of the previously processed files.
        jsonData = []

        with open(os.path.join(directory, fileName), encoding="utf8") as file:
            print('{0} opened'.format(fileName))
            for line in file:
                # Some of the files have trailing blank spaces; fully blank lines are
                # skipped because json.loads('') would raise.
                line = line.strip()
                if not line:
                    continue
                jsonData.append(json.loads(line))

        df = pd.DataFrame(jsonData)

        # splitext drops whatever extension is present instead of assuming a 5-character '.json'
        baseName = os.path.splitext(fileName)[0]

        csvFileName = baseName + '.csv'
        df.to_csv(os.path.join(directory, csvFileName))
        print('{0} created'.format(csvFileName))

        if createSample:
            # Fixed seed -> the same rows are selected on every run for a given row count
            np.random.seed(9001)
            msk = np.random.rand(len(df)) <= 0.1
            sample = df[msk]

            csvSampleFileName = baseName + '_sample.csv'

            sample.to_csv(os.path.join(directory, csvSampleFileName))
            print('{0} created'.format(csvSampleFileName))

    print('This function took {} minutes to run'.format((time.time()-start)/60))


In [40]:
# Folder that holds the Yelp JSON files; the csv files are written next to them.
# NOTE(review): this is an absolute, machine-specific path - adjust it for your environment.
path = '/Users/mmoesta/Documents/DA_Training/yelp/dataset/'
filenames = ['business.json','checkin.json','photos.json','review.json','tip.json','user.json']
# Pass `path` defined above. The call previously used `full_path`, which is not defined
# anywhere in this notebook and raises NameError on a fresh kernel (Restart & Run All).
json_to_csv(path, filenames, createSample=False)

business.json opened
business.csv created
checkin.json opened
checkin.csv created
photos.json opened
photos.csv created
review.json opened
review.csv created
tip.json opened
tip.csv created
user.json opened
user.csv created
This function took 18.86998119354248 minutes to run
