## Exploratory data analysis and basic data transformation - yelp 

### This workbook aims to illustrate the process in: 
- Extract part of the data from the source file 
- Load the data into a pandas dataframe
- Transform and process the data 
- Save the outfile as an csv

In [1]:
# Import the packages 
import numpy as np
import pandas as pd
import os
import json
from datetime import datetime

In [2]:
# Print the available dataset 
# Read the data locally
filepath = 'D:\yelp_dataset'

arr = os.listdir(filepath)
print(arr)

['Dataset_User_Agreement.pdf', 'yelp_academic_dataset_business.json', 'yelp_academic_dataset_business_sample.json', 'yelp_academic_dataset_business_sample_output.csv', 'yelp_academic_dataset_checkin.json', 'yelp_academic_dataset_checkin_sample.json', 'yelp_academic_dataset_checkin_sample_output.csv', 'yelp_academic_dataset_covid_features.json', 'yelp_academic_dataset_review.json', 'yelp_academic_dataset_review_sample.json', 'yelp_academic_dataset_review_sample_output.csv', 'yelp_academic_dataset_tip.json', 'yelp_academic_dataset_tip_sample.json', 'yelp_academic_dataset_tip_sample_output.csv', 'yelp_academic_dataset_user.json', 'yelp_academic_dataset_user_sample.json', 'yelp_academic_dataset_user_sample_output.csv']


## Business data set 
Source file: yelp_academic_dataset_business.json

In [3]:
# Write the first 1000 rows of json data into a sample file 
with open(filepath + '\yelp_academic_dataset_business.json', 'r', encoding='utf-8') as json_in_file: 
    with open(filepath + '\yelp_academic_dataset_business_sample.json', 'w', encoding='utf-8') as json_out_file:
        for index, line in enumerate(json_in_file): 
            json_out_file.write(line)
            if index > 1000:
                break

In [4]:
# Load the json data into a list 
business_sample_data = []

with open(filepath + '\yelp_academic_dataset_business_sample.json', 'r', encoding='utf-8') as sample_file:
    for lines in sample_file: 
        business_sample_data.append(json.loads(lines))

In [5]:
# Read sample data
business_sample_data[20:25]

[{'business_id': 'DLbEQySMW3X7KvkybpEYkw',
  'name': 'Jasmine Nail Spa',
  'address': '4370 SE King Rd, Ste 125',
  'city': 'Portland',
  'state': 'OR',
  'postal_code': '97222',
  'latitude': 45.4475976,
  'longitude': -122.6174267,
  'stars': 3.0,
  'review_count': 7,
  'is_open': 0,
  'attributes': {'RestaurantsPriceRange2': '1', 'ByAppointmentOnly': 'False'},
  'categories': 'Nail Salons, Beauty & Spas',
  'hours': {'Monday': '9:30-18:0',
   'Tuesday': '9:30-18:0',
   'Wednesday': '9:30-18:0',
   'Thursday': '9:30-18:0',
   'Friday': '9:30-18:0',
   'Saturday': '9:30-16:0'}},
 {'business_id': 'AvT2mGkdYwU0ghqDfs-OlQ',
  'name': 'Tire Town Auto Service',
  'address': '2032 Hastings Street E',
  'city': 'Vancouver',
  'state': 'BC',
  'postal_code': 'V5L 1T8',
  'latitude': 49.2810588,
  'longitude': -123.0630193,
  'stars': 4.0,
  'review_count': 6,
  'is_open': 0,
  'attributes': None,
  'categories': 'Auto Repair, Automotive, Tires',
  'hours': {'Monday': '9:0-17:30',
   'Tuesday'

Since it is a list of nested dict, it is necessary to flatten the nested dict before loading it to pandas dataframe

In [6]:
# Flatten the nested json
def flatten_nested_dict(d):
    output = {}
    
    for k,v in d.items():
        if isinstance(v, dict):
            v = [v]
        if isinstance(v, list): 
            for subdict in v:
                deeper = flatten_nested_dict(subdict).items()
                output.update({k + '_' + key2: val2 for key2, val2 in deeper})
        else: 
            output[k] = v
    
    return output

business_data_list = []
for items in range(0, len(business_sample_data)): 
    flatten_business_data = flatten_nested_dict(business_sample_data[items])
    business_data_list.append(flatten_business_data)

In [7]:
business_data = pd.DataFrame(business_data_list)
business_data

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,attributes_GoodForDancing,attributes_BestNights,attributes_Music,attributes_BYOB,attributes_CoatCheck,attributes_Smoking,attributes_DriveThru,attributes_BYOBCorkage,attributes_Corkage,attributes_RestaurantsCounterService
0,6iYb2HFDywm3zjuRg0shjw,Oskar Blues Taproom,921 Pearl St,Boulder,CO,80302,40.017544,-105.283348,4.0,86,...,,,,,,,,,,
1,tCbdrRPZA0oiIYSmHG3J0w,Flying Elephants at PDX,7000 NE Airport Way,Portland,OR,97218,45.588906,-122.593331,4.0,126,...,,,,,,,,,,
2,bvN78flM8NLprQ1a1y5dRg,The Reclaimory,4720 Hawthorne Ave,Portland,OR,97214,45.511907,-122.613693,4.5,13,...,,,,,,,,,,
3,oaepsyvc0J17qwi8cfrOWg,Great Clips,2566 Enterprise Rd,Orange City,FL,32763,28.914482,-81.295979,3.0,8,...,,,,,,,,,,
4,PE9uqAjdw0E4-8mjGl3wVA,Crossfit Terminus,1046 Memorial Dr SE,Atlanta,GA,30316,33.747027,-84.353424,4.0,14,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
997,9irir2PpYr_yY5lNWYqUWA,Subaru North Orlando,4113 S Orlando Dr,Sanford,FL,32773,28.749161,-81.291087,4.0,32,...,,,,,,,,,,
998,2cG2YiR8Tg-YCJJfaHJtYw,Elements Massage - Medford,40 High St,Medford,MA,02155,42.418388,-71.111116,4.0,55,...,,,,,,,,,,
999,OhpdMOXon32xdJ0-OeL5-Q,Pirate's Cove,7417 NE Sandy Blvd,Portland,OR,97213,45.549482,-122.586522,3.5,19,...,,,,,,,,,,
1000,ivR_bKFNBaEiHfPYDPkFSg,Tony's Nails & Spa,2105 Lavista Rd 112,Atlanta,GA,30329,33.814607,-84.312023,3.5,64,...,,,,,,,,,,


To achiever the goal, we need the following columns: 
- business_id, 
- name, 
- address, 
- city, 
- state, 
- postal_code, 
- stars, 
- review_count
- is_open 
- categories

In [8]:
business_data = business_data[['business_id', 'name', 'address', 'city', 'state', 'postal_code', 'stars', 'review_count', 'is_open', 'categories']]
business_data

Unnamed: 0,business_id,name,address,city,state,postal_code,stars,review_count,is_open,categories
0,6iYb2HFDywm3zjuRg0shjw,Oskar Blues Taproom,921 Pearl St,Boulder,CO,80302,4.0,86,1,"Gastropubs, Food, Beer Gardens, Restaurants, B..."
1,tCbdrRPZA0oiIYSmHG3J0w,Flying Elephants at PDX,7000 NE Airport Way,Portland,OR,97218,4.0,126,1,"Salad, Soup, Sandwiches, Delis, Restaurants, C..."
2,bvN78flM8NLprQ1a1y5dRg,The Reclaimory,4720 Hawthorne Ave,Portland,OR,97214,4.5,13,1,"Antiques, Fashion, Used, Vintage & Consignment..."
3,oaepsyvc0J17qwi8cfrOWg,Great Clips,2566 Enterprise Rd,Orange City,FL,32763,3.0,8,1,"Beauty & Spas, Hair Salons"
4,PE9uqAjdw0E4-8mjGl3wVA,Crossfit Terminus,1046 Memorial Dr SE,Atlanta,GA,30316,4.0,14,1,"Gyms, Active Life, Interval Training Gyms, Fit..."
...,...,...,...,...,...,...,...,...,...,...
997,9irir2PpYr_yY5lNWYqUWA,Subaru North Orlando,4113 S Orlando Dr,Sanford,FL,32773,4.0,32,1,"Car Dealers, Auto Parts & Supplies, Auto Repai..."
998,2cG2YiR8Tg-YCJJfaHJtYw,Elements Massage - Medford,40 High St,Medford,MA,02155,4.0,55,1,"Health & Medical, Massage Therapy, Massage, Be..."
999,OhpdMOXon32xdJ0-OeL5-Q,Pirate's Cove,7417 NE Sandy Blvd,Portland,OR,97213,3.5,19,1,"Bars, Nightlife, Adult Entertainment"
1000,ivR_bKFNBaEiHfPYDPkFSg,Tony's Nails & Spa,2105 Lavista Rd 112,Atlanta,GA,30329,3.5,64,1,"Beauty & Spas, Nail Salons"


In [9]:
# Save the output into a csv file
business_data.to_csv(filepath + '\yelp_academic_dataset_business_sample_output.csv')

## Review data set

In [10]:
# Write the first 1000 rows of json data into a sample file 
with open(filepath + '\yelp_academic_dataset_review.json', 'r', encoding='utf-8') as json_in_file: 
    with open(filepath + '\yelp_academic_dataset_review_sample.json', 'w', encoding='utf-8') as json_out_file:
        for index, line in enumerate(json_in_file): 
            json_out_file.write(line)
            if index > 1000:
                break

In [11]:
review_data = pd.read_json(filepath + '\yelp_academic_dataset_review_sample.json', lines=True)
review_data

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,lWC-xP3rd6obsecCYsGZRg,ak0TdVmGKo4pwqdJSTLwWw,buF9druCkbuXLX526sGELQ,4,3,1,1,Apparently Prides Osteria had a rough summer a...,2014-10-11 03:34:02
1,8bFej1QE5LXp4O05qjGqXA,YoVfDbnISlW0f7abNQACIg,RA4V8pr014UyUbDvI-LW2A,4,1,0,0,This store is pretty good. Not as great as Wal...,2015-07-03 20:38:25
2,NDhkzczKjLshODbqDoNLSg,eC5evKn1TWDyHCyQAwguUw,_sS2LBIGNT5NQb6PD1Vtjw,5,0,0,0,I called WVM on the recommendation of a couple...,2013-05-28 20:38:06
3,T5fAqjjFooT4V0OeZyuk1w,SFQ1jcnGguO0LYWnbbftAA,0AzLzHfOJgL7ROwhdww2ew,2,1,1,1,I've stayed at many Marriott and Renaissance M...,2010-01-08 02:29:15
4,sjm_uUcQVxab_EeLCqsYLg,0kA0PAJ8QFMeveQWHFqz2A,8zehGz9jnxPqXtOc7KaJxA,4,0,0,0,The food is always great here. The service fro...,2011-07-28 18:05:01
...,...,...,...,...,...,...,...,...,...
997,_RDqlhTily1mSmltqJAp0A,LK3NnWGLcnxU2i8PCHFaTA,VkV9tfARXGzUjqmRdze_WA,5,4,0,1,Went here for a birthday celebration. Birthda...,2010-03-12 18:41:22
998,EKbOCFXKiR_APXI2mXEsmw,mcPMdBzh8EJWATLvHp5H0A,_rt-Z934kfFzgG19nTrIcQ,5,0,0,0,Great drinks at the tequila bar! Cierra recomm...,2016-05-07 02:12:44
999,zagjXvZl-lsIOEwibvJnzQ,2Th9kCvo3yydQuHNgayDBw,eAPY_lYEo5FShEaeoXtQKA,5,1,0,0,We come here about once a week. Pizza is delic...,2016-08-25 15:40:15
1000,OzDCTMse4NnFJT57N2jq4w,gJNGt9dusI9rocfFXYrMSg,yYLNSTZB_AE0zTZAlGmdUw,5,1,0,0,Very professional and passionate about their w...,2016-10-26 15:05:12


In this table, we will need all the information, so we can export the result with all columns

In [12]:
# Save the output into a csv file
review_data.to_csv(filepath + '\yelp_academic_dataset_review_sample_output.csv')

## Tip data set

In [13]:
# Write the first 1000 rows of json data into a sample file 
with open(filepath + '\yelp_academic_dataset_tip.json', 'r', encoding='utf-8') as json_in_file: 
    with open(filepath + '\yelp_academic_dataset_tip_sample.json', 'w', encoding='utf-8') as json_out_file:
        for index, line in enumerate(json_in_file): 
            json_out_file.write(line)
            if index > 1000:
                break

In [14]:
tip_data = pd.read_json(filepath + '\yelp_academic_dataset_tip_sample.json', lines=True)
tip_data

Unnamed: 0,user_id,business_id,text,date,compliment_count
0,WCjg0jdHXMlwbqS9tZUx8Q,ENwBByjpoa5Gg7tKgxqwLg,Carne asada chips...,2011-07-22 19:07:35,0
1,42-Z02y9bABShAGZhuSzrQ,jKO4Og6ucdX2-YCTKQVYjg,Best happy hour from 3pm to 6pm! $1 off martin...,2014-09-10 07:33:29,0
2,5u7E3LYp_3eB8dLuUBazXQ,9Bto7mky640ocgezVKSfVg,"Nice people, skilled staff, clean location - b...",2013-12-13 23:23:41,0
3,wDWoMG5N9oI4DJ-p7z8EBg,XWFjKtRGZ9khRGtGg2ZvaA,"1/2-price bowling & the ""Very"" Old Fashion are...",2017-07-11 23:07:16,0
4,JmuFlorjjRshHTKzTwNtgg,mkrx0VhSMU3p3uhyJGCoWA,"Solid gold's. Great sauna. Great staff, too. E...",2016-11-30 08:46:36,0
...,...,...,...,...,...
997,bFT7y7vTG1z63IyA7fsWsg,VznOFJQbZ9PZKrzdXkkiiA,instant seating great beer lovely shade on the...,2015-06-06 20:19:37,0
998,UKHa4BDvcIvdVoiJsYYbnw,hwfF4tuCVBlFXY-K1pjeDA,Great cuisine!! I was here for brunch!!! P.S: ...,2017-10-19 20:52:09,0
999,HONe4kg3qn2AxNlABqsvCA,1OweB1FS_Q21b7IyqewGGQ,Their shrimp salad sandwich on challah bread i...,2011-10-02 04:02:06,0
1000,I6B-COW6X7EATIQBH1gYLg,8HwwGSHlCEGbJiMDx_fkHg,Noodles were prefect,2013-08-24 03:43:21,0


In this table, we will need all the information, so we can export the result with all columns

In [15]:
tip_data.to_csv(filepath + '\yelp_academic_dataset_tip_sample_output.csv')

## User data set

In [16]:
# Write the first 1000 rows of json data into a sample file 
with open(filepath + '\yelp_academic_dataset_user.json', 'r', encoding='utf-8') as json_in_file: 
    with open(filepath + '\yelp_academic_dataset_user_sample.json', 'w', encoding='utf-8') as json_out_file:
        for index, line in enumerate(json_in_file): 
            json_out_file.write(line)
            if index > 1000:
                break

In [17]:
user_data = pd.read_json(filepath + '\yelp_academic_dataset_user_sample.json', lines=True)
user_data

Unnamed: 0,user_id,name,review_count,yelping_since,useful,funny,cool,elite,friends,fans,...,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
0,q_QQ5kBBwlCcbL1s4NVK3g,Jane,1220,2005-03-14 20:26:35,15038,10030,11291,200620072008200920102011201220132014,"xBDpTUbai0DXrvxCe3X16Q, 7GPNBO496aecrjJfW6UWtg...",1357,...,163,190,361,147,1212,5691,2541,2541,815,323
1,dIIKEfOgo0KqUfGQvGikPg,Gabi,2136,2007-08-10 19:01:51,21272,10289,18046,"2007,2008,2009,2010,2011,2012,2013,2014,2015,2...","XPzYf9_mwG2eXYP2BAGSTA, 2LooM5dcIk2o01nftYdPIg...",1025,...,87,94,232,96,1187,3293,2205,2205,472,294
2,D6ErcUnFALnCQN4b1W_TlA,Jason,119,2007-02-07 15:47:53,188,128,130,20102011,"GfB6sC4NJQvSI2ewbQrDNA, jhZtzZNNZJOU2YSZ6jPlXQ...",16,...,1,3,0,0,5,20,31,31,3,1
3,JnPIjvC0cmooNDfsa9BmXg,Kat,987,2009-02-09 16:14:29,7234,4722,4035,200920102011201220132014,"HQZPQhKMwRAyS6BCselVWQ, kP2U1s_sjQfHO9grxiyDTA...",420,...,129,93,219,90,1120,4510,1566,1566,391,326
4,37Hc8hr3cw0iHLoPzLK6Ow,Christine,495,2008-03-03 04:57:05,1577,727,1124,200920102011,"-Q88pZUcrfN0BLBDp-bkAQ, etPn4Pv1Gc4cRZjRgB_BOw...",47,...,19,32,16,15,77,131,310,310,98,44
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
997,VvmEMZxJk3zp9lyGDkjxRw,Nikki,229,2005-08-22 21:21:50,565,271,401,20072008,"sZUC9GbwvDHWrHl0D_d2dA, 89yr7wNIk8TE03yKfTRLuQ...",20,...,4,7,6,4,17,29,62,62,17,2
998,ZNJiYLz_6udX3ljXPZ9FJg,Olma,402,2006-06-24 05:14:10,1057,560,916,20062007200820092010,"hGenPmSp8JEg2w5pelLdKg, 0XXIv9a0LWiaCjAkurjMMQ...",35,...,7,11,15,3,35,168,199,199,43,16
999,ZH-OEKC-dOGQszEnXLpXnw,Dave,109,2009-08-27 12:42:43,554,550,318,201120122013,"P2zPtRoac-_ltqqTlPXM0Q, eLOliEa71F4o0P39lWuk9g...",12,...,2,3,2,0,29,51,76,76,23,7
1000,U3PbDEgVRJQrryCK_uiyVw,Kana,57,2009-04-16 02:06:47,71,24,37,,"i1TJRbN1HTJpv88SzN8vqA, P6VJObgsgQLHJWBz7XUwWQ...",7,...,0,0,1,0,1,5,2,2,0,2


In [18]:
print(user_data.columns) 

Index(['user_id', 'name', 'review_count', 'yelping_since', 'useful', 'funny',
       'cool', 'elite', 'friends', 'fans', 'average_stars', 'compliment_hot',
       'compliment_more', 'compliment_profile', 'compliment_cute',
       'compliment_list', 'compliment_note', 'compliment_plain',
       'compliment_cool', 'compliment_funny', 'compliment_writer',
       'compliment_photos'],
      dtype='object')


To achiever the goal, we need the following columns:
- user_id
- name
- review_count
- yelping_since
- useful
- funny
- cool
- fans
- average_stars

In [19]:
user_data = user_data[['user_id', 'name', 'review_count', 'yelping_since', 'useful', 'funny', 'cool', 'fans', 'average_stars']]
user_data

Unnamed: 0,user_id,name,review_count,yelping_since,useful,funny,cool,fans,average_stars
0,q_QQ5kBBwlCcbL1s4NVK3g,Jane,1220,2005-03-14 20:26:35,15038,10030,11291,1357,3.85
1,dIIKEfOgo0KqUfGQvGikPg,Gabi,2136,2007-08-10 19:01:51,21272,10289,18046,1025,4.09
2,D6ErcUnFALnCQN4b1W_TlA,Jason,119,2007-02-07 15:47:53,188,128,130,16,3.76
3,JnPIjvC0cmooNDfsa9BmXg,Kat,987,2009-02-09 16:14:29,7234,4722,4035,420,3.77
4,37Hc8hr3cw0iHLoPzLK6Ow,Christine,495,2008-03-03 04:57:05,1577,727,1124,47,3.72
...,...,...,...,...,...,...,...,...,...
997,VvmEMZxJk3zp9lyGDkjxRw,Nikki,229,2005-08-22 21:21:50,565,271,401,20,3.78
998,ZNJiYLz_6udX3ljXPZ9FJg,Olma,402,2006-06-24 05:14:10,1057,560,916,35,3.76
999,ZH-OEKC-dOGQszEnXLpXnw,Dave,109,2009-08-27 12:42:43,554,550,318,12,3.68
1000,U3PbDEgVRJQrryCK_uiyVw,Kana,57,2009-04-16 02:06:47,71,24,37,7,4.42


In [20]:
user_data.to_csv(filepath + '\yelp_academic_dataset_user_sample_output.csv')

## Time data

In [21]:
with open(filepath + '\yelp_academic_dataset_checkin.json', 'r', encoding='utf-8') as json_in_file: 
    with open(filepath + '\yelp_academic_dataset_checkin_sample.json', 'w', encoding='utf-8') as json_out_file:
        for index, line in enumerate(json_in_file): 
            json_out_file.write(line)
            if index > 1000:
                break

In [22]:
checkin_data = pd.read_json(filepath + '\yelp_academic_dataset_checkin_sample.json', lines=True)
checkin_data

Unnamed: 0,business_id,date
0,--0r8K_AQ4FZfLsX3ZYRDA,2017-09-03 17:13:59
1,--0zrn43LEaB4jUWTQH_Bg,"2010-10-08 22:21:20, 2010-11-01 21:29:14, 2010..."
2,--164t1nclzzmca7eDiJMw,"2010-02-26 02:06:53, 2010-02-27 08:00:09, 2010..."
3,--2aF9NhXnNVpDV0KS3xBQ,"2014-11-03 16:35:35, 2015-01-30 18:16:03, 2015..."
4,--2mEJ63SC_8_08_jGgVIg,"2010-12-15 17:10:46, 2013-12-28 00:27:54, 2015..."
...,...,...
997,-SjRCXID7eXewqloY3V86w,"2015-12-13 02:48:00, 2016-01-21 22:31:31, 2016..."
998,-Sjrz1Mt9RY4r6ibxzGs0Q,"2016-08-08 19:23:27, 2016-08-15 16:03:29, 2016..."
999,-Sk9ZND7V2x8RuauMH0FRw,"2010-09-05 02:04:25, 2010-10-15 22:48:00, 2010..."
1000,-SkNedh2bJHPOcKfoFlTvg,"2013-09-01 02:54:45, 2013-10-22 16:59:13, 2013..."


In [23]:
# Filter the checkin data which are older than 2018 
checkin_data['date'] = checkin_data['date'].str.split(',')

In [24]:
checkin_data

Unnamed: 0,business_id,date
0,--0r8K_AQ4FZfLsX3ZYRDA,[2017-09-03 17:13:59]
1,--0zrn43LEaB4jUWTQH_Bg,"[2010-10-08 22:21:20, 2010-11-01 21:29:14, 2..."
2,--164t1nclzzmca7eDiJMw,"[2010-02-26 02:06:53, 2010-02-27 08:00:09, 2..."
3,--2aF9NhXnNVpDV0KS3xBQ,"[2014-11-03 16:35:35, 2015-01-30 18:16:03, 2..."
4,--2mEJ63SC_8_08_jGgVIg,"[2010-12-15 17:10:46, 2013-12-28 00:27:54, 2..."
...,...,...
997,-SjRCXID7eXewqloY3V86w,"[2015-12-13 02:48:00, 2016-01-21 22:31:31, 2..."
998,-Sjrz1Mt9RY4r6ibxzGs0Q,"[2016-08-08 19:23:27, 2016-08-15 16:03:29, 2..."
999,-Sk9ZND7V2x8RuauMH0FRw,"[2010-09-05 02:04:25, 2010-10-15 22:48:00, 2..."
1000,-SkNedh2bJHPOcKfoFlTvg,"[2013-09-01 02:54:45, 2013-10-22 16:59:13, 2..."


In [26]:
def convert_to_date(d): 
    x = pd.to_datetime(d)
    return x[x >= '2018-01-01'].tolist()

checkin_data['date'] = checkin_data['date'].apply(convert_to_date)

In [27]:
checkin_data

Unnamed: 0,business_id,date
0,--0r8K_AQ4FZfLsX3ZYRDA,[]
1,--0zrn43LEaB4jUWTQH_Bg,[]
2,--164t1nclzzmca7eDiJMw,[]
3,--2aF9NhXnNVpDV0KS3xBQ,"[2018-02-12 23:13:56, 2019-04-22 19:34:48, 202..."
4,--2mEJ63SC_8_08_jGgVIg,[]
...,...,...
997,-SjRCXID7eXewqloY3V86w,[]
998,-Sjrz1Mt9RY4r6ibxzGs0Q,"[2018-01-01 21:54:06, 2018-01-19 20:58:06, 201..."
999,-Sk9ZND7V2x8RuauMH0FRw,[]
1000,-SkNedh2bJHPOcKfoFlTvg,"[2018-01-02 19:45:42, 2018-01-12 23:19:41, 201..."


In [28]:
checkin_data = checkin_data.explode('date').reset_index(drop=True)

In [29]:
checkin_data = checkin_data.dropna()
checkin_data

Unnamed: 0,business_id,date
3,--2aF9NhXnNVpDV0KS3xBQ,2018-02-12 23:13:56
4,--2aF9NhXnNVpDV0KS3xBQ,2019-04-22 19:34:48
5,--2aF9NhXnNVpDV0KS3xBQ,2020-12-29 16:22:00
10,--JKSSgnfoOjVDFGv692BA,2018-09-12 14:04:02
12,--Q3mAcX9t63f7Xcbn7LVA,2020-07-15 22:29:52
...,...,...
26308,-SkwKPbo5oK1-NtKkupNvw,2018-09-09 23:06:06
26309,-SkwKPbo5oK1-NtKkupNvw,2019-04-29 00:09:17
26310,-SkwKPbo5oK1-NtKkupNvw,2019-09-05 23:10:34
26311,-SkwKPbo5oK1-NtKkupNvw,2020-05-02 18:47:48


In [30]:
checkin_data.to_csv(filepath + '\yelp_academic_dataset_checkin_sample_output.csv')