-
Notifications
You must be signed in to change notification settings - Fork 0
/
ReadAndProcessFiles.py
51 lines (48 loc) · 1.65 KB
/
ReadAndProcessFiles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
from gensim import models
from nltk.corpus import stopwords
import json
import pandas as pd
import numpy as np
import csv
# Cuisine labels looked for in each business's categories. They are tried in
# this order and the first one found becomes the row's cuisine.
cuisines = ["American (Traditional)", "American (New)", "Latin American", "Italian", "Thai",
            "Chinese", "Japanese", "Turkish", "French", "Mexican", "German", "Polish", "Greek",
            "Pakistani", "Ethiopian", "Taiwanese", "Middle Eastern", "Indian", "Korean", "Vietnamese", "Canadian", ]


def _as_category_list(raw):
    """Return the categories of a record as a fresh list, or None if absent."""
    if raw is None:
        return None
    if isinstance(raw, str):
        # NOTE(review): some dataset releases appear to store categories as a
        # single comma-separated string rather than a list -- confirm against
        # the input file. A plain string has no .remove(), so normalise it.
        return [name.strip() for name in raw.split(",") if name.strip()]
    # Copy so that removing the cuisine below never mutates the parsed record.
    return list(raw)


def _business_row(record):
    """Build the output row for one parsed business record.

    Returns ``[business_id, cuisine, categories, stars, review_count, city]``
    or ``None`` when the record is filtered out. A record is kept only if it
    has categories, has at least 10 reviews, is tagged "restaurants" or
    "Restaurants", and has a business id, star rating and city.

    The cuisine is the first entry of ``cuisines`` present in the categories
    ("other" if none is); that entry is removed from the categories that are
    written out.
    """
    cat = _as_category_list(record.get("categories"))
    rev_count = record.get("review_count")
    # Short-circuit on missing values: comparing None with >= or testing
    # membership in None would raise TypeError.
    if cat is None or rev_count is None or rev_count < 10:
        return None

    cuis = "other"
    for cuisine in cuisines:
        if cuisine in cat:
            cuis = cuisine
            cat.remove(cuisine)
            break

    if "restaurants" not in cat and "Restaurants" not in cat:
        return None

    b_id = record.get("business_id")
    star = record.get("stars")
    city = record.get("city")
    if b_id is None or star is None or city is None:
        return None
    return [b_id, cuis, cat, star, rev_count, city]


# Stream the input one JSON object per line and collect the rows that pass
# the filters in _business_row.
rows = []
with open("yelp_academic_dataset_business.json", encoding="utf-8") as business_file:
    for line in business_file:
        if not line.strip():
            # A blank line (e.g. trailing newline) is not a JSON document.
            continue
        row = _business_row(json.loads(line))
        if row is not None:
            rows.append(row)

# Write the kept rows without index or header: one business per CSV line.
my_df = pd.DataFrame(rows)
my_df.to_csv('business.csv', index=False, header=False)