In [8]:
%%capture
# Install the notebook's third-party dependencies; %%capture hides pip's output.
!pip install datasets
!pip install transformers
!pip install --upgrade accelerate
!pip install evaluate
# NOTE(review): `datasets` is installed a second time here, now pinned to 2.14.0
# together with pyarrow 12.0.0, which makes the unpinned install above redundant.
# The other packages are left unpinned.
!pip install datasets==2.14.0 pyarrow==12.0.0

In [9]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForTokenClassification
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
import datasets
from datasets import load_dataset#, list_datasets
from evaluate import evaluator
import evaluate
import numpy as np
import torch
import copy
import zipfile
import os
from datasets import load_dataset
from collections import defaultdict

In [10]:
def load_json_dataset(json_path):
    """Load a JSON(-lines) file as the 'train' split of a dataset.

    Args:
        json_path: Path passed to ``load_dataset`` as ``data_files``.

    Returns:
        Whatever ``load_dataset('json', ...)`` returns for the 'train' split.

    Raises:
        Exception: Any failure while loading is re-raised as a plain
            ``Exception`` whose message embeds the original error text.
    """
    try:
        loaded = load_dataset('json', split='train', data_files=json_path)
        print("Dataset loaded successfully.")
    except Exception as err:
        raise Exception(f"An error occurred while loading the dataset: {err}")
    return loaded

In [12]:
import json
from datasets import load_dataset, Features, Value

# Step 1: Preprocess the JSON file
def preprocess_json_file(input_path, output_path, json_fields=('attributes', 'hours')):
    """Copy a JSON-lines file, turning selected nested fields into JSON strings.

    Each input line is parsed as one JSON object. For every name in
    ``json_fields`` that is present in the object, a dict value is replaced by
    its JSON text and a ``None`` value by the string ``'null'``; values of any
    other type are left untouched. All other keys are copied unchanged.

    Args:
        input_path: Path of the JSON-lines file to read (UTF-8).
        output_path: Path of the JSON-lines file to write (UTF-8, overwritten).
        json_fields: Names of the fields to serialise; defaults to the two
            nested fields handled by the original implementation.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(input_path, 'r', encoding='utf-8') as infile, \
         open(output_path, 'w', encoding='utf-8') as outfile:
        for line in infile:
            # Blank lines carry no record; skipping them avoids a JSONDecodeError.
            if not line.strip():
                continue
            data = json.loads(line)
            for field in json_fields:
                if field not in data:
                    continue
                value = data[field]
                # json.dumps(None) is the string 'null', so one call covers both
                # the dict case and the None case.
                if value is None or isinstance(value, dict):
                    data[field] = json.dumps(value)
            json.dump(data, outfile)
            outfile.write('\n')

# Run the preprocessing function: read the raw business file and write the
# processed copy (both paths are relative to the working directory).
preprocess_json_file('yelp_academic_dataset_business.json', 'yelp_academic_dataset_business_processed.json')

# Step 2: Define the features to specify data types.
# 'attributes' and 'hours' are declared as strings because preprocess_json_file
# serialises those nested objects to JSON text.
features_business = Features({
    'business_id': Value('string'),
    'name': Value('string'),
    'address': Value('string'),
    'city': Value('string'),
    'state': Value('string'),
    'postal_code': Value('string'),
    'latitude': Value('float32'),
    'longitude': Value('float32'),
    'stars': Value('float32'),
    'review_count': Value('int32'),
    'is_open': Value('int32'),
    'attributes': Value('string'),
    'categories': Value('string'),
    'hours': Value('string'),
})

In [18]:
# Fields copied from each raw user record into the processed file, in output order.
desired_fields = [
    'user_id',
    'name',
    'review_count',
    'yelping_since',
    'useful',
    'funny',
    'cool',
    'elite',
    'friends'
]

def preprocess_user_json_file(input_path, output_path):
    """Write a slimmed-down copy of a user JSON-lines file.

    Each output record contains exactly the keys listed in the module-level
    ``desired_fields``:

    * 'friends' and 'elite' become ``'null'`` when the raw value is ``None`` or
      the empty string, and ``str(value)`` otherwise;
    * other present fields are copied unchanged;
    * missing fields default to ``0`` for the counters ('review_count',
      'useful', 'funny', 'cool') and to ``''`` for everything else.

    Args:
        input_path: Path of the raw JSON-lines file to read (UTF-8).
        output_path: Path of the processed file to write (UTF-8, overwritten).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    counter_fields = ('review_count', 'useful', 'funny', 'cool')
    # Both fields are normalised the same way, so they share one branch.
    nullable_text_fields = ('friends', 'elite')

    with open(input_path, 'r', encoding='utf-8') as infile, \
         open(output_path, 'w', encoding='utf-8') as outfile:
        for line in infile:
            # Blank lines carry no record; skipping them avoids a JSONDecodeError.
            if not line.strip():
                continue
            data = json.loads(line)

            # Build a new dict holding only the desired fields.
            new_data = {}
            for field in desired_fields:
                if field not in data:
                    # Missing field: fall back to the type-appropriate default.
                    new_data[field] = 0 if field in counter_fields else ''
                    continue
                value = data[field]
                if field in nullable_text_fields:
                    if value is None or value == '':
                        new_data[field] = 'null'
                    else:
                        new_data[field] = str(value)
                else:
                    new_data[field] = value
            json.dump(new_data, outfile)
            outfile.write('\n')

# 运行预处理函数
preprocess_user_json_file('yelp_academic_dataset_user.json', 'yelp_academic_dataset_user_processed.json')

# 步骤 2: 定义数据集的特征
features_user = Features({
    'user_id': Value('string'),
    'name': Value('string'),
    'review_count': Value('int64'),
    'yelping_since': Value('string'),
    'useful': Value('int64'),
    'funny': Value('int64'),
    'cool': Value('int64'),
    'elite': Value('string'),
    'friends': Value('string'),
})

In [13]:
# Step 1: preprocess the JSON file
def preprocess_checkin_json_file(input_path, output_path):
    """Write a copy of a check-in JSON-lines file with only string fields.

    Each output record contains exactly 'business_id' and 'date'. A missing or
    ``None`` value becomes ``''``; any other value is converted with ``str``
    and stripped of surrounding whitespace.

    Args:
        input_path: Path of the raw JSON-lines file to read (UTF-8).
        output_path: Path of the processed file to write (UTF-8, overwritten).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    desired_fields = ['business_id', 'date']
    with open(input_path, 'r', encoding='utf-8') as infile, \
         open(output_path, 'w', encoding='utf-8') as outfile:
        for line in infile:
            # Blank lines carry no record; skipping them avoids a JSONDecodeError.
            if not line.strip():
                continue
            data = json.loads(line)
            # Build a new dict holding only the desired fields.
            new_data = {}
            for field in desired_fields:
                value = data.get(field)
                new_data[field] = '' if value is None else str(value).strip()
            json.dump(new_data, outfile)
            outfile.write('\n')

# Run the preprocessing function on the raw check-in file.
preprocess_checkin_json_file('yelp_academic_dataset_checkin.json', 'yelp_academic_dataset_checkin_processed.json')

# Step 2: define the dataset features (both kept fields are strings).
features_checkin = Features({
    'business_id': Value('string'),
    'date': Value('string'),
})

In [16]:
def preprocess_review_json_file(input_path, output_path):
    """Write a copy of a review JSON-lines file restricted to known fields.

    Each output record contains exactly the nine fields listed below. A field
    that is missing or ``None`` in the input gets a default: ``0`` for 'stars',
    'useful', 'funny' and 'cool', and ``''`` for every other field. All other
    values are copied unchanged.

    Args:
        input_path: Path of the raw JSON-lines file to read (UTF-8).
        output_path: Path of the processed file to write (UTF-8, overwritten).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    desired_fields = [
        'review_id',
        'user_id',
        'business_id',
        'stars',
        'useful',
        'funny',
        'cool',
        'text',
        'date'
    ]
    numeric_fields = ('stars', 'useful', 'funny', 'cool')

    with open(input_path, 'r', encoding='utf-8') as infile, \
         open(output_path, 'w', encoding='utf-8') as outfile:
        for line in infile:
            # Blank lines carry no record; skipping them avoids a JSONDecodeError.
            if not line.strip():
                continue
            data = json.loads(line)
            # Build a new dict holding only the desired fields.
            new_data = {}
            for field in desired_fields:
                # A missing key and an explicit None get the same default, so a
                # single lookup covers both cases.
                value = data.get(field)
                if value is None:
                    value = 0 if field in numeric_fields else ''
                new_data[field] = value
            json.dump(new_data, outfile)
            outfile.write('\n')

# Run the preprocessing function on the raw review file.
preprocess_review_json_file('yelp_academic_dataset_review.json', 'yelp_academic_dataset_review_processed.json')

# Explicit schema for the processed review file.
# NOTE(review): 'stars' is declared int64 here while the business schema uses
# float32 for its 'stars' column — confirm review ratings are always whole numbers.
features_review = Features({
    'review_id': Value('string'),
    'user_id': Value('string'),
    'business_id': Value('string'),
    'stars': Value('int64'),
    'useful': Value('int64'),
    'funny': Value('int64'),
    'cool': Value('int64'),
    'text': Value('string'),
    'date': Value('string'),
})

In [25]:
def preprocess_tip_json_file(input_path, output_path):
    """Write a copy of a tip JSON-lines file restricted to known fields.

    Each output record contains exactly the five fields listed below. A field
    that is missing or ``None`` in the input gets a default: ``0`` for
    'compliment_count' and ``''`` for every other field. All other values are
    copied unchanged.

    Args:
        input_path: Path of the raw JSON-lines file to read (UTF-8).
        output_path: Path of the processed file to write (UTF-8, overwritten).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    desired_fields_tip = [
        'user_id',
        'business_id',
        'text',
        'date',
        'compliment_count'
    ]

    with open(input_path, 'r', encoding='utf-8') as infile, \
         open(output_path, 'w', encoding='utf-8') as outfile:
        for line in infile:
            # Blank lines carry no record; skipping them avoids a JSONDecodeError.
            if not line.strip():
                continue
            data = json.loads(line)
            # Build a new dict holding only the desired fields.
            new_data = {}
            for field in desired_fields_tip:
                # A missing key and an explicit None get the same default, so a
                # single lookup covers both cases.
                value = data.get(field)
                if value is None:
                    value = 0 if field == 'compliment_count' else ''
                new_data[field] = value
            json.dump(new_data, outfile)
            outfile.write('\n')

# Run the preprocessing function on the raw tip file.
preprocess_tip_json_file('yelp_academic_dataset_tip.json', 'yelp_academic_dataset_tip_processed.json')

# Explicit schema for the processed tip file; the keys match the fields kept
# by preprocess_tip_json_file.
features_tip = Features({
    'user_id': Value('string'),
    'business_id': Value('string'),
    'text': Value('string'),
    'date': Value('string'),
    'compliment_count': Value('int64'),
})

In [59]:
def load_json_dataset_business(json_path):
    """Load the processed business file using the ``features_business`` schema.

    Args:
        json_path: Path passed to ``load_dataset`` as ``data_files``.

    Returns:
        The 'train' split returned by ``load_dataset('json', ...)``.

    Raises:
        Exception: Any failure while loading is re-raised as a plain
            ``Exception`` whose message embeds the original error text.
    """
    try:
        business_ds = load_dataset(
            'json',
            data_files=json_path,
            split='train',
            features=features_business,
        )
        print("Dataset loaded successfully.")
    except Exception as err:
        raise Exception(f"An error occurred while loading the dataset: {err}")
    return business_ds


def load_user_dataset(json_path, split='train[:8%]',
                      columns=('user_id', 'name', 'review_count', 'yelping_since', 'useful')):
    """Load a slice of the processed user file and keep a subset of columns.

    Args:
        json_path: Path passed to ``load_dataset`` as ``data_files``.
        split: Split expression forwarded to ``load_dataset``; the default
            keeps the first 8% of rows, as the original hard-coded value did.
        columns: Column names kept via ``select_columns``; the default is the
            original hard-coded list.

    Returns:
        The loaded dataset restricted to ``columns``.

    Raises:
        Exception: Any failure is printed with its traceback and re-raised as
            a plain ``Exception`` chained to the original error (``__cause__``).
    """
    try:
        dataset = load_dataset('json', data_files=json_path, split=split, features=features_user)
        dataset = dataset.select_columns(list(columns))
        print("数据集加载成功。")
        return dataset
    except Exception as e:
        import traceback
        traceback.print_exc()
        # `from e` keeps the original exception attached as the explicit cause.
        raise Exception(f"加载数据集时发生错误: {e}") from e


def load_checkin_dataset(json_path):
    """Load the processed check-in file using the ``features_checkin`` schema.

    Args:
        json_path: Path passed to ``load_dataset`` as ``data_files``.

    Returns:
        The 'train' split returned by ``load_dataset('json', ...)``.

    Raises:
        Exception: Any failure while loading is re-raised as a plain
            ``Exception`` whose message embeds the original error text.
    """
    try:
        load_kwargs = {
            'data_files': json_path,
            'split': 'train',
            'features': features_checkin,
        }
        checkins = load_dataset('json', **load_kwargs)
        print("数据集加载成功。")
    except Exception as err:
        raise Exception(f"加载数据集时发生错误: {err}")
    return checkins


def load_review_dataset(json_path, split='train[:5%]'):
    """Load a slice of the processed review file using ``features_review``.

    Args:
        json_path: Path passed to ``load_dataset`` as ``data_files``.
        split: Split expression forwarded to ``load_dataset``; the default
            keeps the first 5% of rows, as the original hard-coded value did.

    Returns:
        The dataset returned by ``load_dataset('json', ...)``.

    Raises:
        Exception: Any failure while loading is re-raised as a plain
            ``Exception`` chained to the original error (``__cause__``).
    """
    try:
        dataset = load_dataset(
            'json',
            data_files=json_path,
            split=split,
            features=features_review
        )
        print("数据集加载成功。")
        return dataset
    except Exception as e:
        # `from e` keeps the original exception attached as the explicit cause.
        raise Exception(f"加载数据集时发生错误: {e}") from e


def load_tip_dataset(json_path, split='train[:20%]'):
    """Load a slice of the processed tip file using ``features_tip``.

    Args:
        json_path: Path passed to ``load_dataset`` as ``data_files``.
        split: Split expression forwarded to ``load_dataset``; the default
            keeps the first 20% of rows, as the original hard-coded value did.

    Returns:
        The dataset returned by ``load_dataset('json', ...)``.

    Raises:
        Exception: Any failure while loading is re-raised as a plain
            ``Exception`` chained to the original error (``__cause__``).
    """
    try:
        dataset = load_dataset(
            'json',
            data_files=json_path,
            split=split,
            features=features_tip
        )
        print("数据集加载成功。")
        return dataset
    except Exception as e:
        # `from e` keeps the original exception attached as the explicit cause.
        raise Exception(f"加载数据集时发生错误: {e}") from e

In [60]:
# Load the processed business file with the explicit features_business schema,
# then display the resulting Dataset summary.
dataset_business = load_json_dataset_business('yelp_academic_dataset_business_processed.json')
dataset_business

Dataset loaded successfully.


Dataset({
    features: ['business_id', 'name', 'address', 'city', 'state', 'postal_code', 'latitude', 'longitude', 'stars', 'review_count', 'is_open', 'attributes', 'categories', 'hours'],
    num_rows: 150346
})

In [61]:
# Load the user subset (load_user_dataset reads the first 8% of rows and keeps
# five columns), then display the resulting Dataset summary.
dataset_user = load_user_dataset('yelp_academic_dataset_user_processed.json')
dataset_user

数据集加载成功。


Dataset({
    features: ['user_id', 'name', 'review_count', 'yelping_since', 'useful'],
    num_rows: 159032
})

In [62]:
# Load the full processed check-in file, then display the Dataset summary.
dataset_checkin = load_checkin_dataset('yelp_academic_dataset_checkin_processed.json')
dataset_checkin

数据集加载成功。


Dataset({
    features: ['business_id', 'date'],
    num_rows: 131930
})

In [63]:
# Load the review subset (load_review_dataset reads the first 5% of rows),
# then display the Dataset summary.
dataset_review = load_review_dataset('yelp_academic_dataset_review_processed.json')
dataset_review

数据集加载成功。


Dataset({
    features: ['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny', 'cool', 'text', 'date'],
    num_rows: 349514
})

In [64]:
# Load the tip subset (load_tip_dataset reads the first 20% of rows),
# then display the Dataset summary.
dataset_tip = load_tip_dataset('yelp_academic_dataset_tip_processed.json')
dataset_tip

数据集加载成功。


Dataset({
    features: ['user_id', 'business_id', 'text', 'date', 'compliment_count'],
    num_rows: 181783
})

In [65]:
# Collect the IDs present in the loaded user subset and in the business table
# as sets; the filter functions test membership against these.
user_ids = set(dataset_user['user_id'])
business_ids = set(dataset_business['business_id'])

In [66]:
def filter_user(batch):
    """Return one boolean per row: True where the row's user_id is in ``user_ids``.

    ``batch`` is indexed with 'user_id' and the result is iterated; membership
    is tested against the module-level ``user_ids`` collection.
    """
    keep_mask = []
    for candidate in batch['user_id']:
        keep_mask.append(candidate in user_ids)
    return keep_mask

def filter_business(batch):
    """Return one boolean per row: True where the row's business_id is in ``business_ids``.

    ``batch`` is indexed with 'business_id' and the result is iterated;
    membership is tested against the module-level ``business_ids`` collection.
    """
    keep_mask = []
    for candidate in batch['business_id']:
        keep_mask.append(candidate in business_ids)
    return keep_mask

In [67]:
# Keep only the reviews whose user_id is in user_ids (the loaded user subset).
# filter_user is applied to batches of 10,000 rows across 4 worker processes.
filtered_reviews = dataset_review.filter(
    filter_user,
    batched=True,
    batch_size=10000,
    num_proc=4,
    desc="Filter out invalid user_ids"
)

Filter out invalid user_ids (num_proc=4):   0%|          | 0/349514 [00:00<?, ? examples/s]

In [68]:
# Display the review dataset summary after the user_id filter.
filtered_reviews

Dataset({
    features: ['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny', 'cool', 'text', 'date'],
    num_rows: 267871
})

In [69]:
def filter_business_reviews(batch):
    """Return one boolean per row: True where the row's business_id is in ``business_ids``.

    ``batch`` is indexed with 'business_id' and the result is iterated;
    membership is tested against the module-level ``business_ids`` collection.
    """
    keep_mask = []
    for candidate in batch['business_id']:
        keep_mask.append(candidate in business_ids)
    return keep_mask

# From the user-filtered reviews, keep only rows whose business_id is in
# business_ids. Batches of 10,000 rows, 4 worker processes.
filtered_reviews_business = filtered_reviews.filter(
    filter_business_reviews,
    batched=True,
    batch_size=10000,
    num_proc=4,
    desc="Filter out business_id"
)

Filter out business_id (num_proc=4):   0%|          | 0/267871 [00:00<?, ? examples/s]

In [70]:
# Display the review dataset summary after the business_id filter.
filtered_reviews_business

Dataset({
    features: ['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny', 'cool', 'text', 'date'],
    num_rows: 267871
})

In [71]:
# Keep only the check-in rows whose business_id is in business_ids.
# Batches of 10,000 rows, 4 worker processes.
filtered_checkin = dataset_checkin.filter(
    filter_business,
    batched=True,
    batch_size=10000,
    num_proc=4,
    desc="Filter out business_id"
)

In [72]:
# Display the check-in dataset summary after the business_id filter.
filtered_checkin

Dataset({
    features: ['business_id', 'date'],
    num_rows: 131930
})

In [73]:
def filter_users_tip(batch):
    """Return one boolean per row: True where the row's user_id is in ``user_ids``.

    ``batch`` is indexed with 'user_id' and the result is iterated; membership
    is tested against the module-level ``user_ids`` collection.
    """
    keep_mask = []
    for candidate in batch['user_id']:
        keep_mask.append(candidate in user_ids)
    return keep_mask

# Keep only the tips whose user_id is in user_ids. The predicate is the
# filter_users_tip helper defined just above in this cell (previously it was
# defined but never used, and the call reached for filter_user instead).
# Batches of 10,000 rows, 4 worker processes.
filtered_tip_user = dataset_tip.filter(
    filter_users_tip,
    batched=True,
    batch_size=10000,
    num_proc=4,
    desc="Filter out invalid user_ids"
)

Filter out invalid user_ids (num_proc=4):   0%|          | 0/181783 [00:00<?, ? examples/s]

In [74]:
# Display the tip dataset summary after the user_id filter.
filtered_tip_user

Dataset({
    features: ['user_id', 'business_id', 'text', 'date', 'compliment_count'],
    num_rows: 175473
})

In [75]:
# From the user-filtered tips, keep only rows whose business_id is in
# business_ids. Batches of 10,000 rows, 4 worker processes.
# The progress-bar description now names the column actually being filtered
# (it previously said "user_ids", copied from the user filter).
filtered_tip_user_business = filtered_tip_user.filter(
    filter_business,
    batched=True,
    batch_size=10000,
    num_proc=4,
    desc="Filter out invalid business_ids"
)

Filter out invalid user_ids (num_proc=4):   0%|          | 0/175473 [00:00<?, ? examples/s]

In [76]:
# Display the tip dataset summary after the business_id filter.
filtered_tip_user_business

Dataset({
    features: ['user_id', 'business_id', 'text', 'date', 'compliment_count'],
    num_rows: 175473
})

In [77]:
# Show the user dataset summary again.
dataset_user

Dataset({
    features: ['user_id', 'name', 'review_count', 'yelping_since', 'useful'],
    num_rows: 159032
})

In [78]:
# Save each dataset with its to_csv method.
# Create the output directory first so the writes do not fail on a fresh
# checkout where 'yelp_dataset/' does not exist yet.
os.makedirs('yelp_dataset', exist_ok=True)
filtered_reviews_business.to_csv('yelp_dataset/yelp_reviews.csv', index=False)
dataset_user.to_csv('yelp_dataset/yelp_user.csv', index=False)
dataset_business.to_csv('yelp_dataset/yelp_business.csv', index=False)
filtered_checkin.to_csv('yelp_dataset/yelp_checkin.csv', index=False)
filtered_tip_user_business.to_csv('yelp_dataset/yelp_tip.csv', index=False)

Creating CSV from Arrow format:   0%|          | 0/268 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/160 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/151 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/132 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/176 [00:00<?, ?ba/s]

22955575

In [79]:
import pandas as pd

# Path of the exported user CSV.
csv_file_path = 'yelp_dataset/yelp_user.csv'

# Read the CSV back into a DataFrame. The file holds user rows, so the frame
# is named df_user.
df_user = pd.read_csv(csv_file_path)
# Backward-compatible alias: this frame was previously (misleadingly) named
# df_business; keep the old name bound in case later cells refer to it.
df_business = df_user

df_user.head(100)

Unnamed: 0,user_id,name,review_count,yelping_since,useful
0,qVc8ODYU5SZjKXVBgXdI7w,Walker,585,2007-01-25 16:47:26,7217
1,j14WgRoU_-2ZE1aw1dXrJg,Daniel,4333,2009-01-25 04:35:42,43091
2,2WnXYQFK0hXEoTxPtV2zvg,Steph,665,2008-07-25 10:41:00,2086
3,SZDeASXq7o05mMNLshsdIA,Gwen,224,2005-11-29 04:38:33,512
4,hA5lMy-EnncsH4JoR-hFGQ,Karen,79,2007-01-05 19:40:59,29
...,...,...,...,...,...
95,8m2LgacB5VeP_1Mn5ZMC4w,Alan,4,2011-01-20 14:58:21,1
96,MvOXPiqRr9IjqVtwC5mUNA,Jessica,111,2010-08-01 20:00:52,286
97,PrJ37Ik9DxritxGPqI9ktw,David,85,2011-05-05 16:01:31,75
98,dHLL7SVGJw5uM6IEp_wb4Q,Constance,538,2010-07-30 22:23:15,694
