In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
from datetime import datetime

# Path to the Yelp check-in dump. The loader below reads it as JSON Lines
# (one JSON object per line).
file_path = '/net/pr2/projects/plgrid/plgglscclass/yelp-dataset/yelp_academic_dataset_checkin.json'

# Parse every non-blank line into a record.
# - The encoding is passed explicitly so decoding does not depend on the
#   platform's default locale (assumes the dump is UTF-8 -- TODO confirm).
# - Blank lines are skipped because json.loads() raises on an empty document.
with open(file_path, 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

df = pd.DataFrame(data)
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
df.info()
print(df['date'].iloc[0])

# Build `checkins_df`: one row per individual check-in with the columns
# 'business_id', 'day' (English weekday name) and 'hour' (0-23).
#
# Each value in df['date'] is a single string of timestamps separated by ', '.
# The string is split and exploded with vectorised pandas operations rather
# than a Python-level iterrows()/strptime loop, which avoids creating one dict
# per check-in and is far cheaper on large inputs.
exploded = (
    df[['business_id', 'date']]
    .assign(date=df['date'].str.split(', '))
    .explode('date')
    .rename_axis('source_row')  # keep the originating df row label for error reports
    .reset_index()
)

# errors='coerce' turns anything that does not match the format into NaT so
# that bad values can be reported and dropped instead of aborting the run.
parsed = pd.to_datetime(
    exploded['date'], format='%Y-%m-%d %H:%M:%S', errors='coerce'
)
unparsed = parsed.isna()

# Report every value that could not be parsed, together with its source row.
for row_label, raw_value in zip(
    exploded.loc[unparsed, 'source_row'], exploded.loc[unparsed, 'date']
):
    print(f"Date parsing error: {raw_value} in row {row_label}")

valid = ~unparsed
checkins_df = pd.DataFrame({
    'business_id': exploded.loc[valid, 'business_id'].to_numpy(),
    # day_name() without a locale argument returns English names, which is
    # what the weekday ordering used for plotting expects; strftime('%A')
    # would follow the process locale instead.
    'day': parsed[valid].dt.day_name().to_numpy(),
    'hour': parsed[valid].dt.hour.to_numpy().astype('int64'),
})
print(checkins_df.head())

# Total number of check-ins per weekday, shown as a bar chart in calendar order.
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Reindexing against the full weekday list puts the rows in calendar order and
# gives weekdays without any check-in an explicit count of 0 instead of
# leaving them out of the table.
checkins_by_day = (
    checkins_df.groupby('day')
    .size()
    .reindex(day_order, fill_value=0)
    .rename_axis('day')
    .reset_index(name='checkin_count')
)
# An ordered categorical keeps Monday..Sunday ordering in any later sort/plot.
checkins_by_day['day'] = pd.Categorical(checkins_by_day['day'], categories=day_order, ordered=True)

plt.figure(figsize=(10, 6))
sns.barplot(x='day', y='checkin_count', data=checkins_by_day, palette='Blues_d')
plt.title('Total check-in amount per day')
plt.ylabel('Check-in amount')
plt.xlabel('Day')
plt.xticks(rotation=45)
plt.show()

# Total number of check-ins per hour of the day, shown as a line chart.
# Reindexing over 0..23 gives hours without any check-in an explicit 0, so the
# line does not silently interpolate across a missing hour.
checkins_by_hour = (
    checkins_df.groupby('hour')
    .size()
    .reindex(range(24), fill_value=0)
    .rename_axis('hour')
    .reset_index(name='checkin_count')
)

plt.figure(figsize=(12, 6))
sns.lineplot(x='hour', y='checkin_count', data=checkins_by_hour, marker='o')
plt.title('Total check-in amount per hour')
plt.ylabel('Check-in amount')
plt.xlabel('Hour of the day')
plt.xticks(range(0, 24))  # one tick per hour
plt.grid(True)
plt.show()

# The ten businesses with the most check-ins, printed and shown as a
# horizontal bar chart. nlargest() picks the top ten directly, in descending
# order, without sorting every business.
top_businesses = (
    checkins_df.groupby('business_id')
    .size()
    .reset_index(name='checkin_count')
    .nlargest(10, 'checkin_count')
)
print(top_businesses)

plt.figure(figsize=(12, 6))
sns.barplot(x='checkin_count', y='business_id', data=top_businesses, palette='viridis')
plt.title('Top 10 businesses')
plt.xlabel('Check-ins')
plt.ylabel('Business ID')
plt.show()

Pierwsze 5 wierszy zestawu danych:
              business_id                                               date
0  ---kPU91CF4Lq2-WlRu9Lw  2020-03-13 21:10:56, 2020-06-02 22:18:06, 2020...
1  --0iUa4sNDFiZFrAdIWhZQ  2010-09-13 21:43:09, 2011-05-04 23:08:15, 2011...
2  --30_8IhuyMHbSOcNWd6DQ           2013-06-14 23:29:17, 2014-08-13 23:20:22
3  --7PUidqRWpRSpXebiyxTg  2011-02-15 17:12:00, 2011-07-28 02:46:10, 2012...
4  --7jw19RH9JKXgFohspgQw  2014-04-21 20:42:11, 2014-04-28 21:04:46, 2014...

Informacje o danych:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131930 entries, 0 to 131929
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   business_id  131930 non-null  object
 1   date         131930 non-null  object
dtypes: object(2)
memory usage: 2.0+ MB
None

Przykładowe dane w kolumnie 'date':
2020-03-13 21:10:56, 2020-06-02 22:18:06, 2020-07-24 22:42:27, 2020-10-24 21:36:13, 2020-12-09 21:23:33, 2021-01-20 17:34