In [None]:

import pytz
import json
import os
import warnings

import pandas as pd

import helper
from youtube.config import youtube_watch_history_file, video_details_download_folder, output_dir_unique

# Install a global "ignore" filter: every warning raised after this point is hidden.
warnings.filterwarnings('ignore')

# Load the watch-history export into a DataFrame.
df = pd.read_json(youtube_watch_history_file)

# Extract the video id from the `v` query parameter of titleUrl.
# The capture stops at the next '&' or '#', so trailing parameters such as
# '&t=42s' or '&list=...' do not leak into the id. expand=False returns a Series.
df['video_id'] = df['titleUrl'].str.extract(r'[?&]v=([^&#]+)', expand=False)

# Keep rows where 'details' is NaN (per the original author, rows with 'details' are ads).
# The column is only filtered on when it exists, so a file without it does not raise KeyError.
if 'details' in df.columns:
    df = df[df['details'].isna()]
# reset_index() without drop=True keeps the pre-filter row label as an 'index' column,
# which is therefore also written to the CSV below.
df = df.reset_index()

# Collect the ids of all remaining rows (rows without a matching URL contribute NaN).
video_ids = df['video_id'].tolist()
print(f"total videos: {len(video_ids)}")

# Parse 'time' as timezone-aware UTC. utc=True guarantees tz-aware values, which
# tz_convert below requires (it raises on tz-naive data).
df['time'] = pd.to_datetime(df['time'], utc=True, errors='coerce')

# errors='coerce' turns unparseable values into NaT silently, so report how many there are.
# NOTE(review): on pandas >= 2.0, timestamps whose layout differs from the first parsed
# value may be coerced to NaT; if this count is non-zero consider format='ISO8601'
# -- confirm against the installed pandas version.
unparsed_times = int(df['time'].isna().sum())
if unparsed_times:
    print(f"warning: {unparsed_times} rows have a missing or unparseable 'time' (NaT)")

# Convert the 'time' column to Berlin local time.
berlin_tz = pytz.timezone('Europe/Berlin')
df['time'] = df['time'].dt.tz_convert(berlin_tz)

# Persist the cleaned history, then display the frame as the cell output.
df.to_csv(os.path.join(output_dir_unique, 'watch_history.csv'), index=False)
df

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Make sure 'time' holds datetimes (anything unparseable becomes NaT).
df['time'] = pd.to_datetime(df['time'], errors='coerce')

# Hour of day for every watch event.
df['hour'] = df['time'].dt.hour

# One row per hour that occurs in the data: columns 'hour' and 'video_count'.
videos_per_hour = (
    df.groupby('hour')
    .size()
    .reset_index(name='video_count')
)

# Bar chart of watch events per hour, drawn through the explicit Axes interface.
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(videos_per_hour['hour'], videos_per_hour['video_count'], color='skyblue')
ax.set_xlabel('Hour of Day')
ax.set_ylabel('Number of Videos Streamed')
ax.set_title('Number of Videos Streamed per Hour of Day')
ax.set_xticks(range(24))  # label every hour 0..23, including hours with no bar
ax.grid(True)

# Write the figure to the output directory, then render it inline.
hourly_png = os.path.join(output_dir_unique, 'videos_streamed_per_hour.png')
fig.savefig(hourly_png, bbox_inches='tight')
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Make sure 'time' holds datetimes (anything unparseable becomes NaT).
df['time'] = pd.to_datetime(df['time'], errors='coerce')

# Calendar year and month of every watch event.
watch_time = df['time'].dt
df['year'] = watch_time.year
df['month'] = watch_time.month

# Number of watch events for each (year, month) pair present in the data.
videos_per_year_month = (
    df.groupby(['year', 'month'])
    .size()
    .reset_index(name='video_count')
)

# Reshape to months as rows and years as columns so each year becomes its own bar series.
pivot_table = videos_per_year_month.pivot(index='month', columns='year', values='video_count')

# Grouped bar chart: one cluster per month, one bar per year.
fig, ax = plt.subplots(figsize=(12, 6))
pivot_table.plot(kind='bar', colormap='viridis', ax=ax)
ax.set_xlabel('Month')
ax.set_ylabel('Number of Videos Streamed')
ax.set_title('Total Videos Streamed per Month and Year')
ax.legend(title='Year')
ax.grid(True)

# Write the figure to the output directory, then render it inline.
monthly_png = os.path.join(output_dir_unique, 'videos_streamed_per_month.png')
fig.savefig(monthly_png, bbox_inches='tight')
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Make sure 'time' holds datetimes (anything unparseable becomes NaT).
df['time'] = pd.to_datetime(df['time'], errors='coerce')

# Calendar year of every watch event.
df['year'] = df['time'].dt.year

# Number of watch events per year: columns 'year' and 'video_count'.
videos_per_year = df.groupby('year').size().reset_index(name='video_count')
# groupby drops missing keys by default, so every remaining year is a real value
# and can be cast to int (keeps tick labels free of a trailing '.0').
videos_per_year['year'] = videos_per_year['year'].astype(int)

# Bar chart of watch events per year.
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(videos_per_year['year'], videos_per_year['video_count'], color='skyblue', width=0.8, align='center')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Videos Streamed')
ax.set_title('Total Videos Streamed per Year')
# Put one tick on each year instead of leaving tick placement to the automatic locator.
ax.set_xticks(videos_per_year['year'])
ax.grid(True)

# Add a linear trendline. A straight-line fit needs at least two points: with fewer,
# np.polyfit either fails (no data) or is rank-deficient (one year), so skip it.
if len(videos_per_year) >= 2:
    z = np.polyfit(videos_per_year['year'], videos_per_year['video_count'], 1)
    p = np.poly1d(z)
    ax.plot(videos_per_year['year'], p(videos_per_year['year']), color='red', linestyle='--', label='Trendline')
    ax.legend()

# Write the figure to the output directory, then render it inline.
fig.savefig(os.path.join(output_dir_unique, 'videos_streamed_per_year.png'), bbox_inches='tight')
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.colors as mcolors

# Make sure 'time' holds datetimes (anything unparseable becomes NaT).
df['time'] = pd.to_datetime(df['time'], errors='coerce')

# Calendar year and month of every watch event.
df['year'] = df['time'].dt.year
df['month'] = df['time'].dt.month

# Number of watch events for each (year, month) pair present in the data.
videos_per_year_month = df.groupby(['year', 'month']).size().reset_index(name='video_count')

# Length of each month in days, computed in one vectorised step: assemble the first
# day of every (year, month) and read days_in_month through the .dt accessor,
# instead of constructing a pd.Period row by row.
month_starts = pd.to_datetime(videos_per_year_month[['year', 'month']].assign(day=1))
videos_per_year_month['days_in_month'] = month_starts.dt.days_in_month

# Average watch events per calendar day. The divisor is always the full month length,
# so months only partly covered by the data are averaged over the whole month as well.
videos_per_year_month['avg_videos_per_day'] = (
    videos_per_year_month['video_count'] / videos_per_year_month['days_in_month']
)

# Reshape to months as rows and years as columns so each year becomes its own bar series.
pivot_table = videos_per_year_month.pivot(index='month', columns='year', values='avg_videos_per_day')

# One viridis colour per year column, evenly spaced across the colormap.
colors = plt.cm.viridis(np.linspace(0, 1, len(pivot_table.columns)))

# Grouped bar chart: one cluster per month, one bar per year.
fig, ax = plt.subplots(figsize=(12, 6))
pivot_table.plot(kind='bar', color=colors, ax=ax)
ax.set_xlabel('Month')
ax.set_ylabel('Average Number of Videos Streamed per Day')
ax.set_title('Average Videos Streamed per Day per Month and Year')
ax.legend(title='Year')
ax.grid(True)

# Write the figure to the output directory, then render it inline.
fig.savefig(os.path.join(output_dir_unique, 'avg_videos_streamed_per_day_per_month.png'), bbox_inches='tight')
plt.show()