In [31]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import itertools

In [32]:

# Load the filtered DE dataset and show which columns it provides.
source_csv = "../../csv_files/filtered_DE.csv"
full_df = pd.read_csv(source_csv)
full_df.columns

Index(['Unnamed: 0', 'video_id', 'title', 'publishedAt', 'categoryId',
       'trending_date', 'tags', 'view_count', 'likes', 'dislikes',
       'comment_count', 'comments_disabled', 'ratings_disabled'],
      dtype='object')

In [33]:
# Build one list of tags per video, e.g.
#   [tag1, tag2, tag3]
#   [tag4, tag5, tag6]
# Rows whose `tags` value is missing or equals the "[None]" placeholder are
# dropped; the remaining rows keep their original index labels.
# Note: empty strings produced by the split (e.g. from "a||b") are kept as-is.


def _split_tags(tag_string):
    """Split a '|'-separated tag string into a list of tags."""
    return tag_string.split('|')


raw_tags = full_df["tags"]
is_not_placeholder = raw_tags != "[None]"
videosAsListOfTags = raw_tags[is_not_placeholder].dropna().map(_split_tags)

videosAsListOfTags.head()


0    [how to make a curried egg sandwich, curried e...
1    [cake rescue, caek fail, viral cake fails, fun...
3    [president, trump, donald, executive, orders, ...
4    [warner bros, warner brothers, wb, fred hampto...
5    [Lyon, Juventus, Juventus vs. Lyon, lyon vs ju...
Name: tags, dtype: object

In [34]:


# Pair every tag list with the id of its video by aligning both on the
# row index (inner join, so videos without a tag list are left out), then
# persist the result for later use.
temp_df = videosAsListOfTags.to_frame()
tags_by_videoid = full_df[["video_id"]].merge(
    temp_df,
    how="inner",
    left_index=True,
    right_index=True,
)
tags_csv = "../../csv_files/tags_with_id.csv"
tags_by_videoid.to_csv(tags_csv, index=False)
tags_by_videoid.head()

Unnamed: 0,video_id,tags
0,cAtazIk1IYw,"[how to make a curried egg sandwich, curried e..."
1,NYFHnIiA8gE,"[cake rescue, caek fail, viral cake fails, fun..."
3,dO6YihaqtaQ,"[president, trump, donald, executive, orders, ..."
4,sSjtGqRXQ9Y,"[warner bros, warner brothers, wb, fred hampto..."
5,kcynVTnaHw0,"[Lyon, Juventus, Juventus vs. Lyon, lyon vs ju..."


In [35]:
# Count how often each tag occurs across all tag lists (most frequent first).
tagsByPopuarity = videosAsListOfTags.explode().value_counts()

# Keep only the most frequent tags.
# NOTE(review): 501 is taken over unchanged; the reason for this exact cutoff
# is not visible here - confirm.
numberOfTags = 501
topXTags = tagsByPopuarity.iloc[:numberOfTags]

# The same selection as a one-column frame (used for merging the counts later)
# and as a plain list of tag strings.
count_df = topXTags.to_frame()
list_of_tags = topXTags.index.tolist()


In [36]:


# Long format: one row per (video_id, tag) pair.
# explode() replaces the former hand-written nested iterrows loop. Rows that
# would come from an empty tag list (NaN after explode) are dropped so the
# result matches what the loop produced.
new_df = (
    tags_by_videoid[['video_id', 'tags']]
    .explode('tags')
    .dropna(subset=['tags'])
    .reset_index(drop=True)
)

# Restrict to the top tags *before* grouping so that only the tags we keep
# are aggregated; the outcome is the same as grouping everything and
# filtering afterwards.
top_tag_rows = new_df[new_df['tags'].isin(list_of_tags)]

# For every remaining tag, collect the ids of the videos carrying it
# (one entry per row, so a video id may occur several times in a list).
result_df = top_tag_rows.groupby('tags')['video_id'].agg(list).reset_index()

# Keep only the rows of full_df that belong to a video with at least one top tag.
list_of_video_id = set(top_tag_rows['video_id'])
full_df = full_df[full_df['video_id'].isin(list_of_video_id)]

# Attach the occurrence count of each tag; count_df is keyed by the tag
# (its index level is named 'tags').
result_df = pd.merge(result_df, count_df, on='tags', how='outer')

In [37]:
# Assign each tag its most common category and how often the tag occurs.
# Only these two columns of full_df are needed from here on.
full_df = full_df[["video_id", "categoryId"]]

list_of_rows = []
for tag, id_list, count in zip(result_df["tags"], result_df["video_id"], result_df["count"]):
    # Categories of every full_df row whose video carries this tag.
    categories = full_df.loc[full_df["video_id"].isin(id_list), "categoryId"]
    if categories.empty:
        raise ValueError(f"no category rows found for tag {tag!r}")

    # mode() returns the most frequent value(s) in ascending order; taking the
    # first one makes ties deterministic (smallest category id wins) instead
    # of depending on set iteration order.
    category = categories.mode().iloc[0]

    list_of_rows.append({'tag': tag, "category": category, "count": count})

tags_with_data = pd.DataFrame(list_of_rows)

tags_with_data.head(10)


Unnamed: 0,tag,category,count
0,,17,115
1,100 days,20,189
2,100 days hardcore,20,112
3,2020,17,289
4,2021,17,249
5,2022,17,215
6,2023,17,132
7,Action,17,114
8,Alternative,10,106
9,Amber,20,101


In [38]:
# Give each category a color and attach the category's true name.
category_df = pd.read_csv("../../csv_files/category_df.csv")
colormap = [
    '#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4',
    '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff',
    '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1',
    '#000075', '#808080', '#ffffff', '#000000',
]
tags_with_data = tags_with_data.merge(category_df, how='left', left_on='category', right_on='id')

# Assign colors in ascending id order so the mapping is reproducible.
# Tags whose category has no match in category_df keep a missing color.
category_ids = sorted(tags_with_data['id'].dropna().unique())
if len(category_ids) > len(colormap):
    raise IndexError(
        f"colormap has {len(colormap)} colors but {len(category_ids)} categories need one"
    )
color_by_id = dict(zip(category_ids, colormap))

# Write the colors back by plain column assignment: a chained
# `df['id'].replace(..., inplace=True)` does not update the frame when pandas
# Copy-on-Write is active.
tags_with_data['id'] = tags_with_data['id'].map(color_by_id)

# Positional rename: relies on the merged frame having exactly the columns
# tag, category, count, id and one further column from category_df, in that order.
tags_with_data.columns = ["tag","category_id","count","color","category_name"]

In [39]:

# Persist the per-tag node table (without the row index) and preview it.
node_csv = "../../csv_files/node_data.csv"
tags_with_data.to_csv(node_csv, index=False)
tags_with_data.head(5)

Unnamed: 0,tag,category_id,count,color,category_name
0,,17,115,#4363d8,Sports
1,100 days,20,189,#911eb4,Gaming
2,100 days hardcore,20,112,#911eb4,Gaming
3,2020,17,289,#4363d8,Sports
4,2021,17,249,#4363d8,Sports
