In [28]:
import itertools
import os
from collections import Counter

import numpy as np
import pandas as pd
from tqdm import tqdm

In [29]:

# Load filtered_DE.csv into a DataFrame and show which columns it provides.
full_df = pd.read_csv(filepath_or_buffer="../../simulated_postgres/filtered_DE.csv")
full_df.columns

Index(['id', 'video_id', 'title', 'publishedAt', 'categoryId', 'trending_date',
       'tags', 'view_count', 'likes', 'dislikes', 'comment_count',
       'comments_disabled', 'ratings_disabled', 'country'],
      dtype='object')

In [44]:
from sqlalchemy import create_engine

# Connection URL for the Postgres instance. It can be overridden through the
# BIGDATA_DB_URL environment variable so that credentials do not have to be
# edited inside the notebook; the fallback is the original local URL.
engine = create_engine(
    os.environ.get(
        "BIGDATA_DB_URL",
        'postgresql://bigdata:bigdata@localhost:5432/mydatabase',
    )
)

# Read the 'filtered_de' table into a DataFrame.
category_df_postgres = pd.read_sql('filtered_de', engine)


In [30]:
# Build one list of tags per video row, e.g.
#   [tag1, tag2, tag3], [tag4, tag5, tag6], ...
# Rows whose "tags" value is missing or equals the literal string "[None]"
# are dropped; the remaining rows keep their original index labels.
raw_tags = full_df["tags"]
has_real_tags = raw_tags != "[None]"

videosAsListOfTags = (
    raw_tags[has_real_tags]
    .dropna()
    .map(lambda joined: joined.split('|'))
)

videosAsListOfTags.head()


0    [dazn, DAZN, DAZN Fußball, DAZN Sport, #DAZN, ...
1    [Dreamcatcher, 드림캐쳐, 드캐 컴백, 드캐 티저, 드캐 뮤비, 드캐, ...
2      [wir, feiern, Sinans, Geburtstag, vlog, cansin]
3    [lego, deutsch, lego technic, lego technic 421...
4    [rezo, rezo ja lol ey, ja lol ey, tj, tim jacken]
Name: tags, dtype: object

In [31]:


# Pair every video_id with its tag list by joining on the shared row index.
# The merge is an inner join, so rows without a tag list are left out.
temp_df = videosAsListOfTags.to_frame()
tags_by_videoid = full_df[["video_id"]].merge(
    temp_df,
    left_index=True,
    right_index=True,
)
tags_by_videoid.head()

Unnamed: 0,video_id,tags
0,mAVa2KdkdHQ,"[dazn, DAZN, DAZN Fußball, DAZN Sport, #DAZN, ..."
1,liyoVKZuw18,"[Dreamcatcher, 드림캐쳐, 드캐 컴백, 드캐 티저, 드캐 뮤비, 드캐, ..."
2,h9nGHakkK-Q,"[wir, feiern, Sinans, Geburtstag, vlog, cansin]"
3,Ak7jMdlzTBk,"[lego, deutsch, lego technic, lego technic 421..."
4,W97km1KbcLY,"[rezo, rezo ja lol ey, ja lol ey, tj, tim jacken]"


In [32]:
# Count how often each individual tag occurs across all tag lists
# (value_counts sorts the most frequent tags first).
tagsByPopuarity = videosAsListOfTags.explode().value_counts()

# Keep only the first `numberOfTags` entries of that ranking.
numberOfTags = 501
topXTags = tagsByPopuarity.iloc[:numberOfTags]

# The same selection as a DataFrame and as a plain list of tag labels.
count_df = topXTags.to_frame()
list_of_tags = topXTags.index.tolist()


In [33]:


# Flatten the per-video tag lists into one (video_id, tag) row per tag.
# DataFrame.explode replaces the former Python-level double loop over
# iterrows(); the index is reset so rows are numbered 0..n-1 as before.
new_df = (
    tags_by_videoid[["video_id", "tags"]]
    .explode("tags")
    .reset_index(drop=True)
)

# For every tag, collect the video_ids that carry it (duplicates are kept).
result_df = new_df.groupby('tags')['video_id'].agg(list).reset_index()

# Keep only the tags contained in list_of_tags.
result_df = result_df[result_df['tags'].isin(list_of_tags)]

# Restrict full_df to the videos that use at least one of the remaining tags.
list_of_video_id = set(itertools.chain.from_iterable(result_df['video_id']))
full_df = full_df[full_df['video_id'].isin(list_of_video_id)]

# Attach the columns of count_df to each tag row.
result_df = pd.merge(result_df, count_df, on='tags', how='outer')

In [34]:
# Assign each tag its most common category and carry along its "count" value.
list_of_rows = []
full_df = full_df[["video_id", "categoryId"]]
for _, tag_row in result_df.iterrows():
    tag = tag_row["tags"]
    id_list = tag_row["video_id"]
    count = tag_row["count"]

    # categoryId of every full_df row whose video_id is in this tag's id list.
    # Taken straight from the column instead of a second iterrows() loop that
    # reused the outer loop's variable names.
    uses_tag = full_df["video_id"].isin(id_list)
    category_list = full_df.loc[uses_tag, "categoryId"].tolist()

    # Count once with Counter instead of calling list.count per candidate.
    # Iterating set(category_list) keeps the original tie-breaking order.
    category_counts = Counter(category_list)
    category = max(set(category_list), key=category_counts.__getitem__)

    list_of_rows.append({'tag': tag, "category": category, "count": count})

tags_with_data = pd.DataFrame(list_of_rows)

tags_with_data.head(10)


Unnamed: 0,tag,category,count
0,,17,223
1,2020,24,377
2,2021,24,458
3,2022,24,413
4,2023,24,255
5,50667,24,210
6,7 vs wild,24,166
7,918,2,303
8,ARD,24,209
9,Abgasanlage,2,305


In [35]:
# Attach each category's name (from category_df) and give every category a color.
category_df = pd.read_csv("../../simulated_postgres/category_df.csv")
print(category_df.columns)
colormap = [
    '#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4',
    '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff',
    '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1',
    '#000075', '#808080', '#ffffff', '#000000',
]
tags_with_data = tags_with_data.merge(category_df, how='left', left_on='category', right_on='id')

# Map every distinct category id to one palette entry, enumerating the ids in
# the same set order as before. The palette wraps around if there are more
# categories than colors instead of raising IndexError. Ids that found no match
# in category_df (NaN after the left merge) are skipped and stay NaN.
color_by_id = {
    category_id: colormap[position % len(colormap)]
    for position, category_id in enumerate(set(tags_with_data["id"].dropna()))
}

# Assign the mapped column back explicitly: an in-place replace on the selected
# column is a chained operation that does not reliably modify the frame
# under pandas Copy-on-Write.
tags_with_data["id"] = tags_with_data["id"].map(color_by_id)

# Rename by column name rather than by position so the result does not depend
# on the exact column order produced by the merge.
tags_with_data = tags_with_data.rename(
    columns={"category": "category_id", "id": "color", "title": "category_name"}
)

Index(['id', 'title'], dtype='object')


In [37]:

# Write the node table to node_data.csv without the index column, then preview it.
tags_with_data.to_csv(
    path_or_buf="../../simulated_postgres/node_data.csv",
    index=False,
)
tags_with_data.head(n=5)

Unnamed: 0,tag,category_id,count,color,category_name
0,,17,223,#ffe119,Sports
1,2020.0,24,377,#f032e6,Entertainment
2,2021.0,24,458,#f032e6,Entertainment
3,2022.0,24,413,#f032e6,Entertainment
4,2023.0,24,255,#f032e6,Entertainment


In [42]:
from sqlalchemy import create_engine

# Connection URL for the Postgres instance. It can be overridden through the
# BIGDATA_DB_URL environment variable so that credentials do not have to be
# edited inside the notebook; the fallback is the original local URL.
engine = create_engine(
    os.environ.get(
        "BIGDATA_DB_URL",
        'postgresql://bigdata:bigdata@localhost:5432/mydatabase',
    )
)

# Write the node table to the 'node_data' table without the index column;
# if_exists='replace' replaces a table of that name if one already exists.
tags_with_data.to_sql('node_data', engine, index=False, if_exists='replace')

501