In [1]:
import pandas as pd
import numpy as np
import itertools as it

In [2]:

# Load the filtered video dataset and list the columns it provides
source_csv = "../../csv_files/filtered_DE.csv"
full_df = pd.read_csv(source_csv)
full_df.columns

Index(['Unnamed: 0', 'video_id', 'title', 'publishedAt', 'categoryId',
       'trending_date', 'tags', 'view_count', 'likes', 'dislikes',
       'comment_count', 'comments_disabled', 'ratings_disabled'],
      dtype='object')

In [3]:
# Preview the raw tag strings: tags are '|'-separated, "[None]" marks a video without tags
full_df.loc[:, "tags"].head()

0    how to make a curried egg sandwich|curried egg...
1    cake rescue|caek fail|viral cake fails|funny c...
2                                               [None]
3    president|trump|donald|executive|orders|stimul...
4    warner bros|warner brothers|wb|fred hampton|wi...
Name: tags, dtype: object

In [4]:
# Build one list of tags per video, e.g.
#   [tag1, tag2, tag3]
#   [tag4, tag5, tag6]
# Videos whose tag field is the "[None]" placeholder (or missing) are left out.

rawTagStrings = full_df["tags"]
taggedVideos = rawTagStrings[rawTagStrings != "[None]"].dropna()
videosAsListOfTags = taggedVideos.map(lambda joined: joined.split('|'))

videosAsListOfTags.head()

0    [how to make a curried egg sandwich, curried e...
1    [cake rescue, caek fail, viral cake fails, fun...
3    [president, trump, donald, executive, orders, ...
4    [warner bros, warner brothers, wb, fred hampto...
5    [Lyon, Juventus, Juventus vs. Lyon, lyon vs ju...
Name: tags, dtype: object

In [5]:
# Flatten the per-video tag lists into one long Series (one row per tag
# occurrence) and count how often each distinct tag appears, most frequent first.
everyTagOccurrence = videosAsListOfTags.explode()
tagsByPopuarity = everyTagOccurrence.value_counts()

tagsByPopuarity.head()

tags
funny        2523
comedy       1591
minecraft    1504
challenge    1043
vlog          854
Name: count, dtype: int64

In [6]:
# Number of distinct tags in the dataset
tagsByPopuarity.shape[0]

242238

In [7]:

# Restrict the analysis to the most frequent tags and store them alongside their counts
numberOfTags = 500
topXTags = tagsByPopuarity.iloc[:numberOfTags]
topXTags.to_csv("../../csv_files/top_tags.csv")

In [8]:
# Map every top-X tag to its rank, which doubles as its row/column position
# in the adjacency matrix:
# {'funny': 0, 'comedy': 1, 'minecraft': 2, 'challenge': 3, 'vlog': 4, ...}
tagToID = {tag: position for position, tag in enumerate(topXTags.index)}

# Size the matrix from the tags that were actually selected instead of from
# numberOfTags, so the row/column labels still fit when the data holds fewer
# distinct tags than requested.
selectedTagCount = len(tagToID)
pairCounts = np.zeros((selectedTagCount, selectedTagCount), dtype=int)

for vid in videosAsListOfTags:
    # Reduce the video to its top-X tags (keeping their original order) before
    # pairing them up. This produces exactly the pairs that pairing all tags
    # and discarding non-top-X pairs would, without enumerating the discards.
    topIDs = [tagToID[tag] for tag in vid if tag in tagToID]
    for tID1, tID2 in it.combinations(topIDs, 2):
        # add +1 at [earlier tag][later tag] for this video
        pairCounts[tID1, tID2] += 1

# NOTE(review): the counts are directional - cell [a][b] only counts videos that
# list a before b, so the matrix is not symmetric, and a tag repeated within one
# video lands on the diagonal. If an undirected co-occurrence count is intended,
# confirm and combine the matrix with its transpose.
adj_mat = pd.DataFrame(pairCounts, index=topXTags.index, columns=topXTags.index)
adj_mat

tags,funny,comedy,minecraft,challenge,vlog,gaming,news,NBA,highlights,family friendly,...,fwf,Trahan,pikachu,BLACKPINK,playstation 5,nintendo pokemon,TikTok,Ryan,jhope,ping pong
tags,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
funny,0,554,58,261,63,83,2,0,7,113,...,0,0,0,0,0,0,2,0,0,0
comedy,551,1,25,122,31,22,3,0,2,86,...,0,0,0,0,0,0,17,0,0,0
minecraft,342,100,0,336,11,115,0,0,0,191,...,0,0,2,0,0,0,0,0,0,0
challenge,237,121,8,0,20,59,0,1,2,142,...,0,0,0,0,0,0,1,0,0,0
vlog,173,158,12,97,1,4,0,38,1,91,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nintendo pokemon,0,0,0,0,0,0,0,0,0,0,...,0,0,37,0,0,0,0,0,0,0
TikTok,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Ryan,1,0,2,9,0,1,0,0,0,2,...,0,90,0,0,0,0,0,0,0,0
jhope,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Reshape the adjacency matrix into a long edge table with one row per
# (Source, Target, Weight) triple using .melt()
unique_tags = topXTags.keys()

allPairs = (
    adj_mat
    .rename_axis('Source')
    .reset_index()
    .melt('Source', value_name='Weight', var_name='Target')
)

# Keep only real edges: no self-loops and no pairs with a zero count,
# then order them from heaviest to lightest with a fresh 0..n-1 index.
isRealEdge = (allPairs['Source'] != allPairs['Target']) & (allPairs['Weight'] != 0)
adj_df = allPairs[isRealEdge].sort_values('Weight', ascending=False, ignore_index=True)


In [10]:
# Thin out the edge table: for every tag keep only its 5 heaviest edges
# (edges where the tag appears as Source or as Target), then merge the per-tag
# selections and drop edges that were selected more than once.
def _heaviest_edges(edges, tag, limit=5):
    """Return the `limit` heaviest rows of `edges` that have `tag` as Source or Target."""
    outgoing = edges[edges['Source'] == tag]
    incoming = edges[edges['Target'] == tag]
    # Source rows are stacked before Target rows prior to sorting by weight
    touching = pd.concat([outgoing, incoming], axis=0)
    return touching.sort_values('Weight', ascending=False, ignore_index=True).head(limit)


list_of_df = [_heaviest_edges(adj_df, tag) for tag in unique_tags]
adj_df = (
    pd.concat(list_of_df, axis=0, ignore_index=True)
    .drop_duplicates(keep='first', ignore_index=True)
)

adj_df

Unnamed: 0,Source,Target,Weight
0,funny,comedy,554
1,comedy,funny,551
2,minecraft,funny,342
3,funny,challenge,261
4,challenge,funny,237
...,...,...,...
1976,제이홉,jhope,84
1977,RM,jhope,83
1978,jhope,정국,82
1979,ping pong,family friendly,85


In [11]:

# Write the pruned edge table to disk (row index omitted) and preview it
edge_csv_path = "../../csv_files/adj_edge.csv"
adj_df.to_csv(edge_csv_path, index=False)
adj_df.head()
    

Unnamed: 0,Source,Target,Weight
0,funny,comedy,554
1,comedy,funny,551
2,minecraft,funny,342
3,funny,challenge,261
4,challenge,funny,237
