In [90]:
import pandas as pd

history = pd.read_csv('data/unique_video_browsing_history_full_tiktok_data.csv')
history.sort_values(by=['Date'], inplace=True)

# Put the hashtags in lower case from history
history['Hashtags'] = history['Hashtags'].str.lower()

# Create an dataframe with which row being a single hashtag and the count of the hashtag with the columns names: Hashtag and Count
hashtags = history['Hashtags'].str.split(',', expand=True).stack().value_counts().rename_axis('Hashtag').reset_index(name='Count')

hashtags.head(10)

Unnamed: 0,Hashtag,Count
0,#fyp,21735
1,#foryou,9952
2,#viral,7706
3,#foryoupage,7611
4,#edit,7347
5,#fypシ,6398
6,#batman,5114
7,#taylorswift,5059
8,#dccomics,5011
9,#dc,4836


In [91]:
#remove # from hashtags
hashtags['Hashtag'] = hashtags['Hashtag'].str.replace('#', '')

In [92]:
# Remove fyp

pattern = r'(fy\w+|fory\w+|fy|xyzbca|viral|trend\w+)'

# Remove row with re patterns from hashtags['Hashtags']
hashtags = hashtags[~hashtags['Hashtag'].str.contains(pattern)]

hashtags.head(10)


  hashtags = hashtags[~hashtags['Hashtag'].str.contains(pattern)]


Unnamed: 0,Hashtag,Count
4,edit,7347
6,batman,5114
7,taylorswift,5059
8,dccomics,5011
9,dc,4836
11,taylorswift,4211
12,marvel,3959
13,brucewayne,3922
14,swifttok,3149
16,erastour,2777


In [93]:
pattern1 = r'(tiktok|tok|toker)$'
pattern2 = r'^(tiktok|tok|toker)'

hashtags['Hashtag'] = hashtags['Hashtag'].str.replace(pattern1, '')
hashtags['Hashtag'] = hashtags['Hashtag'].str.replace(pattern2, '')


  hashtags['Hashtag'] = hashtags['Hashtag'].str.replace(pattern1, '')
  hashtags['Hashtag'] = hashtags['Hashtag'].str.replace(pattern2, '')


In [94]:
hashtags['Hashtag'] = hashtags['Hashtag'].str.replace(' ', '')

In [95]:
hashtags.head(10)

Unnamed: 0,Hashtag,Count
4,edit,7347
6,batman,5114
7,taylorswift,5059
8,dccomics,5011
9,dc,4836
11,taylorswift,4211
12,marvel,3959
13,brucewayne,3922
14,swift,3149
16,erastour,2777


In [96]:
import wordninja as wn

# Create a new column with the hashtags splitted by words
hashtags['Hashtags_split'] = hashtags['Hashtag'].apply(lambda x: wn.split(x))
hashtags.reset_index(drop=True, inplace=True)

hashtags.head(10)

Unnamed: 0,Hashtag,Count,Hashtags_split
0,edit,7347,[edit]
1,batman,5114,[batman]
2,taylorswift,5059,"[taylor, swift]"
3,dccomics,5011,"[dc, comics]"
4,dc,4836,[dc]
5,taylorswift,4211,"[taylor, swift]"
6,marvel,3959,[marvel]
7,brucewayne,3922,"[bruce, wayne]"
8,swift,3149,[swift]
9,erastour,2777,"[eras, tour]"


In [97]:

# Remove stop words
from nltk.corpus import stopwords

stop = stopwords.words('english')
hashtags['Hashtags_split'] = hashtags['Hashtags_split'].apply(lambda x: [item for item in x if item not in stop])

hashtags.head(10)


Unnamed: 0,Hashtag,Count,Hashtags_split
0,edit,7347,[edit]
1,batman,5114,[batman]
2,taylorswift,5059,"[taylor, swift]"
3,dccomics,5011,"[dc, comics]"
4,dc,4836,[dc]
5,taylorswift,4211,"[taylor, swift]"
6,marvel,3959,[marvel]
7,brucewayne,3922,"[bruce, wayne]"
8,swift,3149,[swift]
9,erastour,2777,"[eras, tour]"


In [98]:
from collections import defaultdict

# Create a dictionary to aggregate counts
hashtags_counts = defaultdict(int)
hashtags_list = hashtags["Hashtag"]

for i in range(len(hashtags)):
    key = tuple(hashtags['Hashtags_split'][i])

    # Check if the Hashtags_split lists are exactly the same
    if key in hashtags_counts:
        hashtags_counts[key] += hashtags['Count'][i]
        hashtags_list = hashtags_list.drop(i)
    else:
        hashtags_counts[key] = hashtags['Count'][i]


# Create a new DataFrame from the aggregated counts
aggregated_data = {
    'Hashtag': hashtags_list,
    'Hashtags_split': [list(key) for key in hashtags_counts.keys()],
    'Count': list(hashtags_counts.values())
}

aggregated_df = pd.DataFrame(aggregated_data)

# Display the aggregated DataFrame
print(aggregated_df)


                             Hashtag                      Hashtags_split  \
0                               edit                              [edit]   
1                             batman                            [batman]   
2                        taylorswift                     [taylor, swift]   
3                           dccomics                        [dc, comics]   
4                                 dc                                [dc]   
...                              ...                                 ...   
86904                smallartisthelp               [small, artist, help]   
86905                     artsupport                      [art, support]   
86906  anthonymackieandsebastianstan  [anthony, mackie, sebastian, stan]   
86908                   wyldeflowers                    [wylde, flowers]   
86909                      slashfans                       [slash, fans]   

       Count  
0       7762  
1       7533  
2       9715  
3       5422  
4       5809

In [99]:
from sklearn.metrics import jaccard_score
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster

# Calculate Jaccard similarity between hashtags
def jaccard_similarity(set1, set2):
    if len(set1 | set2) == 0:
        return 0
    return len(set1 & set2) / len(set1 | set2)

similarities = []
hashtags_list = aggregated_df['Hashtags_split'][:10000].tolist()
for i in range(len(hashtags_list)):
    row = []
    for j in range(len(hashtags_list)):
        similarity = jaccard_similarity(set(hashtags_list[i]), set(hashtags_list[j]))
        row.append(similarity)
    similarities.append(row)


similarity_matrix = pd.DataFrame(similarities, columns=aggregated_df['Hashtag'][:10000], index=aggregated_df['Hashtag'][:10000])

  hashtags_list = aggregated_df['Hashtags_split'][:10000].tolist()
  similarity_matrix = pd.DataFrame(similarities, columns=aggregated_df['Hashtag'][:10000], index=aggregated_df['Hashtag'][:10000])


In [100]:
similarity_matrix

Hashtag,edit,batman,taylorswift,dccomics,dc,marvel,brucewayne,swift,erastour,swiftie,...,totaldramaedit,lizzoedit,tessservopoulos,aeazkaban,fifa,productreview,comicbookcollecting,starkid,customkeyboard,alanzaveri
Hashtag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
edit,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.333333,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
batman,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
taylorswift,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.5,0.0,0.333333,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
dccomics,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
dc,0.0,0.0,0.0,0.5,1.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
productreview,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
comicbookcollecting,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
starkid,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
customkeyboard,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [12]:
# Failed attempt - not in use
from scipy.cluster.hierarchy import linkage, fcluster
import pandas as pd

# Assuming you already have the similarity_matrix DataFrame

# Convert the similarity matrix to a condensed distance matrix
distance_matrix = 1 - similarity_matrix

# Perform hierarchical clustering
linkage_matrix = linkage(distance_matrix, method='average')

# Set a threshold for forming clusters
threshold = 0.99 # Adjust as needed
clusters = fcluster(linkage_matrix, threshold, criterion='distance')

sample_fifty = aggregated_df[:1000]

# Add the cluster labels to the DataFrame
sample_fifty['Cluster'] = clusters

# Display the DataFrame with cluster labels
print(sample_fifty[['Hashtag', 'Cluster']])

In [102]:
import pandas as pd

# Assuming you already have the similarity_matrix DataFrame

# Set a similarity threshold
threshold = 0.35 # Adjust as needed
# 0.49 is good
# 0.1 is bad
# 0.5 is also bad
# 0.3 has cons and pros

# Initialize an empty list to store clusters
clusters = []

# Iterate through the similarity matrix and create clusters
for i in range(len(similarity_matrix)):
    cluster = [similarity_matrix.index[i]]  # Start a new cluster with the current hashtag
    for j in range(i + 1, len(similarity_matrix)):
        if similarity_matrix.iloc[i, j] > threshold:
            cluster.append(similarity_matrix.index[j])  # Add the hashtag to the cluster
    clusters.append(cluster)

sample_fifty = aggregated_df[:10000]

# # Add the cluster labels to the DataFrame
sample_fifty['Cluster'] = clusters
# 
# # Display the DataFrame with cluster labels
print(sample_fifty[['Hashtag', 'Cluster']])

# Create a DataFrame with cluster labels
# cluster_df = pd.DataFrame({'Hashtag': similarity_matrix.index, 'Cluster': clusters})

# Merge the cluster information with the sample_fifty DataFrame
# sample_fifty = pd.merge(sample_fifty, cluster_df, on='Hashtag', how='left')

# Display the DataFrame with cluster labels
# print(sample_fifty[['Hashtag', 'Cluster']])

                   Hashtag                                            Cluster
0                     edit  [edit, batmanedit, dcedit, supermanedit, spide...
1                   batman  [batman, batmanedit, batmanbeyond, batmanandro...
2              taylorswift  [taylorswift, swift, taylorswiftedit, erastour...
3                 dccomics  [dccomics, dc, comics, dccomicsedit, dccomicsu...
4                       dc  [dc, dcuniverse, dcedit, dcau, dcanimated, dce...
...                    ...                                                ...
11995        productreview                                    [productreview]
11997  comicbookcollecting                              [comicbookcollecting]
11998              starkid                                          [starkid]
11999       customkeyboard                                   [customkeyboard]
12000           alanzaveri                                       [alanzaveri]

[10000 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_fifty['Cluster'] = clusters


In [103]:
clusters = sample_fifty[['Hashtag', 'Cluster']]
clusters.to_csv('data/hashtags_clusters.csv', index=False)

In [145]:
clusters = pd.read_csv('data/hashtags_clusters.csv')

In [146]:
# order the clusters dataframe by the number of hashtags in each cluster
clusters_asc = pd.read_csv('data/hashtags_clusters.csv')
clusters_desc = pd.read_csv('data/hashtags_clusters.csv')

clusters_asc['Count'] = clusters_asc['Cluster'].str.split(',').str.len()
clusters_desc['Count'] = clusters_desc['Cluster'].str.split(',').str.len()

clusters_asc.sort_values(by=['Count'], inplace=True, ascending=True)
clusters_desc.sort_values(by=['Count'], inplace=True, ascending=False)
clusters_asc.drop(columns=['Count'], inplace=True)
clusters_desc.drop(columns=['Count'], inplace=True)

# clusters.head(10)


In [147]:
# get the row outside of the list
clusters_desc['Cluster'] = clusters_desc['Cluster'].str.replace('[', '')
clusters_desc['Cluster'] = clusters_desc['Cluster'].str.replace(']', '')
clusters_desc['Cluster'] = clusters_desc['Cluster'].str.replace(" ", '')

clusters_asc['Cluster'] = clusters_asc['Cluster'].str.replace('[', '')
clusters_asc['Cluster'] = clusters_asc['Cluster'].str.replace(']', '')
clusters_asc['Cluster'] = clusters_asc['Cluster'].str.replace(" ", '')

  clusters_desc['Cluster'] = clusters_desc['Cluster'].str.replace('[', '')
  clusters_desc['Cluster'] = clusters_desc['Cluster'].str.replace(']', '')
  clusters_asc['Cluster'] = clusters_asc['Cluster'].str.replace('[', '')
  clusters_asc['Cluster'] = clusters_asc['Cluster'].str.replace(']', '')


In [148]:
set(clusters_desc['Cluster'][3673].split(',')).issubset(clusters_desc['Cluster'][7].split(','))

True

In [150]:
# Assuming clusters_asc and clusters_desc are your DataFrames
indexes_to_drop = []
for index, row in clusters_asc.iterrows():
    mask = clusters_desc['Cluster'].apply(lambda x: set(row['Cluster'].split(',')).issubset(set(x.split(','))))
    mask[index] = False  # Exclude the same index
    matching_clusters = clusters_desc[mask]

    if not matching_clusters.empty:
       print(row['Cluster'])
       print(index)
       indexes_to_drop.append(index)

'audios'
6562
'anewhope'
6561
'reddagger'
6560
'melaniewalkeredit'
6559
'genshin'
6558
'disneystore'
6557
'darkside'
6555
'matchingoutfits'
6553
'obikin'
6552
'ponmeenparati'
6550
'tlofuedit'
6549
'legends'
6547
'joeledit'
6538
'13goingon30'
6537
'bellaswanedit'
6533
'successionedit'
6531
'rebecca'
6530
'ninthdoctor'
6528
'barryalen'
6608
'momworkout'
6607
'sketchbook'
6603
'buggyonepiece'
6601
'xenkyendaredit'
6599
'elliewilliamscosplay'
6597
'telltalejoker'
6587
'arieledit'
6586
'rebelsedit'
6581
'mackiswm'
6578
'ucmmarvel'
6576
'johncenadancing'
6575
'tsitpedit'
6573
'cakedecorating'
6590
'spidermangame'
6522
'makeuproutine'
6473
'flyinggraysons'
6472
'halloweencostume'
6471
'makeupartist'
6470
'danielricciardoedit'
6462
'highschoolsweethearts'
6457
'paulandian'
6474
'hellstar'
6453
'pinkhalloween'
6451
'stonymarvel'
6447
'lyricsvideo'
6443
'savethewinchesters'
6442
'brooklynninenineedit'
6441
'charleshadensavageedit'
6440
'learnprogramming'
6454
'robinbuckley'
6519
'erastickets'
65

In [151]:
for index in indexes_to_drop:
    clusters.drop(index, inplace=True)

In [None]:
clusters.to_csv('data/hashtags_clusters_clean.csv', index=False)

In [149]:
# Check if elements of CLuster exist in other clusters
indexes_to_drop = []
for index, row in clusters_asc.iterrows():
    for index2, row2 in clusters_desc.iterrows():
        if index != index2:
            if set(row['Cluster'].split(',')).issubset(set(row2['Cluster'].split(','))):
                print(row['Cluster'])
                print(index)
                indexes_to_drop.append(index)
                #clusters.drop(index, inplace=True)
                break

'audios'
6562
'anewhope'
6561
'reddagger'
6560
'melaniewalkeredit'
6559
'genshin'
6558
'disneystore'
6557
'darkside'
6555
'matchingoutfits'
6553
'obikin'
6552
'ponmeenparati'
6550
'tlofuedit'
6549
'legends'
6547
'joeledit'
6538
'13goingon30'
6537
'bellaswanedit'
6533
'successionedit'
6531
'rebecca'
6530
'ninthdoctor'
6528
'barryalen'
6608
'momworkout'
6607
'sketchbook'
6603
'buggyonepiece'
6601
'xenkyendaredit'
6599
'elliewilliamscosplay'
6597
'telltalejoker'
6587
'arieledit'
6586
'rebelsedit'
6581
'mackiswm'
6578
'ucmmarvel'
6576
'johncenadancing'
6575
'tsitpedit'
6573
'cakedecorating'
6590
'spidermangame'
6522
'makeuproutine'
6473
'flyinggraysons'
6472
'halloweencostume'
6471
'makeupartist'
6470
'danielricciardoedit'
6462
'highschoolsweethearts'
6457
'paulandian'
6474
'hellstar'
6453
'pinkhalloween'
6451
'stonymarvel'
6447
'lyricsvideo'
6443
'savethewinchesters'
6442
'brooklynninenineedit'
6441
'charleshadensavageedit'
6440
'learnprogramming'
6454
'robinbuckley'
6519
'erastickets'
65

KeyboardInterrupt: 

In [None]:
# make a list of hashtags with the same cluster
cluster_list = []
for i in range(1, 50):
    cluster_list.append(sample_fifty[sample_fifty['Cluster'] == i]['Hashtag'].tolist())

# print the list of hashtags with the same cluster
for cluster in cluster_list:
    print(cluster)


[' dc', ' dcuniverse', ' dcau']
[' dcedit']
[' dccomics']
[' comics', ' marvelcomics']
[' marvel']
[' edit', ' batmanedit']
[' batman']
[' taylorswift', ' swift', ' swiftie']
[' taylornation']
[' barbie', ' barbiemovie']
[' tvd', ' tvdu']
[' tedlassoedit', 'tedlasso']
[' brucewayne']
[' brucewayneedit']
[' erastour']
[' spiderman']
[' trending']
[' taylorsversion']
[' mcu']
[' superman']
[' thevampirediaries']
[' starwars']
[' robin']
[' peterparker']
[' aftereffects']
['greenscreen']
[' batfamily']
[' capcut']
[' pedropascal']
[' klausmikaelson']
[' clarkkent']
[' comic']
[' theoriginals']
[' stefansalvatore']
[' nightwing']
[' funny']
[' xmen']
[' parati']
[' milesmorales']
[' edits']
[' trend']
[]
[]
[]
[]
[]
[]
[]
[]


In [None]:
"""
Hierarchical Clustering
from scipy.cluster.hierarchy import linkage, dendrogram
import matplotlib.pyplot as plt

# Perform hierarchical clustering
linkage_matrix = linkage(similarity_matrix, method='ward')

# Create a dendrogram
dendrogram(linkage_matrix, labels=df['Hashtag'].tolist(), orientation='right')
plt.title('Hierarchical Clustering Dendrogram')
plt.show()
"""

"""
Determine Clusters
from scipy.cluster.hierarchy import fcluster

# Determine clusters
threshold = 0.2  # Adjust the threshold as needed
clusters = fcluster(linkage_matrix, threshold, criterion='distance')

# Add the cluster labels to the DataFrame
df['Cluster'] = clusters

# Display the DataFrame with cluster labels
print(df[['Hashtag', 'Cluster']])
"""