In [90]:
import pandas as pd

# Load the browsing history ordered by date and lower-case the hashtag text so
# that differently-cased spellings of a tag are counted together.
history = pd.read_csv('data/unique_video_browsing_history_full_tiktok_data.csv').sort_values(by=['Date'])
history = history.assign(Hashtags=history['Hashtags'].str.lower())

# One row per distinct hashtag with its number of occurrences (columns: Hashtag, Count).
# Tokens are split on ',' only; any surrounding whitespace is left in place here.
hashtag_tokens = history['Hashtags'].str.split(',').explode()
hashtags = hashtag_tokens.value_counts().rename_axis('Hashtag').reset_index(name='Count')

hashtags.head(10)

Unnamed: 0,Hashtag,Count
0,#fyp,21735
1,#foryou,9952
2,#viral,7706
3,#foryoupage,7611
4,#edit,7347
5,#fypシ,6398
6,#batman,5114
7,#taylorswift,5059
8,#dccomics,5011
9,#dc,4836


In [91]:
# Drop the leading '#' marker; the character is matched literally, not as a regex.
hashtags['Hashtag'] = hashtags['Hashtag'].str.replace('#', '', regex=False)

In [92]:
# Remove generic discovery tags (fyp, foryou, viral, trending, ...).
# The group is non-capturing: with a capturing group, str.contains emits a
# UserWarning about match groups even though only a boolean mask is wanted.
# NOTE(review): str.contains matches anywhere in the tag, so every hashtag that
# merely contains "fy", "viral" or "trend" + a letter is dropped as well —
# confirm this is intended.
pattern = r'(?:fy\w+|fory\w+|fy|xyzbca|viral|trend\w+)'

# Keep only the rows that do not match. .copy() makes the result an independent
# frame, so later column assignments do not operate on a slice of the original.
hashtags = hashtags[~hashtags['Hashtag'].str.contains(pattern, regex=True)].copy()

hashtags.head(10)


  hashtags = hashtags[~hashtags['Hashtag'].str.contains(pattern)]


Unnamed: 0,Hashtag,Count
4,edit,7347
6,batman,5114
7,taylorswift,5059
8,dccomics,5011
9,dc,4836
11,taylorswift,4211
12,marvel,3959
13,brucewayne,3922
14,swifttok,3149
16,erastour,2777


In [93]:
# Strip a trailing "tiktok" / "tok" / "toker", then the same words as a prefix
# (e.g. "swifttok" -> "swift").
# regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave every hashtag unchanged.
pattern1 = r'(tiktok|tok|toker)$'
pattern2 = r'^(tiktok|tok|toker)'

hashtags['Hashtag'] = hashtags['Hashtag'].str.replace(pattern1, '', regex=True)
hashtags['Hashtag'] = hashtags['Hashtag'].str.replace(pattern2, '', regex=True)


  hashtags['Hashtag'] = hashtags['Hashtag'].str.replace(pattern1, '')
  hashtags['Hashtag'] = hashtags['Hashtag'].str.replace(pattern2, '')


In [94]:
# Remove every space character from the hashtag text (literal match).
hashtags['Hashtag'] = hashtags['Hashtag'].str.replace(' ', '', regex=False)

In [95]:
# Preview the first ten hashtags after the cleanup steps above.
hashtags.head(10)

Unnamed: 0,Hashtag,Count
4,edit,7347
6,batman,5114
7,taylorswift,5059
8,dccomics,5011
9,dc,4836
11,taylorswift,4211
12,marvel,3959
13,brucewayne,3922
14,swift,3149
16,erastour,2777


In [96]:
import wordninja as wn

# Add a Hashtags_split column holding the word list wordninja produces for each
# hashtag (e.g. "taylorswift" -> ["taylor", "swift"]), then renumber rows from 0.
hashtags = hashtags.assign(Hashtags_split=hashtags['Hashtag'].map(wn.split)).reset_index(drop=True)

hashtags.head(10)

Unnamed: 0,Hashtag,Count,Hashtags_split
0,edit,7347,[edit]
1,batman,5114,[batman]
2,taylorswift,5059,"[taylor, swift]"
3,dccomics,5011,"[dc, comics]"
4,dc,4836,[dc]
5,taylorswift,4211,"[taylor, swift]"
6,marvel,3959,[marvel]
7,brucewayne,3922,"[bruce, wayne]"
8,swift,3149,[swift]
9,erastour,2777,"[eras, tour]"


In [97]:

# Remove stop words
from nltk.corpus import stopwords

stop = stopwords.words('english')
# Membership is tested once per word of every hashtag, so look words up in a set
# instead of scanning the stop-word collection each time. `stop` is left untouched.
stop_lookup = set(stop)
hashtags['Hashtags_split'] = hashtags['Hashtags_split'].apply(
    lambda words: [word for word in words if word not in stop_lookup]
)

hashtags.head(10)


Unnamed: 0,Hashtag,Count,Hashtags_split
0,edit,7347,[edit]
1,batman,5114,[batman]
2,taylorswift,5059,"[taylor, swift]"
3,dccomics,5011,"[dc, comics]"
4,dc,4836,[dc]
5,taylorswift,4211,"[taylor, swift]"
6,marvel,3959,[marvel]
7,brucewayne,3922,"[bruce, wayne]"
8,swift,3149,[swift]
9,erastour,2777,"[eras, tour]"


In [98]:
from collections import defaultdict

# Merge rows whose word splits are identical: their counts are summed and the
# hashtag text of the first row seen for each split is kept.
hashtags_counts = defaultdict(int)
duplicate_labels = []  # index labels of rows that were folded into an earlier row

# Iterate over labels and values together so the loop does not depend on the
# frame having a contiguous 0..n-1 index.
for label, words, count in zip(hashtags.index, hashtags['Hashtags_split'], hashtags['Count']):
    key = tuple(words)  # lists are unhashable; a tuple can be used as a dict key
    if key in hashtags_counts:
        duplicate_labels.append(label)
    hashtags_counts[key] += count

# Drop all merged rows in a single call; dropping inside the loop copies the
# Series once per duplicate, which is quadratic on a large frame.
hashtags_list = hashtags['Hashtag'].drop(duplicate_labels)

# Create a new DataFrame from the aggregated counts. The dict preserves insertion
# order, so keys/values line up with the surviving first-occurrence hashtags; the
# frame takes its index from hashtags_list (the original row labels).
aggregated_data = {
    'Hashtag': hashtags_list,
    'Hashtags_split': [list(key) for key in hashtags_counts.keys()],
    'Count': list(hashtags_counts.values())
}

aggregated_df = pd.DataFrame(aggregated_data)

# Display the aggregated DataFrame
print(aggregated_df)


                             Hashtag                      Hashtags_split  \
0                               edit                              [edit]   
1                             batman                            [batman]   
2                        taylorswift                     [taylor, swift]   
3                           dccomics                        [dc, comics]   
4                                 dc                                [dc]   
...                              ...                                 ...   
86904                smallartisthelp               [small, artist, help]   
86905                     artsupport                      [art, support]   
86906  anthonymackieandsebastianstan  [anthony, mackie, sebastian, stan]   
86908                   wyldeflowers                    [wylde, flowers]   
86909                      slashfans                       [slash, fans]   

       Count  
0       7762  
1       7533  
2       9715  
3       5422  
4       5809

In [331]:
from sklearn.metrics import jaccard_score
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster

# Calculate Jaccard similarity between hashtags
def jaccard_similarity(set1, set2):
    """Return the Jaccard index |set1 & set2| / |set1 | set2| of two sets.

    Both arguments must support the set operators ``|`` and ``&``.
    When both sets are empty the union is empty and 0 is returned.
    """
    union = set1 | set2
    if not union:
        return 0
    return len(set1 & set2) / len(union)

# Pairwise Jaccard similarity between the word splits of the first SAMPLE_SIZE
# aggregated hashtags.
# .iloc makes the positional slice explicit: aggregated_df carries non-contiguous
# integer labels, and plain [:n] slicing on an integer index is deprecated in
# pandas in favour of .iloc / .loc.
SAMPLE_SIZE = 10000
sample = aggregated_df.iloc[:SAMPLE_SIZE]
hashtags_list = sample['Hashtags_split'].tolist()

# Build each word set once instead of once per comparison.
token_sets = [set(tokens) for tokens in hashtags_list]
similarities = [
    [jaccard_similarity(left, right) for right in token_sets]
    for left in token_sets
]

# Square matrix with the hashtag text as both row and column labels.
similarity_matrix = pd.DataFrame(similarities, columns=sample['Hashtag'], index=sample['Hashtag'])

  hashtags_list = aggregated_df['Hashtags_split'][:10000].tolist()


In [216]:
# Display the pairwise similarity matrix (rows and columns are labelled by hashtag).
similarity_matrix

Hashtag,joeandsophiedivorce,copywriting,roadrunner,droopy,geekfm,flashnegro,bucksfirehose,harperedit,wallwestedit,glenzoey,...,timotheechalametedit,kardashiansjenner,steryotipicalbarbie,nobelprizebarbie,dulapeepbarbie,rafandsara4ever,marvelscenes,lokiwednesday,lokiluafeyson,iphone15promax
Hashtag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
joeandsophiedivorce,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
copywriting,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
roadrunner,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
droopy,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
geekfm,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
rafandsara4ever,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
marvelscenes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
lokiwednesday,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.2,0.0
lokiluafeyson,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,1.0,0.0


In [None]:
# Failed attempt - not in use
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform
import pandas as pd

# Assuming you already have the similarity_matrix DataFrame

# Convert the similarity matrix to a condensed distance matrix.
# linkage() interprets a 2-D input as raw observation vectors, so the square
# matrix has to be condensed explicitly before it is passed in.
# checks=False skips the zero-diagonal validation: jaccard_similarity returns 0
# for two empty word sets, which leaves a distance of 1 on the diagonal for
# rows whose split is empty.
distance_matrix = 1 - similarity_matrix
condensed_distances = squareform(distance_matrix.to_numpy(), checks=False)

# Perform hierarchical clustering
linkage_matrix = linkage(condensed_distances, method='average')

# Set a threshold for forming clusters
threshold = 0.99 # Adjust as needed
clusters = fcluster(linkage_matrix, threshold, criterion='distance')

# Take exactly as many rows as there are cluster labels, as an independent copy,
# so the assignment below neither fails on a length mismatch nor writes into a slice.
sample_fifty = aggregated_df.iloc[:len(clusters)].copy()

# Add the cluster labels to the DataFrame
sample_fifty['Cluster'] = clusters

# Display the DataFrame with cluster labels
print(sample_fifty[['Hashtag', 'Cluster']])

In [217]:
import pandas as pd

# Assuming you already have the similarity_matrix DataFrame

# Set a similarity threshold
threshold = 0.35 # Adjust as needed
# 0.49 is good
# 0.1 is bad
# 0.5 is also bad
# 0.3 has cons and pros

# Work on the raw array: element-wise .iloc lookups inside a nested Python loop
# are far slower than one vectorised comparison per row.
similarity_values = similarity_matrix.to_numpy()
labels = similarity_matrix.index

# One cluster per hashtag: the hashtag itself followed by every LATER hashtag
# whose similarity to it exceeds the threshold (earlier ones are not revisited).
clusters = []
for i in range(len(labels)):
    later_matches = similarity_values[i, i + 1:] > threshold
    clusters.append([labels[i], *labels[i + 1:][later_matches]])

# Independent copy with exactly one row per cluster, so adding the column does
# not trigger SettingWithCopyWarning and the lengths always agree.
sample_fifty = aggregated_df.iloc[:len(clusters)].copy()

# Add the cluster member lists to the DataFrame
sample_fifty['Cluster'] = clusters

# Display the DataFrame with cluster labels
print(sample_fifty[['Hashtag', 'Cluster']])

# Create a DataFrame with cluster labels
# cluster_df = pd.DataFrame({'Hashtag': similarity_matrix.index, 'Cluster': clusters})

# Merge the cluster information with the sample_fifty DataFrame
# sample_fifty = pd.merge(sample_fifty, cluster_df, on='Hashtag', how='left')

# Display the DataFrame with cluster labels
# print(sample_fifty[['Hashtag', 'Cluster']])

                   Hashtag                                            Cluster
35803  joeandsophiedivorce  [joeandsophiedivorce, sophieandjoedivorce, sop...
35804          copywriting                                      [copywriting]
35805           roadrunner                                       [roadrunner]
35806               droopy                                           [droopy]
35807               geekfm                                           [geekfm]
...                    ...                                                ...
47517      rafandsara4ever                                  [rafandsara4ever]
47519         marvelscenes                                     [marvelscenes]
47521        lokiwednesday                                    [lokiwednesday]
47522        lokiluafeyson                                    [lokiluafeyson]
47524       iphone15promax                                   [iphone15promax]

[10000 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_fifty['Cluster'] = clusters


In [296]:
# Reset Count to zero on every row of sample_fifty.
sample_fifty = sample_fifty.assign(Count=0)

In [302]:
# Inspect the first ten rows of the clustered sample.
sample_fifty.head(10)

Unnamed: 0,Hashtag,Hashtags_split,Count,Cluster
35803,joeandsophiedivorce,"[joe, sophie, divorce]",7,"[joeandsophiedivorce, sophieandjoedivorce, sop..."
35804,copywriting,"[copy, writing]",2,[copywriting]
35805,roadrunner,[roadrunner],2,[roadrunner]
35806,droopy,[droopy],2,[droopy]
35807,geekfm,"[geek, fm]",2,[geekfm]
35809,flashnegro,"[flash, negro]",2,[flashnegro]
35810,bucksfirehose,"[bucks, firehose]",2,[bucksfirehose]
35811,harperedit,"[harper, edit]",4,"[harperedit, charlieandharperedit]"
35812,wallwestedit,"[wall, west, edit]",4,"[wallwestedit, iriswestedit]"
35813,glenzoey,"[glen, zoey]",2,[glenzoey]


In [293]:
# Look up the aggregated count stored for one hashtag (first matching row).
aggregated_df.loc[aggregated_df['Hashtag'].eq('joeandsophiedivorce'), 'Count'].tolist()[0]

3

In [None]:
# Total count of a cluster = sum of the aggregated counts of its member hashtags.
# The totals are computed from scratch, so re-running the cell yields the same
# result and does not rely on Count having been zeroed beforehand.
# A dict lookup replaces the per-member list conversion and frame scan.
count_by_hashtag = (
    aggregated_df.drop_duplicates(subset='Hashtag')  # the first matching row is used
    .set_index('Hashtag')['Count']
    .to_dict()
)

# Members that do not occur in aggregated_df contribute nothing to the total.
sample_fifty['Count'] = [
    sum(count_by_hashtag.get(tag, 0) for tag in cluster)
    for cluster in sample_fifty['Cluster']
]
        


In [304]:
# Keep the hashtag, its cluster member list and the cluster count, and write
# them to disk. `clusters` is rebound here from the earlier list to a DataFrame.
clusters = sample_fifty.loc[:, ['Hashtag', 'Cluster', 'Count']]
clusters.to_csv(path_or_buf='data/hashtags_clusters.csv', index=False)

In [324]:
# Reload the saved cluster table; after the CSV round trip the Cluster column
# holds text rather than Python lists.
clusters = pd.read_csv(filepath_or_buffer='data/hashtags_clusters.csv')

In [310]:
def _load_clusters_sorted_by_size(ascending):
    """Read the saved clusters and order them by number of comma-separated members.

    The helper column used for sorting is removed again before returning.
    """
    frame = pd.read_csv('data/hashtags_clusters.csv')
    member_counts = frame['Cluster'].str.split(',').str.len()
    return (
        frame.assign(Length=member_counts)
        .sort_values(by=['Length'], ascending=ascending)
        .drop(columns=['Length'])
    )


# Two independently loaded copies: smallest clusters first and largest first.
clusters_asc = _load_clusters_sorted_by_size(ascending=True)
clusters_desc = _load_clusters_sorted_by_size(ascending=False)

# clusters.head(10)


In [311]:
# Reduce the cluster text to comma-separated members by removing the square
# brackets and all spaces (quote characters around each member are kept).
# regex=False states explicitly that '[' and ']' are literal characters; leaving
# the regex mode implicit raises a pandas FutureWarning for these characters.
for cluster_frame in (clusters_desc, clusters_asc):
    cleaned = cluster_frame['Cluster']
    for char in ('[', ']', ' '):
        cleaned = cleaned.str.replace(char, '', regex=False)
    cluster_frame['Cluster'] = cleaned

  clusters_desc['Cluster'] = clusters_desc['Cluster'].str.replace('[', '')
  clusters_desc['Cluster'] = clusters_desc['Cluster'].str.replace(']', '')
  clusters_asc['Cluster'] = clusters_asc['Cluster'].str.replace('[', '')
  clusters_asc['Cluster'] = clusters_asc['Cluster'].str.replace(']', '')


In [148]:
# Spot check: are all members of the cluster labelled 3673 also in the cluster labelled 7?
set(clusters_desc['Cluster'][3673].split(',')) <= set(clusters_desc['Cluster'][7].split(','))

True

In [321]:
# Find every cluster whose members are all contained in another cluster.
# indexes_to_drop[k] is the label of a contained cluster and sub_indexes[k] the
# label of the first containing cluster in clusters_desc order (largest first).
indexes_to_drop = []
sub_indexes = []

# Parse the member sets of the candidate supersets once; rebuilding them for
# every row of clusters_asc repeats identical work on each iteration.
desc_member_sets = clusters_desc['Cluster'].map(lambda text: set(text.split(',')))

for index, row in clusters_asc.iterrows():
    members = set(row['Cluster'].split(','))
    mask = desc_member_sets.map(members.issubset)
    mask[index] = False  # Exclude the same index
    matching_clusters = clusters_desc[mask]

    if not matching_clusters.empty:
        print(row['Cluster'])
        print(index)
        sub_indexes.append(matching_clusters.index[0])
        indexes_to_drop.append(index)

'startrekintodarkness'
6426
'martinscorsese'
6441
'kirkandspock'
6417
'atgylb'
6410
'bumbercatchedit'
6405
'kenoughhoodie'
6443
'batmandaycordoba'
6482
'christiangaiera'
6481
'londonimmersiveexperience'
6476
'película'
6472
'jeepgladiator'
6466
'getducked'
6465
'scisaac'
6448
'sendthistosomeonespecial'
6462
'teentitansredxedit'
6463
'jeepwranglerunlimited'
6456
'vidaglow'
6401
'dizziness'
6349
'hotmomclub'
6333
'haylijahisendgame'
6314
'camcameronedit'
6327
'evento'
6486
'barbiebirthday'
6392
'augustdogtrend'
6390
'batman1989'
6389
'charliebradburryedit'
6386
'makeawish'
6364
'kalijahedit'
6358
'disneymeet'
6371
6606
'amoc'
6604
'ennvsia'
6581
'amazonastro'
6613
'kingsma'
6588
'katherinepiercelover'
6614
'goldnails'
6645
'mochiparis'
6619
'blessingrose'
6625
'bridgitmendlerfan'
6631
'tytennantedit'
6626
'orandge'
6524
'madisonrussell'
6523
'menofletters'
6521
'barbaragordonbatgirl'
6520
'taylxredit'
6519
'mentalhealthquotes'
6510
'formulaonefangirlies'
6494
'alliwantforxmas'
6495
'godz

In [325]:
# Add the count of each contained cluster to the cluster that contains it.
# sub_indexes and indexes_to_drop are parallel lists, so they are walked in
# pairs; always reading indexes_to_drop[0] would add the first dropped
# cluster's count to every containing cluster.
# Both lists hold index labels, hence .loc rather than positional .iloc.
# NOTE(review): if a cluster's Count is already the sum over all of its members,
# adding the contained cluster's Count counts those members twice — confirm.
for parent_index, dropped_index in zip(sub_indexes, indexes_to_drop):
    clusters.loc[parent_index, 'Count'] += clusters.loc[dropped_index, 'Count']

In [329]:
# Remove, in place, every cluster row that was found to be contained in another one.
clusters.drop(index=indexes_to_drop, inplace=True)

In [330]:
# Write the de-duplicated clusters to disk without the index column.
clusters.to_csv(path_or_buf='data/clean_hashtags/hashtags_clusters_clean_0_to_10k.csv', index=False)

In [None]:
# Collect the labels of clusters whose members all appear in some other cluster.
indexes_to_drop = []
for index, row in clusters_asc.iterrows():
    members = set(row['Cluster'].split(','))
    # any() stops at the first containing cluster; a row is never compared with itself.
    contained_elsewhere = any(
        index != index2 and members.issubset(set(row2['Cluster'].split(',')))
        for index2, row2 in clusters_desc.iterrows()
    )
    if contained_elsewhere:
        print(row['Cluster'])
        print(index)
        indexes_to_drop.append(index)

In [None]:
# For cluster labels 1 through 49, gather the hashtags whose Cluster value equals
# that label (labels with no rows yield an empty list).
cluster_list = [
    sample_fifty.loc[sample_fifty['Cluster'] == cluster_id, 'Hashtag'].tolist()
    for cluster_id in range(1, 50)
]

# Show one list of hashtags per cluster label.
for cluster in cluster_list:
    print(cluster)


[' dc', ' dcuniverse', ' dcau']
[' dcedit']
[' dccomics']
[' comics', ' marvelcomics']
[' marvel']
[' edit', ' batmanedit']
[' batman']
[' taylorswift', ' swift', ' swiftie']
[' taylornation']
[' barbie', ' barbiemovie']
[' tvd', ' tvdu']
[' tedlassoedit', 'tedlasso']
[' brucewayne']
[' brucewayneedit']
[' erastour']
[' spiderman']
[' trending']
[' taylorsversion']
[' mcu']
[' superman']
[' thevampirediaries']
[' starwars']
[' robin']
[' peterparker']
[' aftereffects']
['greenscreen']
[' batfamily']
[' capcut']
[' pedropascal']
[' klausmikaelson']
[' clarkkent']
[' comic']
[' theoriginals']
[' stefansalvatore']
[' nightwing']
[' funny']
[' xmen']
[' parati']
[' milesmorales']
[' edits']
[' trend']
[]
[]
[]
[]
[]
[]
[]
[]


In [None]:
"""
Hierarchical Clustering
from scipy.cluster.hierarchy import linkage, dendrogram
import matplotlib.pyplot as plt

# Perform hierarchical clustering
linkage_matrix = linkage(similarity_matrix, method='ward')

# Create a dendrogram
dendrogram(linkage_matrix, labels=df['Hashtag'].tolist(), orientation='right')
plt.title('Hierarchical Clustering Dendrogram')
plt.show()
"""

"""
Determine Clusters
from scipy.cluster.hierarchy import fcluster

# Determine clusters
threshold = 0.2  # Adjust the threshold as needed
clusters = fcluster(linkage_matrix, threshold, criterion='distance')

# Add the cluster labels to the DataFrame
df['Cluster'] = clusters

# Display the DataFrame with cluster labels
print(df[['Hashtag', 'Cluster']])
"""