In [14]:
import pandas as pd

history = pd.read_csv('data/unique_video_browsing_history_full_tiktok_data.csv')
history.sort_values(by=['Date'], inplace=True)

# Put the hashtags in lower case from history
history['Hashtags'] = history['Hashtags'].str.lower()

# Create an dataframe with which row being a single hashtag and the count of the hashtag with the columns names: Hashtag and Count
hashtags = history['Hashtags'].str.split(',', expand=True).stack().value_counts().rename_axis('Hashtag').reset_index(name='Count')

hashtags.head(10)

Unnamed: 0,Hashtag,Count
0,#fyp,21735
1,#foryou,9952
2,#viral,7706
3,#foryoupage,7611
4,#edit,7347
5,#fypシ,6398
6,#batman,5114
7,#taylorswift,5059
8,#dccomics,5011
9,#dc,4836


In [16]:
#remove # from hashtags
hashtags['Hashtag'] = hashtags['Hashtag'].str.replace('#', '')

In [18]:
# Remove fyp

pattern = r'(fy\w+|fory\w+|fy|xyzbca|viral)'

# Remove row with re patterns from hashtags['Hashtags']
hashtags = hashtags[~hashtags['Hashtag'].str.contains(pattern)]

hashtags.head(10)


  hashtags = hashtags[~hashtags['Hashtag'].str.contains(pattern)]


Unnamed: 0,Hashtag,Count
4,edit,7347
6,batman,5114
7,taylorswift,5059
8,dccomics,5011
9,dc,4836
11,taylorswift,4211
12,marvel,3959
13,brucewayne,3922
14,swifttok,3149
16,erastour,2777


In [21]:
pattern1 = r'(tiktok|tok)$'
pattern2 = r'^(tiktok|tok)'

hashtags['Hashtag'] = hashtags['Hashtag'].str.replace(pattern1, '')
hashtags['Hashtag'] = hashtags['Hashtag'].str.replace(pattern2, '')


  hashtags['Hashtag'] = hashtags['Hashtag'].str.replace(pattern1, '')
  hashtags['Hashtag'] = hashtags['Hashtag'].str.replace(pattern2, '')


In [23]:
hashtags.head(10)

Unnamed: 0,Hashtag,Count
4,edit,7347
6,batman,5114
7,taylorswift,5059
8,dccomics,5011
9,dc,4836
11,taylorswift,4211
12,marvel,3959
13,brucewayne,3922
14,swift,3149
16,erastour,2777


In [24]:
import wordninja as wn

# Create a new column with the hashtags splitted by words
hashtags['Hashtags_split'] = hashtags['Hashtag'].apply(lambda x: wn.split(x))
hashtags.reset_index(drop=True, inplace=True)

hashtags.head(10)

Unnamed: 0,Hashtag,Count,Hashtags_split
0,edit,7347,[edit]
1,batman,5114,[batman]
2,taylorswift,5059,"[taylor, swift]"
3,dccomics,5011,"[dc, comics]"
4,dc,4836,[dc]
5,taylorswift,4211,"[taylor, swift]"
6,marvel,3959,[marvel]
7,brucewayne,3922,"[bruce, wayne]"
8,swift,3149,[swift]
9,erastour,2777,"[eras, tour]"


In [25]:

# Remove stop words
from nltk.corpus import stopwords

stop = stopwords.words('english')
hashtags['Hashtags_split'] = hashtags['Hashtags_split'].apply(lambda x: [item for item in x if item not in stop])

hashtags.head(10)


Unnamed: 0,Hashtag,Count,Hashtags_split
0,edit,7347,[edit]
1,batman,5114,[batman]
2,taylorswift,5059,"[taylor, swift]"
3,dccomics,5011,"[dc, comics]"
4,dc,4836,[dc]
5,taylorswift,4211,"[taylor, swift]"
6,marvel,3959,[marvel]
7,brucewayne,3922,"[bruce, wayne]"
8,swift,3149,[swift]
9,erastour,2777,"[eras, tour]"


In [31]:
from sklearn.metrics import jaccard_score
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster

# Calculate Jaccard similarity between hashtags
def jaccard_similarity(set1, set2):
    if len(set1 | set2) == 0:
        return 0
    return len(set1 & set2) / len(set1 | set2)

similarities = []
hashtags_list = hashtags['Hashtags_split'][:50].tolist()
for i in range(len(hashtags_list)):
    row = []
    for j in range(len(hashtags_list)):
        similarity = jaccard_similarity(set(hashtags_list[i]), set(hashtags_list[j]))
        row.append(similarity)
    similarities.append(row)


similarity_matrix = pd.DataFrame(similarities, columns=hashtags['Hashtag'][:50], index=hashtags['Hashtag'][:50])


In [32]:
similarity_matrix

Hashtag,edit,batman,taylorswift,dccomics,dc,taylorswift,marvel,brucewayne,swift,erastour,...,theoriginals,stefansalvatore,nightwing,funny,xmen,parati,milesmorales,tedlassoedit,barbiemovie,barbie
Hashtag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
edit,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
batman,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
taylorswift,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
dccomics,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
dc,0.0,0.0,0.0,0.5,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
taylorswift,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
marvel,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
brucewayne,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
swift,0.0,0.0,0.5,0.0,0.0,0.5,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
erastour,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
"""
Hierarchical Clustering
from scipy.cluster.hierarchy import linkage, dendrogram
import matplotlib.pyplot as plt

# Perform hierarchical clustering
linkage_matrix = linkage(similarity_matrix, method='ward')

# Create a dendrogram
dendrogram(linkage_matrix, labels=df['Hashtag'].tolist(), orientation='right')
plt.title('Hierarchical Clustering Dendrogram')
plt.show()
"""

"""
Determine Clusters
from scipy.cluster.hierarchy import fcluster

# Determine clusters
threshold = 0.2  # Adjust the threshold as needed
clusters = fcluster(linkage_matrix, threshold, criterion='distance')

# Add the cluster labels to the DataFrame
df['Cluster'] = clusters

# Display the DataFrame with cluster labels
print(df[['Hashtag', 'Cluster']])
"""