In [90]:
import pandas as pd

history = pd.read_csv('data/unique_video_browsing_history_full_tiktok_data.csv')
history.sort_values(by=['Date'], inplace=True)

# Put the hashtags in lower case from history
history['Hashtags'] = history['Hashtags'].str.lower()

# Create an dataframe with which row being a single hashtag and the count of the hashtag with the columns names: Hashtag and Count
hashtags = history['Hashtags'].str.split(',', expand=True).stack().value_counts().rename_axis('Hashtag').reset_index(name='Count')

hashtags.head(10)

Unnamed: 0,Hashtag,Count
0,#fyp,21735
1,#foryou,9952
2,#viral,7706
3,#foryoupage,7611
4,#edit,7347
5,#fypシ,6398
6,#batman,5114
7,#taylorswift,5059
8,#dccomics,5011
9,#dc,4836


In [91]:
#remove # from hashtags
hashtags['Hashtag'] = hashtags['Hashtag'].str.replace('#', '')

In [92]:
# Remove fyp

pattern = r'(fy\w+|fory\w+|fy|xyzbca|viral|trend\w+)'

# Remove row with re patterns from hashtags['Hashtags']
hashtags = hashtags[~hashtags['Hashtag'].str.contains(pattern)]

hashtags.head(10)


  hashtags = hashtags[~hashtags['Hashtag'].str.contains(pattern)]


Unnamed: 0,Hashtag,Count
4,edit,7347
6,batman,5114
7,taylorswift,5059
8,dccomics,5011
9,dc,4836
11,taylorswift,4211
12,marvel,3959
13,brucewayne,3922
14,swifttok,3149
16,erastour,2777


In [93]:
pattern1 = r'(tiktok|tok|toker)$'
pattern2 = r'^(tiktok|tok|toker)'

hashtags['Hashtag'] = hashtags['Hashtag'].str.replace(pattern1, '')
hashtags['Hashtag'] = hashtags['Hashtag'].str.replace(pattern2, '')


  hashtags['Hashtag'] = hashtags['Hashtag'].str.replace(pattern1, '')
  hashtags['Hashtag'] = hashtags['Hashtag'].str.replace(pattern2, '')


In [94]:
hashtags['Hashtag'] = hashtags['Hashtag'].str.replace(' ', '')

In [95]:
hashtags.head(10)

Unnamed: 0,Hashtag,Count
4,edit,7347
6,batman,5114
7,taylorswift,5059
8,dccomics,5011
9,dc,4836
11,taylorswift,4211
12,marvel,3959
13,brucewayne,3922
14,swift,3149
16,erastour,2777


In [96]:
import wordninja as wn

# Create a new column with the hashtags splitted by words
hashtags['Hashtags_split'] = hashtags['Hashtag'].apply(lambda x: wn.split(x))
hashtags.reset_index(drop=True, inplace=True)

hashtags.head(10)

Unnamed: 0,Hashtag,Count,Hashtags_split
0,edit,7347,[edit]
1,batman,5114,[batman]
2,taylorswift,5059,"[taylor, swift]"
3,dccomics,5011,"[dc, comics]"
4,dc,4836,[dc]
5,taylorswift,4211,"[taylor, swift]"
6,marvel,3959,[marvel]
7,brucewayne,3922,"[bruce, wayne]"
8,swift,3149,[swift]
9,erastour,2777,"[eras, tour]"


In [97]:

# Remove stop words
from nltk.corpus import stopwords

stop = stopwords.words('english')
hashtags['Hashtags_split'] = hashtags['Hashtags_split'].apply(lambda x: [item for item in x if item not in stop])

hashtags.head(10)


Unnamed: 0,Hashtag,Count,Hashtags_split
0,edit,7347,[edit]
1,batman,5114,[batman]
2,taylorswift,5059,"[taylor, swift]"
3,dccomics,5011,"[dc, comics]"
4,dc,4836,[dc]
5,taylorswift,4211,"[taylor, swift]"
6,marvel,3959,[marvel]
7,brucewayne,3922,"[bruce, wayne]"
8,swift,3149,[swift]
9,erastour,2777,"[eras, tour]"


In [98]:
from collections import defaultdict

# Create a dictionary to aggregate counts
hashtags_counts = defaultdict(int)
hashtags_list = hashtags["Hashtag"]

for i in range(len(hashtags)):
    key = tuple(hashtags['Hashtags_split'][i])

    # Check if the Hashtags_split lists are exactly the same
    if key in hashtags_counts:
        hashtags_counts[key] += hashtags['Count'][i]
        hashtags_list = hashtags_list.drop(i)
    else:
        hashtags_counts[key] = hashtags['Count'][i]


# Create a new DataFrame from the aggregated counts
aggregated_data = {
    'Hashtag': hashtags_list,
    'Hashtags_split': [list(key) for key in hashtags_counts.keys()],
    'Count': list(hashtags_counts.values())
}

aggregated_df = pd.DataFrame(aggregated_data)

# Display the aggregated DataFrame
print(aggregated_df)


                             Hashtag                      Hashtags_split  \
0                               edit                              [edit]   
1                             batman                            [batman]   
2                        taylorswift                     [taylor, swift]   
3                           dccomics                        [dc, comics]   
4                                 dc                                [dc]   
...                              ...                                 ...   
86904                smallartisthelp               [small, artist, help]   
86905                     artsupport                      [art, support]   
86906  anthonymackieandsebastianstan  [anthony, mackie, sebastian, stan]   
86908                   wyldeflowers                    [wylde, flowers]   
86909                      slashfans                       [slash, fans]   

       Count  
0       7762  
1       7533  
2       9715  
3       5422  
4       5809

In [415]:
from sklearn.metrics import jaccard_score
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster

# Calculate Jaccard similarity between hashtags
def jaccard_similarity(set1, set2):
    if len(set1 | set2) == 0:
        return 0
    return len(set1 & set2) / len(set1 | set2)

similarities = []
hashtags_list = aggregated_df['Hashtags_split'][70000:].tolist()
for i in range(len(hashtags_list)):
    row = []
    for j in range(len(hashtags_list)):
        similarity = jaccard_similarity(set(hashtags_list[i]), set(hashtags_list[j]))
        row.append(similarity)
    similarities.append(row)


similarity_matrix = pd.DataFrame(similarities, columns=aggregated_df['Hashtag'][70000:], index=aggregated_df['Hashtag'][70000:])

  hashtags_list = aggregated_df['Hashtags_split'][70000:].tolist()
  similarity_matrix = pd.DataFrame(similarities, columns=aggregated_df['Hashtag'][70000:], index=aggregated_df['Hashtag'][70000:])


In [416]:
similarity_matrix

Hashtag,phelpstwin,oliverphelps,jamesphelps,tiktoknl,expatsinthenetherlands,tedlassoinamsterdam,neversettle,tedlassobelieve,aqagcseenglish,tedlassobbqsauce,...,cosygame,cosygames,crochetproblems,syeshcraft,procreateapp,smallartisthelp,artsupport,anthonymackieandsebastianstan,wyldeflowers,slashfans
Hashtag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
phelpstwin,1.000000,0.333333,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
oliverphelps,0.333333,1.000000,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
jamesphelps,0.333333,0.333333,1.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tiktoknl,0.000000,0.000000,0.000000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
expatsinthenetherlands,0.000000,0.000000,0.000000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
smallartisthelp,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
artsupport,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
anthonymackieandsebastianstan,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
wyldeflowers,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:
# Failed attempt - not in use
from scipy.cluster.hierarchy import linkage, fcluster
import pandas as pd

# Assuming you already have the similarity_matrix DataFrame

# Convert the similarity matrix to a condensed distance matrix
distance_matrix = 1 - similarity_matrix

# Perform hierarchical clustering
linkage_matrix = linkage(distance_matrix, method='average')

# Set a threshold for forming clusters
threshold = 0.99 # Adjust as needed
clusters = fcluster(linkage_matrix, threshold, criterion='distance')

sample_fifty = aggregated_df[:1000]

# Add the cluster labels to the DataFrame
sample_fifty['Cluster'] = clusters

# Display the DataFrame with cluster labels
print(sample_fifty[['Hashtag', 'Cluster']])

In [417]:
import pandas as pd

# Assuming you already have the similarity_matrix DataFrame

# Set a similarity threshold
threshold = 0.35 # Adjust as needed
# 0.49 is good
# 0.1 is bad
# 0.5 is also bad
# 0.3 has cons and pros

# Initialize an empty list to store clusters
clusters = []

# Iterate through the similarity matrix and create clusters
for i in range(len(similarity_matrix)):
    cluster = [similarity_matrix.index[i]]  # Start a new cluster with the current hashtag
    for j in range(i + 1, len(similarity_matrix)):
        if similarity_matrix.iloc[i, j] > threshold:
            cluster.append(similarity_matrix.index[j])  # Add the hashtag to the cluster
    clusters.append(cluster)

sample_fifty = aggregated_df[70000:]

# Add the cluster labels to the DataFrame
sample_fifty['Cluster'] = clusters
# 
# Display the DataFrame with cluster labels
print(sample_fifty[['Hashtag', 'Cluster']])

# Create a DataFrame with cluster labels
# cluster_df = pd.DataFrame({'Hashtag': similarity_matrix.index, 'Cluster': clusters})

# Merge the cluster information with the sample_fifty DataFrame
# sample_fifty = pd.merge(sample_fifty, cluster_df, on='Hashtag', how='left')

# Display the DataFrame with cluster labels
# print(sample_fifty[['Hashtag', 'Cluster']])

                             Hashtag  \
83378                     phelpstwin   
83379                   oliverphelps   
83380                    jamesphelps   
83382                       tiktoknl   
83383         expatsinthenetherlands   
...                              ...   
86904                smallartisthelp   
86905                     artsupport   
86906  anthonymackieandsebastianstan   
86908                   wyldeflowers   
86909                      slashfans   

                                                 Cluster  
83378                                       [phelpstwin]  
83379                                     [oliverphelps]  
83380                                      [jamesphelps]  
83382  [tiktoknl, tiktokswitzerland, tiktokauthor, be...  
83383                           [expatsinthenetherlands]  
...                                                  ...  
86904                                  [smallartisthelp]  
86905                                       [ar

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_fifty['Cluster'] = clusters


In [418]:
# put the count of every row in sample_fifty to zero
sample_fifty['Count'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_fifty['Count'] = 0


In [419]:
sample_fifty.head(10)

Unnamed: 0,Hashtag,Hashtags_split,Count,Cluster
83378,phelpstwin,"[phelps, twin]",0,[phelpstwin]
83379,oliverphelps,"[oliver, phelps]",0,[oliverphelps]
83380,jamesphelps,"[james, phelps]",0,[jamesphelps]
83382,tiktoknl,"[tik, tok, nl]",0,"[tiktoknl, tiktokswitzerland, tiktokauthor, be..."
83383,expatsinthenetherlands,"[ex, pats, netherlands]",0,[expatsinthenetherlands]
83384,tedlassoinamsterdam,"[ted, lasso, amsterdam]",0,"[tedlassoinamsterdam, tedlassobelieve, tedlass..."
83387,neversettle,"[never, settle]",0,[neversettle]
83390,tedlassobelieve,"[ted, lasso, believe]",0,"[tedlassobelieve, tedlassobbqsauce, tedlassoba..."
83391,aqagcseenglish,"[aqa, gcse, english]",0,[aqagcseenglish]
83392,tedlassobbqsauce,"[ted, lasso, bbq, sauce]",0,"[tedlassobbqsauce, tedlassobarbecuesauce, mono..."


In [420]:
for i in range(0,len(sample_fifty['Cluster'].tolist())):
    for j in sample_fifty['Cluster'].tolist()[i]:
        print(j)
        if j in aggregated_df['Hashtag'].tolist():
            count_to_add = aggregated_df[aggregated_df['Hashtag'] == j]['Count'].tolist()[0]
            print(count_to_add)
            # Update the count in sample_fifty DataFrame using iloc
            sample_fifty.iloc[i, sample_fifty.columns.get_loc('Count')] += count_to_add
            print(sample_fifty.iloc[i, sample_fifty.columns.get_loc('Count')])
        


phelpstwin
1
1
oliverphelps
1
1
jamesphelps
1
1
tiktoknl
1
1
tiktokswitzerland
1
2
tiktokauthor
1
3
bestoftiktok2022
1
4
tiktokautos
1
5
tiktokbarber
1
6
tiktokbarbers
1
7
tiktokquestion
1
8
tiktokcrochet
1
9
expatsinthenetherlands
1
1
tedlassoinamsterdam
1
1
tedlassobelieve
1
2
tedlassobbqsauce
1
3
tedlassobarbecuesauce
1
4
monopolytedlasso
1
5
neversettle
1
1
tedlassobelieve
1
1
tedlassobbqsauce
1
2
tedlassobarbecuesauce
1
3
monopolytedlasso
1
4
aqagcseenglish
1
1
tedlassobbqsauce
1
1
tedlassobarbecuesauce
1
2
monopolytedlasso
1
3
tedlassobarbecuesauce
1
1
monopolytedlasso
1
2
winningisnteverything
1
1
leadershipskills
1
1
spacetofeel
1
1
embracingchange
1
1
packluckfifa
1
1
latinamericaswifties
1
1
americanswifties
1
2
angelfromthebloc
1
1
spatsv
1
1
moecolin
1
1
colinmoe
1
2
mosssmh
1
1
gambino
1
1
spice
1
1
spicebag
1
2
dahliamikaelsonedit
1
1
erastourlover
1
1
tserastourpittsburgh
1
2
erastouradvice
1
3
theerastournight1
1
4
theerastourmadrid
1
5
erastourhaul
1
6
wembleyerastour


In [421]:
clusters = sample_fifty[['Hashtag', 'Cluster', 'Count']]
clusters.to_csv('data/hashtags_clusters.csv', index=False)

In [422]:
clusters = pd.read_csv('data/hashtags_clusters.csv')

In [423]:
# order the clusters dataframe by the number of hashtags in each cluster
clusters_asc = pd.read_csv('data/hashtags_clusters.csv')
clusters_desc = pd.read_csv('data/hashtags_clusters.csv')

clusters_asc['Length'] = clusters_asc['Cluster'].str.split(',').str.len()
clusters_desc['Length'] = clusters_desc['Cluster'].str.split(',').str.len()

clusters_asc.sort_values(by=['Length'], inplace=True, ascending=True)
clusters_desc.sort_values(by=['Length'], inplace=True, ascending=False)
clusters_asc.drop(columns=['Length'], inplace=True)
clusters_desc.drop(columns=['Length'], inplace=True)

# clusters.head(10)


In [424]:
# get the row outside of the list
clusters_desc['Cluster'] = clusters_desc['Cluster'].str.replace('[', '')
clusters_desc['Cluster'] = clusters_desc['Cluster'].str.replace(']', '')
clusters_desc['Cluster'] = clusters_desc['Cluster'].str.replace(" ", '')

clusters_asc['Cluster'] = clusters_asc['Cluster'].str.replace('[', '')
clusters_asc['Cluster'] = clusters_asc['Cluster'].str.replace(']', '')
clusters_asc['Cluster'] = clusters_asc['Cluster'].str.replace(" ", '')

  clusters_desc['Cluster'] = clusters_desc['Cluster'].str.replace('[', '')
  clusters_desc['Cluster'] = clusters_desc['Cluster'].str.replace(']', '')
  clusters_asc['Cluster'] = clusters_asc['Cluster'].str.replace('[', '')
  clusters_asc['Cluster'] = clusters_asc['Cluster'].str.replace(']', '')


In [425]:
indexes_to_drop = []
sub_indexes = []

for index, row in clusters_asc.iterrows():
    mask = clusters_desc['Cluster'].apply(lambda x: set(row['Cluster'].split(',')).issubset(set(x.split(','))))
    mask[index] = False  # Exclude the same index
    matching_clusters = clusters_desc[mask]

    if not matching_clusters.empty:
        print(row['Cluster'])
        print(index)
        sub_indexes.append(matching_clusters['Cluster'].index.tolist()[0])
        indexes_to_drop.append(index)

'ellekennedy'
1882
'alexalvarezedit'
1863
'jonandsansa'
1849
'davidkushnerdaylight'
1855
'damonfunny'
1887
'pomsky'
1915
'klausmikelson'
1917
'hiddlesarmy'
1920
'milesmoralespidermanacrossthespiderverse'
1924
'househightower'
1928
'andrewtateistoxic'
1933
'tomhollandishot'
1935
'bukayosakaedit'
1913
'milesmoralesp'
1893
'ka'
1901
'formula1brasil'
1758
'zquad'
1759
'paulfunny'
1770
'funnystefan'
1771
'dollhairwashing'
1781
'disneyily'
1783
'businessineurope'
1738
'livxhalle'
1743
'goodluckcharliefans'
1816
'sephoraslide'
1822
'redhoodcosplayer'
1828
'odaatnetflix'
1836
'onedayatatimeedit'
1838
'stlouiszoo'
1788
'minaturedolls'
1811
'funkrave'
1796
'barbiefiltercat'
1801
'thriftshoppingfinds'
1806
'lolsurprisedolls'
1809
'lolsupriseminaturecollection'
1810
'yaeyhoodie'
2071
'instagramhusband'
2055
'macbookunboxing'
2054
'warnerbrosstudio'
2052
'clawmachinewin'
2078
'dadfluencer'
2111
'productmusthaves'
2035
'newsreporterbloopers'
2082
'hairremovalhack'
2085
'lashtechlife'
2089
'tvdetails

In [426]:
for index in sub_indexes:
    clusters.iloc[index, clusters.columns.get_loc('Count')] += clusters.iloc[indexes_to_drop[0], clusters.columns.get_loc('Count')]

In [427]:
for index in indexes_to_drop:
    clusters.drop(index, inplace=True)

In [428]:
clusters.to_csv('data/clean_hashtags/hashtags_clusters_clean_70k_to_end.csv', index=False)

In [None]:
#join all the clusters csv files
import glob

path = r'data/clean_hashtags' # use your path
all_files = glob.glob(path + "/*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None)
    li.append(df)

hashtags_clusters = pd.concat(li, axis=0, ignore_index=True)

hashtags_clusters.to_csv('data/all_hashtags_clusters.csv', index=False)



In [None]:
"""
Hierarchical Clustering
from scipy.cluster.hierarchy import linkage, dendrogram
import matplotlib.pyplot as plt

# Perform hierarchical clustering
linkage_matrix = linkage(similarity_matrix, method='ward')

# Create a dendrogram
dendrogram(linkage_matrix, labels=df['Hashtag'].tolist(), orientation='right')
plt.title('Hierarchical Clustering Dendrogram')
plt.show()
"""

"""
Determine Clusters
from scipy.cluster.hierarchy import fcluster

# Determine clusters
threshold = 0.2  # Adjust the threshold as needed
clusters = fcluster(linkage_matrix, threshold, criterion='distance')

# Add the cluster labels to the DataFrame
df['Cluster'] = clusters

# Display the DataFrame with cluster labels
print(df[['Hashtag', 'Cluster']])
"""