In [None]:
import itertools
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:
# Load the cleaned, transcribed dataset and preview its first rows.
cleaned_transcripted_path = '../data/processed/cleaned_transcripted_dataset.csv'
df_cleaned_transcripted = pd.read_csv(cleaned_transcripted_path)
df_cleaned_transcripted.head()

# S2T statistics

In [None]:
# Lower-case every transcription and split it on whitespace once, then derive
# both per-video counts from the same token lists.
tokenized_transcriptions = [
    str.lower(transcription).split()
    for transcription in df_cleaned_transcripted['transcription']
]

# Number of distinct tokens per video (the name is kept because later cells read it).
videos_lenght = [len(set(tokens)) for tokens in tokenized_transcriptions]
# Total number of tokens per video.
videos_tokens = [len(tokens) for tokens in tokenized_transcriptions]

In [None]:
# Mean and median of the total token count per video.
mean_tokens = np.mean(videos_tokens)
median_tokens = np.median(videos_tokens)

# Same statistics for the number of distinct tokens per video.
mean_unique_tokens = np.mean(videos_lenght)
median_unique_tokens = np.median(videos_lenght)

print('Average Tokens per video:', mean_tokens)
print('Median Tokens per video:', median_tokens)
print()
print('Average Unique Tokens per video:', mean_unique_tokens)
print('Median Unique Tokens per video:', median_unique_tokens)

In [None]:
# Report the earliest and latest values of the `create_time` column.
creation_times = df_cleaned_transcripted['create_time']
earliest_creation = creation_times.min()
latest_creation = creation_times.max()

print('First video was created at:', earliest_creation)
print('Last video was created at:', latest_creation)

# Commonly used hashtags
Which hashtags are used together with others?

In [None]:
# Build a one-hot (video x hashtag) indicator table from the comma-separated
# `video_hashtags` column; videos without hashtags become all-zero rows.
hashtag_strings = df_cleaned_transcripted['video_hashtags'].fillna('')

# One {hashtag: 1} dict per video. Each tag is stripped *before* the emptiness
# check, so whitespace-only entries (e.g. "a, ,b") no longer produce a column
# named ''.
hashtag_records = [
    {tag.strip(): 1 for tag in hashtags.split(',') if tag.strip()}
    for hashtags in hashtag_strings
]

# Constructing the frame in one call replaces the cell-by-cell `.loc` inserts
# (slow, and dependent on a default RangeIndex). Columns appear in order of
# first occurrence, and the original row labels are preserved.
df_hashtags = (
    pd.DataFrame(hashtag_records, index=hashtag_strings.index)
    .fillna(0)
    .astype(int)
)
df_hashtags.head()

## Co-occurrence matrix

In [None]:
# Hashtag-by-hashtag product of the indicator table: entry (i, j) sums, over
# videos, the product of the indicators for hashtags i and j.
cooccurrence_counts = df_hashtags.T @ df_hashtags

# Per-hashtag total (column sums), used to rank hashtags from most to least used.
cooccurrence_counts['Total'] = cooccurrence_counts.sum(axis=0)

occurence_matrix = cooccurrence_counts.sort_values(by='Total', ascending=False)
occurence_matrix.head(20)

## Selecting K most used hashtags

In [None]:
# Removing TikTok's control hashtags from both the rows and the columns.
control_hashtags = ['fy', 'fypシ', 'fyp', 'foryou', 'viral', 'foryoupage']

# Rebinding with errors='ignore' (instead of two in-place drops) keeps this cell
# re-runnable and tolerant of datasets in which some of these tags never occur;
# the in-place version raised KeyError in both situations.
occurence_matrix = occurence_matrix.drop(
    index=control_hashtags,
    columns=control_hashtags,
    errors='ignore',
)

In [None]:
K = 30  # number of leading rows of `occurence_matrix` (hashtags) to keep
most_used_k_tags = occurence_matrix.iloc[:K]

most_used_k_tags_index = most_used_k_tags.index.tolist() + ['Total']

# Adding a 'Total' row (column-wise sums over the K selected rows) at the end.
# DataFrame.append was removed in pandas 2.0, so the row is attached with
# pd.concat; ignore_index=True mirrors the old call and the labels are restored
# on the next line.
total_row = most_used_k_tags.sum(numeric_only=True).to_frame().T
most_used_k_tags = pd.concat([most_used_k_tags, total_row], ignore_index=True)

most_used_k_tags.index = most_used_k_tags_index

# Transpose so hashtags become rows, then keep only the K selected hashtags
# (plus 'Total'). `.loc[list]` returns rows in list order, so the resulting
# index follows `most_used_k_tags_index`.
most_common_k_tags = (
    most_used_k_tags.T
    .sort_values(by='Total', axis=0, ascending=False)
    .loc[most_used_k_tags_index]
    .index
)

# Restrict to the K x K block of selected hashtags, dropping 'Total' on both axes.
most_common_tags = most_used_k_tags[most_common_k_tags].drop('Total', axis=1)
most_common_tags = most_common_tags.loc[most_common_k_tags].drop('Total')

# Order rows and columns by the diagonal value, largest first.
index_sort = np.argsort(np.diag(most_common_tags))[::-1]
most_common_tags = most_common_tags.iloc[index_sort, index_sort]

## Generating Heatmap

In [None]:
# Removing diagonal values: keep the labels, convert the table to a float array
# (NaN needs a float dtype) and blank out the diagonal.
most_common_tags_labels = most_common_tags.columns
most_common_tags = most_common_tags.values.astype(float)
# `np.nan` replaces the `np.NaN` alias, which was removed in NumPy 2.0.
np.fill_diagonal(most_common_tags, np.nan)

In [None]:
# Boolean mask that is True on and above the main diagonal, so only the lower
# triangle of the matrix is left unmasked.
mask = np.triu(np.ones_like(most_common_tags, dtype='bool'))

In [None]:
# Annotated heatmap of the co-occurrence counts; cells selected by `mask` are hidden.
fig, ax = plt.subplots(figsize=(17, 7))

heatmap_options = dict(
    annot=True,
    cmap='Reds',
    fmt='.0f',
    vmax=600,  # upper limit of the colour scale
    xticklabels=most_common_tags_labels,
    yticklabels=most_common_tags_labels,
    mask=mask,
)
sns.heatmap(most_common_tags, ax=ax, **heatmap_options)
plt.show()

# Pairwise analysis
How do pairs of hashtags correlate with the topic assigned by BTM?

In [None]:
# Load the BTM/Whisper predictions table and preview its first rows.
predictions_path = '../data/processed/predictions_btm_whisper.csv'
predictions_df = pd.read_csv(predictions_path)
predictions_df.head()

In [None]:
pairwise_lst = []
pairwise_toxicity_df = pd.DataFrame()
parsed_hashtags = []

# NOTE(review): `most_frequent_pairs` is not assigned in any cell above this
# one; presumably an iterable of 'tagA/tagB' strings — confirm where it is
# defined so the notebook survives Restart & Run All.
for pair in most_frequent_pairs:
    # Sort the two tags so 'b/a' and 'a/b' map to the same key.
    pair = sorted(pair.split('/'))
    pair_hashtag = '/'.join(pair)
    a, b = pair

    if pair_hashtag in parsed_hashtags:
        continue

    # Select videos whose hashtag string contains both tags as whole words.
    #  * re.escape: the tag is matched literally even if it contains regex
    #    metacharacters.
    #  * (?:...) non-capturing groups: same match as before, without pandas'
    #    "pattern has match groups" warning.
    #  * na=False: a missing hashtag string counts as "no match" instead of
    #    putting NaN into the boolean mask.
    hashtag_column = predictions_df['video_hashtags']
    contains_a = hashtag_column.str.contains(
        r'(?:\b|^){}(?:\b|$)'.format(re.escape(a)), na=False)
    contains_b = hashtag_column.str.contains(
        r'(?:\b|^){}(?:\b|$)'.format(re.escape(b)), na=False)
    pair_data = predictions_df.loc[contains_a & contains_b]

    n_pair_videos = len(pair_data)
    # NOTE(review): `perc_total` divides by the size of `df_cleaned_transcripted`
    # while the numerator comes from `predictions_df` — confirm both tables
    # describe the same set of videos.
    pair_dict = {
        'pair_hashtag': pair_hashtag,
        'total_videos': n_pair_videos,
        'perc_total': n_pair_videos * 100 / len(df_cleaned_transcripted),
        # No matching videos -> undefined share, reported as NaN rather than
        # dividing by zero.
        'perc_toxic_videos': (
            pair_data['is_toxic'].sum() * 100 / n_pair_videos
            if n_pair_videos else np.nan
        ),
    }

    # Up to three most frequent topics among the matching videos; str() keeps
    # the join working if topic labels are not strings.
    top_topics = pair_data['topic_btm'].value_counts().index[:3].values
    pair_dict['top_topics'] = ', '.join(map(str, top_topics))

    pairwise_lst.append(pair_dict)
    parsed_hashtags.append(pair_hashtag)

In [None]:
# One row per hashtag pair collected in `pairwise_lst`, with exact duplicate rows removed.
pairwise_records_df = pd.DataFrame(pairwise_lst)
pairwise_w_toxicity_df = pairwise_records_df.drop_duplicates()
pairwise_w_toxicity_df.head()

In [None]:
# Percentage of most toxic hashtags
# (plotly.express is already imported as `px` in the notebook's import cell, so
# the duplicate import that used to sit here was removed.)

# Keep the ten pairs with the highest share of toxic videos, then re-sort them
# in ascending order for plotting.
top_10_toxic_hashtags = (
    pairwise_w_toxicity_df
    .sort_values('perc_toxic_videos', ascending=False)
    .head(10)
    .sort_values('perc_toxic_videos')
)
fig = px.bar(top_10_toxic_hashtags, y='pair_hashtag', x='perc_toxic_videos', color_discrete_sequence=['#c0392b'])
fig.update_layout(
    #title='<b>Top 10 toxic hashtag pairs</b><br>Percentage of toxic videos in most frequent hashtag pairs',
    xaxis=dict(title='<b>% of toxic videos</b>'),
    yaxis=dict(title='<b>Hashtag pair</b>')
)
fig.show()

In [None]:
# Percentage of toxic videos per topic: sum of `is_toxic` divided by the count
# of `id` within each `topic_btm` group, sorted ascending.
videos_by_topic = predictions_df.groupby('topic_btm')
toxic_per_topic = videos_by_topic['is_toxic'].sum()
videos_per_topic = videos_by_topic['id'].count()
toxic_pct_by_topic = (toxic_per_topic * 100 / videos_per_topic).sort_values(ascending=True).reset_index()

# The percentage Series has no name, so reset_index() labels its column 0.
fig = px.bar(
    toxic_pct_by_topic,
    y='topic_btm',
    x=0,
    color_discrete_sequence=['#c0392b'],
)
fig.update_layout(
    #title='<b>Percentage of toxic videos per topic<b>',
    xaxis={'title': '<b>% of toxic videos</b>'},
    yaxis={'title': '<b>Topic</b>'},
)
fig.show()

In [None]:
# Collapse the double-"t" spellings of two hashtags into their single-"t" form.
hashtag_spelling_fixes = [
    ('lefttiktok', 'leftiktok'),
    ('righttiktok', 'rightiktok'),
]

normalized_hashtags = predictions_df['video_hashtags']
for misspelled, canonical in hashtag_spelling_fixes:
    normalized_hashtags = normalized_hashtags.str.replace(misspelled, canonical)

predictions_df.loc[:, 'video_hashtags'] = normalized_hashtags

In [None]:
def is_valid_hashtag(hashtag):
    """Return True if `hashtag` should be kept for the pairwise analysis.

    Surrounding whitespace is ignored. A hashtag is rejected when, after
    stripping, it is empty or is one of the generic tags in the ignore list.

    Args:
        hashtag: Raw hashtag string, possibly padded with whitespace.

    Returns:
        bool: False for empty/ignored hashtags, True otherwise.
    """
    # NOTE(review): the co-occurrence cell earlier in the notebook also drops
    # 'fypシ', which is not ignored here — confirm whether that is intended.
    ignore_hashtags = ('fyp', 'fy', 'foryou', 'foryoupage', 'viral', '')

    # The '' entry covers empty and whitespace-only input. The previous extra
    # length check read the global `hashtag1` instead of this function's own
    # argument, so it raised NameError whenever that global was not defined.
    return hashtag.strip() not in ignore_hashtags

pairwise_dfs = []
top_pairwise_keys = set()

# For every topic, count how often each unordered pair of (valid) hashtags
# appears on the same video, show the 10 most frequent pairs, and keep the full
# per-topic counts for later concatenation.
# Missing topic labels are skipped: they match no rows and used to abort the
# loop with a column-length mismatch.
for topic in predictions_df['topic_btm'].dropna().unique():
    topic_pairwise_hashtag_dict = {}
    print(topic)
    # NOTE(review): dropna() discards rows with a missing value in *any* column,
    # not only `video_hashtags` — confirm that this is intended.
    topic_data = predictions_df.loc[predictions_df['topic_btm'] == topic].dropna()

    for video_hashtags in topic_data['video_hashtags']:
        # Strip first, then filter, so later comparisons see clean tags.
        valid_hashtags = []
        for hashtag1 in video_hashtags.split(','):
            hashtag1 = hashtag1.strip()
            if is_valid_hashtag(hashtag1):
                valid_hashtags.append(hashtag1)

        # Each unordered position pair is visited once and adds 1.0; this equals
        # the former "visit both orders, add .5 each" scheme. Comparing the
        # stripped tags stops 'a' and ' a' from being counted as a pair 'a/a'.
        for hashtag1, hashtag2 in itertools.combinations(valid_hashtags, 2):
            if hashtag1 == hashtag2:
                continue
            hashtag_pair = '/'.join(sorted((hashtag1, hashtag2)))
            topic_pairwise_hashtag_dict[hashtag_pair] = topic_pairwise_hashtag_dict.get(hashtag_pair, 0) + 1.0

    # Sorting, selecting the top 10 pairs, and appending the df to the list to be concatenated
    sorted_topic_pairwise_hashtag_dict = dict(
        sorted(topic_pairwise_hashtag_dict.items(), key=lambda item: item[1], reverse=True)
    )
    # Building the single column directly under the topic's name also works when
    # no pair was found (empty frame with one column).
    pairwise_df = pd.Series(sorted_topic_pairwise_hashtag_dict, dtype=float).to_frame(name=topic)
    pairwise_df_index = pairwise_df.iloc[:10]
    display(pairwise_df_index)
    top_pairwise_keys.update(pairwise_df_index.index)

    pairwise_dfs.append(pairwise_df)