In [None]:
import pandas as pd

# Show every float with exactly three decimal places in DataFrame/Series output.
pd.set_option('display.float_format', '{:.3f}'.format)

# Loading data

### Loading Whisper-transcribed data

In [None]:
# Read the transcription CSV, report its (rows, columns) shape and preview
# the first rows.
transcripted_audios_path = '../data/processed/transcripted_audios.csv'
df_transcripted_videos = pd.read_csv(transcripted_audios_path)
print(df_transcripted_videos.shape)
df_transcripted_videos.head()

### Parsed data

In [None]:
# Read the parsed-video CSV, report its (rows, columns) shape and preview
# the first rows.
parsed_videos_path = '../data/processed/transcripted_processed_videos.csv'
df_parsed_data = pd.read_csv(parsed_videos_path)
print(df_parsed_data.shape)
df_parsed_data.head()

### Selecting only Portuguese (PT) videos

In [None]:
# Keep only the rows whose detected language is 'pt'; rows with a missing
# language compare as False and are dropped as well.
is_pt = df_transcripted_videos['detected_language'].eq('pt')
df_transcripted_videos = df_transcripted_videos.loc[is_pt]

# Merging DataFrames

In [None]:
# Left join: every transcription row is kept and enriched with the parsed
# columns whose `id` equals the transcription's `audio_id`. Rows without a
# match get NaN in the columns coming from df_parsed_data.
df_transcripted_videos = df_transcripted_videos.merge(
    df_parsed_data,
    how='left',
    left_on='audio_id',
    right_on='id',
)

## Dealing with NaN

In [None]:
# Count missing values per column, largest first, and display only the
# columns that actually contain missing values.
null_values = (
    df_transcripted_videos
    .isna()
    .sum()
    .sort_values(ascending=False)
)
null_values.loc[null_values.gt(0)]

In [None]:
# Rows without a transcription are dropped; the optional text columns listed
# below get an empty string in place of NaN.
#
# The fill is done with a single DataFrame-level `fillna` and the result is
# assigned back. Calling `df[col].fillna(..., inplace=True)` operates on an
# intermediate Series: it raises a FutureWarning on pandas 2.x and, with
# Copy-on-Write enabled (the default from pandas 3.0), leaves the DataFrame
# unchanged.
empty_string_fill = {
    'stickers_on_video': '',
    'author_signature': '',
    'music_url': '',
    'music_author': '',
}
df_transcripted_videos = (
    df_transcripted_videos
    .dropna(subset=['transcription'])
    .fillna(value=empty_string_fill)
)

## Datetime columns

In [None]:
# `create_time` is interpreted as seconds since the Unix epoch, tagged as UTC
# and then converted to the America/Sao_Paulo time zone.
df_transcripted_videos['create_time'] = (
    pd.to_datetime(df_transcripted_videos['create_time'], unit='s')
    .dt.tz_localize('UTC')
    .dt.tz_convert('America/Sao_Paulo')
)

# Creating new columns

In [None]:
# Build the public URL of each video from the author's handle and the video id.
# NOTE(review): `id` comes from the right side of the left merge; if any row
# had no match the column may be float and render as e.g. '123.0' or 'nan'
# in the URL - confirm against the data.
author_handle = df_transcripted_videos['author_unique_id']
video_id_text = df_transcripted_videos['id'].astype(str)
df_transcripted_videos['video_url'] = (
    'https://www.tiktok.com/@' + author_handle + '/video/' + video_id_text
)

# Filtering columns

In [None]:
# Columns carried over to the cleaned dataset, in output order.
selected_columns = [
    # video
    'id', 'search_tag', 'create_time', 'desc', 'stickers_on_video',
    'hashtags', 'duration', 'is_duet_enabled', 'duet_from_id',
    # music
    'music_id', 'music_title', 'music_url', 'music_author',
    'music_is_original_audio',
    # engagement counters
    'digg_count', 'share_count', 'comment_count', 'play_count',
    # author
    'author_unique_id', 'author_nickname', 'author_avatar',
    'author_signature', 'author_is_verified', 'author_duet_setting',
    'author_following_count', 'author_followers_count',
    'author_heart_count', 'author_digg_count', 'author_heart',
    # location, URL and transcription
    'video_path', 'video_url', 'transcription',
    # audio content flags and music statistics
    'video_contains_music', 'video_contains_male', 'video_contains_female',
    'video_contains_noise', 'video_contains_no_energy', 'segments',
    'total_music_duration', 'percentage_of_video_made_of_music',
]

# Select the columns above (a KeyError is raised if any is missing) and
# preview the result.
df_processed = df_transcripted_videos.loc[:, selected_columns]
df_processed.head()

In [None]:
# List the columns present in the merged frame that were left out of
# df_processed.
print('Difference of columns of processed and transcripted dataset')
columns_left_out = set(df_transcripted_videos.columns) - set(df_processed.columns)
print(columns_left_out)

# Renaming columns

In [None]:
# Give the generic video columns explicit names. `errors='raise'` makes the
# call fail with a KeyError if any of the source columns is absent.
column_renames = {
    'hashtags': 'video_hashtags',
    'desc': 'video_desc',
    'duration': 'video_duration_in_sec',
}
df_processed.rename(columns=column_renames, inplace=True, errors='raise')

# Saving DataFrame

In [None]:
# Write the cleaned dataset to disk without the index column.
cleaned_dataset_path = '../data/processed/cleaned_transcripted_dataset.csv'
df_processed.to_csv(cleaned_dataset_path, index=False)