In [1]:
# This notebook is for Peem's attempt to fit all models using labelled data.
# Last updated: 18th Feb 2022. 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import altair as alt
# Render floats in pandas output right-aligned in a 20-character field,
# with a thousands separator and two decimal places.
pd.options.display.float_format = '{:20,.2f}'.format

# For each keyword, the top results are stored in the dataset. Cycle with loop = 50-47

# Part 1: Cleaning and merging the dataset

In [2]:
# Load the main content table.
df = pd.read_csv("content.csv", sep = ",")
# Load the labelled dataset. The row labelled 0 holds the real headers, so it is
# promoted to the column index (transpose -> set_index(0) -> transpose back),
# and columns that are entirely empty are dropped afterwards.
df_label = pd.read_csv("label600.csv", sep = ",", encoding="ISO-8859-1")
df_label = df_label.T.set_index(0).T
df_label = df_label.dropna(axis = 1, how = "all")
# Load the separate metadata and engagement tables.
df_meta = pd.read_csv("metadata.csv", sep = ",")
df_engagement = pd.read_csv("engagement.csv", sep = ",")

# Merge content, metadata, and engagement.
# Use set operations to work out which columns each extra table contributes,
# so that no column name appears twice in the merged frame.
first = set(df.columns)
second = set(df_meta.columns) - first
third = (set(df_engagement.columns) - second) - first

# Select with ordered lists rather than the sets themselves: recent pandas
# versions reject a set as an indexer, and a set has no stable order, so the
# merged column order would otherwise change from run to run.
meta_cols = [col for col in df_meta.columns if col in second]
engagement_cols = [col for col in df_engagement.columns if col in third]
final = list(df.columns) + meta_cols + engagement_cols

# Concatenate the three frames side by side.
# NOTE(review): axis=1 concatenation aligns rows on the index, so this assumes
# the three CSV files list the same videos in the same row order -- confirm.
dftemp = pd.concat([df, df_meta[meta_cols], df_engagement[engagement_cols]], axis = 1)
df = dftemp

In [3]:
# Spot-check the FleshReadingEase column of the merged frame.
df["FleshReadingEase"]

0                     -44.51
1                      44.83
2                      28.18
3                      13.90
4                      -0.24
                ...         
11138                  32.11
11139                  61.92
11140                  59.03
11141                 -53.05
11142                  44.83
Name: FleshReadingEase, Length: 11143, dtype: float64

In [4]:
# TODO: Find how many distinct topics there are and build the connections between them.
# TODO: Do the weighting (topics linked by at least one video).
# Preview the column; note in the output that the same id can repeat within one entry.
df["relevantTopicIds"]

0                    ['/m/01k8wb', '/m/098wr', '/m/098wr']
1                                 ['/m/098wr', '/m/098wr']
2                                 ['/m/098wr', '/m/098wr']
3        ['/m/019_rr', '/m/0kt51', '/m/019_rr', '/m/0kt...
4                               ['/m/01k8wb', '/m/01k8wb']
                               ...                        
11138    ['/m/02wbm', '/m/019_rr', '/m/019_rr', '/m/02w...
11139    ['/m/02wbm', '/m/019_rr', '/m/098wr', '/m/098wr']
11140    ['/m/02wbm', '/m/019_rr', '/m/019_rr', '/m/02w...
11141                ['/m/019_rr', '/m/098wr', '/m/098wr']
11142                                         ['/m/098wr']
Name: relevantTopicIds, Length: 11143, dtype: object

In [5]:
# A function to clean the URL in the dataset. 
def get_id(full_link):
    """
    Extract the YouTube video id from a video link.

    The id is read from the ``v`` query parameter, so links that carry extra
    parameters (``...watch?v=ID&t=42s``) or list ``v`` after another parameter
    still yield the bare id. Short ``youtu.be/ID`` links are supported as well.
    Surrounding whitespace is ignored.

    Parameters
    ----------
    full_link : str
        The full link to the video.

    Returns
    -------
    str
        The video id.

    Raises
    ------
    IndexError
        If the link contains no ``=`` and is not a recognised short link
        (same failure mode as the previous implementation).
    """
    from urllib.parse import parse_qs, urlparse

    link = full_link.strip()
    parsed = urlparse(link)

    # Standard watch links: the id is the value of the "v" query parameter,
    # wherever it sits in the query string.
    video_ids = parse_qs(parsed.query).get("v")
    if video_ids:
        return video_ids[0]

    # Short links: the id is the first path component.
    short_path = parsed.path.strip("/")
    if parsed.netloc.lower().endswith("youtu.be") and short_path:
        return short_path.split("/")[0]

    # Fallback: previous behaviour (text between the first and second "=").
    return link.split("=")[1]

# A function to obtain mapping between PEMAT criterion and its value.
def PEMAT_map(df):
    """
    Build a dictionary mapping a short key for each column to its description.

    Every text column name is split on "."; the part before the first "." is
    the key (the PEMAT criterion number for headers such as
    "1. The material ...") and the remaining parts, joined back together
    without the dots, form the description. A column name without a "."
    maps to an empty description.

    The medical-knowledge column is not stored under its long header; it is
    stored under the short key "info", always as the last entry.

    Column labels that are not strings are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column names are the headers to parse.

    Returns
    -------
    dict
        Keys in column order, followed by "info".
    """
    info_header = 'This video contain high medical knowledge (0: low; 1: High)'
    PEMAT_dict = {}
    for col_name in df.columns.tolist():
        # Non-text labels cannot be split; skip them explicitly instead of
        # hiding every possible error behind a bare except.
        if not isinstance(col_name, str):
            continue
        key, *rest = col_name.split(".")
        if key != info_header:
            PEMAT_dict[key] = "".join(rest)
    # The description previously ended with a stray "':" left over from the
    # quoted header; store the header text itself.
    PEMAT_dict["info"] = info_header
    return PEMAT_dict

# Build the key -> description mapping from the labelled dataset's column headers.
PEMAT_dict = PEMAT_map(df_label)

def greater05(value):
    """
    Indicator for ``value >= 0.5``.

    Returns 1 when the comparison holds and 0 otherwise. NaN compares false,
    so a NaN input yields 0.
    """
    return 1 if value >= 0.5 else 0

In [6]:
# Display the mapping returned by PEMAT_map.
PEMAT_dict

{'URL': '',
 'Title': '',
 'Duration': '',
 '1': ' The material makes its purpose completely evident (0,1)',
 '3': ' The material uses common, everyday language (0,1)',
 '4': ' Medical terms are used only to familiarize audience with the terms When used, medical terms are defined (0,1) ',
 '5': ' The material uses the active voice (0,1)',
 '8': ' The material breaks or \x93chunks\x94 information into short sections (0,1,N/A)',
 '9': ' The material\x92s sections have informative headers (0,1,N/A)',
 '10': ' The material presents information in a logical sequence (0,1) ',
 '11': ' The material provides a summary (0,1,N/A)',
 '13': ' Text on the screen is easy to read (0,1,N/A) ',
 '14': ' The material allows the user to hear the words clearly (eg, not too fast, not garbled) (0,1,N/A)',
 '18': ' The material uses illustrations and photographs that are clear and uncluttered (0,1,N/A) ',
 '19': ' The material uses simple tables with short and clear row and column headings (0,1,N/A) ',
 '20'

In [7]:
# Renaming the column: replace each long header with its short key.
# The mapping is positional -- the i-th column receives the i-th key of
# PEMAT_dict -- so both sequences must have the same length.
# NOTE(review): PEMAT_map always puts "info" last, so this pairing assumes the
# medical-knowledge column is the last column of df_label -- confirm.
original_col = df_label.columns.tolist()
new_col = list(PEMAT_dict.keys())
assert len(original_col) == len(new_col), (
    "df_label has %d columns but PEMAT_dict has %d keys"
    % (len(original_col), len(new_col))
)
temp = dict(zip(original_col, new_col))
df_label = df_label.rename(columns = temp)

# Obtaining the URL: replace each full link with the id extracted by get_id.
# Series.map applies get_id element by element without building a row object
# for every record.
df_label["URL"] = df_label["URL"].map(get_id)

# Part 1.1: Checking the distribution of PEMAT labels/medical information

In [8]:
# Now, we have two dataframes: df_label (containing 600 labels), and df (containing 11000 labels).
# We want to calculate actionability and understandability.
# Item numbers (used as column labels) that feed each score.
actionable = [str(i) for i in [20,21,22,25]]
understandable = [str(i) for i in [1,3,4,5,8,9,10,11,13,14,18,19]]

for score_name, items in (("action", actionable), ("understand", understandable)):
    # Change the type from string to number; entries that cannot be parsed become NaN.
    df_label[items] = df_label[items].apply(pd.to_numeric, errors='coerce')

    # Row-wise mean over the items of this score, skipping NaN entries.
    item_mean = df_label[items].mean(axis = 1, skipna = True, numeric_only = True)

    # Indicator of whether the mean is at least 0.5 (greater05 uses >=).
    # A row whose items are all NaN has a NaN mean, which greater05 turns into 0.
    df_label[score_name] = item_mean.map(greater05)

In [9]:
# Perform the matching based on 'URL' in df_label and 'video_id' in df
tempdf = df_label[["URL", "Duration","info", "action", "understand"]]

# Find all videos in the large dataset with labels. 
# Results: The dataset has some duplicate. 
tempdf = tempdf.set_index("URL")
# Index the feature frame by video id; if "video_id" is not a column, df is
# joined as it is.
if "video_id" in df.columns:
    df = df.set_index("video_id")
# DataFrame.join keeps every row of tempdf (left join on the index).
newdf = tempdf.join(df)

# Dropping duplicate entries and storing the cleaned dataset
try:
    # Move the join key back into an ordinary column.
    newdf = newdf.reset_index()
except ValueError:
    # reset_index refuses to insert a column whose name already exists;
    # in that case keep the frame unchanged.
    pass
# If the index had no name, reset_index calls the new column "index".
newdf = newdf.rename(columns = {"index":"URL"})
newdf = newdf.drop_duplicates("URL")
df = newdf


In [10]:
# Columns whose value_counts() has exactly one entry hold a single distinct
# value and therefore cannot help a prediction; collect them and drop them.
# (value_counts leaves out missing values by default, so a column with one
# distinct value plus NaNs is dropped too.)
# NOTE(review): only `df` is rebound here; `newdf`, which the following cells
# convert, display and save, still contains these columns -- confirm intended.
to_drop = [col for col in df.columns if len(df[col].value_counts()) == 1]
df = df.drop(columns = to_drop)
print("Below are the list of variables I dropped")
if to_drop:
    print("\n".join(str(col) for col in to_drop))

Below are the list of variables I dropped
has_tags
has_title
favoriteCount
contentRating.ytRating
audioTrackType
isCC
language
privacyStatus
uploadStatus
contentCaption


In [11]:
# Convert every feature whose values can be interpreted as numbers.
# pd.to_numeric(..., errors="ignore") is deprecated, so the conversion failure
# is caught explicitly and the column is left as it is.
for col in newdf.columns.tolist():
    try:
        newdf[col] = pd.to_numeric(newdf[col])
    except (ValueError, TypeError):
        # Not a numeric column; keep the original values.
        pass
# Frequency of each value of the medical-information label.
newdf["info"].value_counts()

 1.00    207
 0.00    109
 8.00     39
11.00     36
 9.00     36
10.00     32
 7.00     27
12.00     27
 6.00     20
14.00     18
 4.00     17
 5.00     14
13.00     14
15.00     12
 2.00      6
 3.00      5
16.00      1
Name: info, dtype: int64

In [13]:
# Display the merged labelled frame after de-duplication and numeric conversion.
newdf

Unnamed: 0,URL,Duration,info,action,understand,ARI,FleshReadingEase,Kincaid,active_verb,has_description,...,isAutoSynced,negative_comment_count,keyword_decription_cosine,comment_total_words,comment_unique_words,postive_comment_count,comment_description_cosine,keyword_title_cosine,comment_title_cosine,neutral_comment_count
0,-32U9eU1hdM,310,1.00,0,1,14.52,28.51,13.75,59.00,1.00,...,0.00,4.00,0.02,290.00,257.00,14.00,2.53,0.00,0.71,4.00
2,-71ld0iqAq8,602,12.00,0,1,9.06,52.87,8.75,31.00,1.00,...,0.00,0.00,0.10,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4,-8HmZjScLe0,732,5.00,0,0,21.83,70.67,10.03,42.00,1.00,...,0.00,5.00,0.00,3603.00,2789.00,90.00,3.22,0.28,2.46,5.00
5,-Aj5BTnz-v0,559,9.00,1,1,15.78,25.99,13.21,23.00,1.00,...,0.00,3.00,0.02,261.00,234.00,9.00,1.23,0.28,0.45,6.00
6,-IaysvX1L8U,246,1.00,0,1,12.51,45.13,11.97,16.00,1.00,...,1.00,1.00,0.04,104.00,95.00,3.00,0.27,0.19,0.11,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
882,zLbAxztnnfE,163,0.00,0,0,19.30,19.81,16.74,77.00,1.00,...,0.00,0.00,0.02,0.00,0.00,0.00,0.00,0.00,0.00,0.00
883,zPdwsCTp4gM,751,11.00,0,1,8.49,65.03,7.58,6.00,1.00,...,0.00,1.00,0.00,70.00,54.00,1.00,0.09,0.47,0.00,0.00
884,zU-5GcqzHNM,819,8.00,1,1,16.95,62.72,12.56,17.00,1.00,...,0.00,36.00,0.14,5239.00,3715.00,47.00,8.14,0.07,3.46,17.00
885,z_3S2_41_FE,300,1.00,0,1,9.27,75.98,7.75,32.00,1.00,...,0.00,29.00,0.18,2684.00,2178.00,38.00,8.46,0.00,5.21,33.00


In [14]:
# Write the merged labelled frame to disk. No index argument is passed, so the
# row index is written as the first (unnamed) column of the file.
newdf.to_csv("merged_and_cleaned600.csv")

# List of text and miscellaneous
'captid': ?
'captsLastUpdated': 
'categoryId': Denotes the type of each Youtube video, such as person, blog, science. See Youtube API for more info
'channelCommentCount': Counts how many comments there are. 
'channelDescription': A string of words describing each Youtube video. 
'channelId': Id of the channel posting the Youtube video. Measures how "prolific" a Youtube channel is. 
'channelPublishedat': Time the channel was established. 
'channelSubscriberCount': How many subscribers a channel has.
'channelTitle': Name of the channel (e.g., All about Diabetes and Related). 
'channelVideoCount': How many videos a channel has posted (NOTE: Some channels have 
'channelViewCount': How many views the channel that posted the video has received. The value counts of this column denote the number of videos made by that channel in this dataset. 
'comment': A list of comments made about each video, separated by ,.
'commentCount': How many comments have been made on each video.
'contentCaption': A Boolean variable saying whether a video has caption or not.
'contentDefinition': Consists of two types: hd (high definition) or sd (standard definition). 68% are hd.
'contentDimension': 2d or 3d. Every video except 1 video is in 2d (i.e. DROP).
'contentDuration': Duration of the video, denoted in PT'X'M(Min)'Y'(Sec)
'contentLicensed': Whether the content on that channel is licensed. If a video is licensed, it cannot be used for commercial purposes without the permission of the video creator. 67% of the videos are not licensed.
'description': Description of videos (what you see below the screen). E.g., importance of lipid metabolism. 
'dislikeCount': How many dislikes a video receives.
'embeddable': ? Only 1% false; otherwise all true.
'id': id of each video (copy-paste this onto Youtube). 
'isAutoSynced':  67% false. 1% true. Otherwise NaN.
'keyword': Keyword used to search for that video
'license': Is the video licensed by Youtube or Creative Common? 
'likeCount': How many likes a video receives. 
'publicStatsViewable': 93% True; 6% false.
'publishedAt': Date and time a video is published. 
'rank': 1-50 cyclic.
'relevantTopicIds': Topic ids created by Youtube based on knowledge graph.
'subtitle': What is being said in the video (i.e., the transcript). Only 500 videos have the caption (unless I'm doing something crazy).
'title': Title of the video
'topicIds':
'trackKind': ASR and standard.
'viewCount':
'word_unique': Number of unique words in the transcript. 
'transition_words':
'video_duration': Duration of the video

# List of understandability index
'summary_words': How many summary words are said. 
 'active_verb', 
 'Kincaid':
 'sentence_count':
 'word_count':
 'FleshReadingEase': 
 'has_description': W
 'ARI', 
 
 # List of Engagement metric
       'comment_title_cosine',
       'postive_comment_count', 'video_id', 'negative_comment_count',
       'comment_description_cosine', 'keyword_title_cosine',
       'comment_unique_words', 'keyword_decription_cosine',
       'comment_total_words', 'neutral_comment_count'],

Unnamed: 0,URL,Title,Duration,1,3,4,5,8,9,10,...,14,18,19,20,21,22,25,info,action,understand
1,sQ_BFNGg0CU,How Glitazones Work,60,1.00,0.00,0.00,0.00,,,1.00,...,1.00,,,0.00,0.00,0.00,,1,0,1
2,yvVYwkV1Buo,What is Persistent Diarrhea?,60,1.00,1.00,1.00,1.00,,,1.00,...,1.00,,,1.00,1.00,0.00,,0,1,1
3,t9o7jGViy4w,DPP4 inhibitors - Overview,61,1.00,0.00,0.00,0.00,0.00,0.00,1.00,...,1.00,,,1.00,1.00,0.00,,0,1,0
4,kvQkyNOv4Rs,Health Update (Beta Mix) - Captain Novolin,61,0.00,1.00,1.00,0.00,0.00,0.00,0.00,...,1.00,,,0.00,0.00,0.00,,0,0,0
5,DUlLEmobwys,Active Kids. Active Adults. - Physical Activity,61,1.00,1.00,1.00,1.00,0.00,0.00,1.00,...,1.00,,,1.00,1.00,0.00,,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
617,_CUOjdA-VG0,Sugar and the beating heart: the conundrum of ...,3771,1.00,0.00,1.00,1.00,1.00,1.00,1.00,...,1.00,1.00,1.00,0.00,0.00,0.00,,11,0,1
618,WpSSVwKYsv0,Mercodia Webinar Dr Ralph DeFronzo Glucagon Re...,4467,1.00,0.00,0.00,1.00,1.00,1.00,1.00,...,1.00,1.00,,0.00,1.00,0.00,1.00,11,1,1
619,KYQ2dPW5nEU,20. Human Genetics; SNPs; and Genome Wide Asso...,4677,1.00,0.00,1.00,1.00,1.00,1.00,1.00,...,1.00,1.00,,0.00,1.00,0.00,0.00,10,0,1
620,FHjK1dwlpm0,IPPCR 2015: Clinical Research from the Patient...,5498,1.00,1.00,1.00,1.00,1.00,1.00,1.00,...,1.00,1.00,,1.00,1.00,1.00,1.00,15,1,1


# Part 2: Checking PEMAT input per questions

In the above part, I have merged Xiao's datasets and created two versions: merged_and_cleaned with labels (600) and merged_and_cleaned without labels (11000). The remaining task would be to join them by row_id, if need be. However, it is interesting to examine df_label itself to see what the distribution of answers to each question is.

## Missing values
For most of the columns, there are only a few missing entries. The noteworthy columns with many missing values are 13, 18, 19, 25, which correspond to the clarity of simple graphs/illustrations/etc. These questions may not be applicable to all diabetes videos. Therefore, there's nothing egregious about the missingness. 

## Strange values
Medical information (i.e. 'info') has a lot of nonsensical values: the label is defined as 0 (low) or 1 (high), but values from 2 up to 16 also appear. 

## Observations
1. 75% of the videos are understandable, 25% are not. A simple rule of outputting 1 (i.e. every video is understandable) will achieve 75% accuracy, so **need to think about alternative metrics**. 

2. Because some PEMAT questions have NaN, I cannot calculate the correlation between each of the response questions.
TODO: Ask Larry/Nynke if there's a need to calculate the correlation.

3. Actionability and understandability have a correlation coefficient of 0.128. This value is very low. 

4. Duration (unit: second) is skewed heavily to the right — there are some very lengthy videos. **Need to log-scale if included in the final model.**

5. PEMAT criteria with mostly zero entries are 8,9,19,22 (check PEMAT_dict). They all correspond to not breaking down information into small-chunks/actionable steps or lack of informative headers. **This could inform why videos we classify as zero are not understandable/actionable**. 

6. "11" (summary) and "25" (graphs/charts to take actions) do not receive good scores. 

In [None]:
# Checking missing values in each column: number of NaN entries per column of df_label.
df_label.isna().sum()

In [112]:
# Distribution: min / max / median / mean of every df_label column except the
# two text columns (Title and URL).
df_label["Duration"] = pd.to_numeric(df_label["Duration"])
numerical_col = list(df_label.columns)
for text_col in ("Title", "URL"):
    numerical_col.remove(text_col)
fare = ["min", "max", "median", "mean"]
# Request the same list of statistics for every remaining column.
temp = {col: fare for col in numerical_col}
df_label.agg(temp)

Unnamed: 0,Duration,1,3,4,5,8,9,10,11,13,14,18,19,20,21,22,25,info,action,understand
min,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
max,6502.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,1.0
median,317.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
mean,522.88,0.83,0.78,0.63,0.8,0.18,0.16,0.8,0.32,0.77,0.89,0.8,0.22,0.49,0.52,0.18,0.41,,0.47,0.75


In [114]:
# Gather the columns at positions 3-18 of df_label (one row per column) as
# input for a correlation matrix between the individual questions.
temp = np.array([df_label[col].tolist() for col in df_label.columns.tolist()[3:19]])
# TODO: uncomment. np.corrcoef(temp)

# Correlation between the action and understand indicators
# (row 0 = action, row 1 = understand).
act_under = np.array([df_label[name].tolist() for name in ("action", "understand")])
np.corrcoef(act_under)

array([[1.        , 0.12826087],
       [0.12826087, 1.        ]])

In [124]:
# Look up the description stored for criterion 25.
PEMAT_dict["25"]

' The material explains how to use the charts, graphs, tables, or diagrams to take actions (0,1,N/A)'

In [51]:
# After checking '13','14'

0
URL           False
Title         False
Duration      False
1              True
3              True
4              True
5              True
8              True
9              True
10             True
11             True
13             True
14             True
18             True
19             True
20             True
21             True
22             True
25             True
info           True
action        False
understand    False
Name: 276, dtype: bool

In [70]:
# List the column names of df_label after renaming and adding the two scores.
df_label.columns.tolist()

['URL',
 'Title',
 'Duration',
 '1',
 '3',
 '4',
 '5',
 '8',
 '9',
 '10',
 '11',
 '13',
 '14',
 '18',
 '19',
 '20',
 '21',
 '22',
 '25',
 'info',
 'action',
 'understand']