In [21]:
# This notebook is for Peem's attempt to fit all models using labelled data.
# Last updated: 18th Feb 2022. 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import altair as alt
pd.options.display.float_format = '{:20,.2f}'.format

# For each keyword, the top results are stored in the dataset. Cycle with loop = 50-47

# Part 1: Cleaning and merging the dataset

In [22]:
df = pd.read_csv("content.csv", sep = ",")
# Downloading the labelled dataset.
df_label = pd.read_csv("video_lst.csv", sep = ",", encoding="ISO-8859-1")
df_label = df_label.T.set_index(0).T
df_label = df_label.dropna(axis = 1, how = "all")
# Downloading the separate metadata and engagement. 
df_meta = pd.read_csv("metadata.csv", sep = ",")
df_engagement = pd.read_csv("engagement.csv", sep = ",")

# Merge metadata and engagement.
first = set(df.columns)
second = set(df_meta.columns)
third = set(df_engagement.columns)
second = second - first
third = (third - second) - first
final = list(first) + list(second) + list(third)
dftemp = pd.concat([df, df_meta[second], df_engagement[third]], axis = 1)
df = dftemp

In [23]:
# TODO: Find how many topics there are, do the connection. 
# DO the weighting. Linked by at least one video.
df["relevantTopicIds"]

0                    ['/m/01k8wb', '/m/098wr', '/m/098wr']
1                                 ['/m/098wr', '/m/098wr']
2                                 ['/m/098wr', '/m/098wr']
3        ['/m/019_rr', '/m/0kt51', '/m/019_rr', '/m/0kt...
4                               ['/m/01k8wb', '/m/01k8wb']
                               ...                        
11138    ['/m/02wbm', '/m/019_rr', '/m/019_rr', '/m/02w...
11139    ['/m/02wbm', '/m/019_rr', '/m/098wr', '/m/098wr']
11140    ['/m/02wbm', '/m/019_rr', '/m/019_rr', '/m/02w...
11141                ['/m/019_rr', '/m/098wr', '/m/098wr']
11142                                         ['/m/098wr']
Name: relevantTopicIds, Length: 11143, dtype: object

In [24]:
# A function to clean the URL in the dataset. 
def get_id(full_link):
    temp = full_link.split("=")
    return temp[1]

# A function to obtain mapping between PEMAT criterion and its value.
def PEMAT_map(df):
    """
    This function accepts the dataframe and returns the dictionary that maps each PEMAT criteria 
    number to its description.
    """
    PEMAT_dict = {}
    for col_name in df.columns.tolist():
        try:
            split = col_name.split(".")
            key = split[0]
            temp = ""
            if len(split) > 1:
                # Concatenate every remaining string
                for i in range(1, len(split)):
                    temp += str(split[i])
            value = temp
            if key != 'This video contain high medical knowledge (0: low; 1: High)':
                PEMAT_dict[key] = value         
        except:
            pass
    PEMAT_dict["info"] = "This video contain high medical knowledge (0: low; 1: High)':"
    return PEMAT_dict

# Test if the dictionary is obtained.
PEMAT_dict = PEMAT_map(df_label)

def greater05(value):
    if value >= 0.5:
        return 1
    return 0

In [25]:
PEMAT_dict

{'URL': '',
 'Title': '',
 'Duration': '',
 '1': ' The material makes its purpose completely evident (0,1)',
 '3': ' The material uses common, everyday language (0,1)',
 '4': ' Medical terms are used only to familiarize audience with the terms When used, medical terms are defined (0,1) ',
 '5': ' The material uses the active voice (0,1)',
 '8': ' The material breaks or \x93chunks\x94 information into short sections (0,1,N/A)',
 '9': ' The material\x92s sections have informative headers (0,1,N/A)',
 '10': ' The material presents information in a logical sequence (0,1) ',
 '11': ' The material provides a summary (0,1,N/A)',
 '13': ' Text on the screen is easy to read (0,1,N/A) ',
 '14': ' The material allows the user to hear the words clearly (eg, not too fast, not garbled) (0,1,N/A)',
 '18': ' The material uses illustrations and photographs that are clear and uncluttered (0,1,N/A) ',
 '19': ' The material uses simple tables with short and clear row and column headings (0,1,N/A) ',
 '20'

In [26]:
df_label

Unnamed: 0,URL,Title,Duration,"1. The material makes its purpose completely evident. (0,1)","3. The material uses common, everyday language. (0,1)","4. Medical terms are used only to familiarize audience with the terms. When used, medical terms are defined. (0,1)","5. The material uses the active voice. (0,1)","8. The material breaks or chunks information into short sections (0,1,N/A)","9. The materials sections have informative headers. (0,1,N/A)","10. The material presents information in a logical sequence. (0,1)","11. The material provides a summary. (0,1,N/A)","13. Text on the screen is easy to read. (0,1,N/A)","14. The material allows the user to hear the words clearly (e.g., not too fast, not garbled). (0,1,N/A)","18. The material uses illustrations and photographs that are clear and uncluttered. (0,1,N/A)","19. The material uses simple tables with short and clear row and column headings. (0,1,N/A)","20. The material clearly identifies at least one action the user can take. (0,1)","21. The material addresses the user directly when describing actions. (0,1)","22. The material breaks down any action into manageable, explicit steps. (0,1)","25. The material explains how to use the charts, graphs, tables, or diagrams to take actions. (0,1,N/A)",This video contain high medical knowledge (0: low; 1: High)
1,https://www.youtube.com/watch?v=sQ_BFNGg0CU,How Glitazones Work,60,1,0,0,0,,,1,,,1,,,0,0,0,,1
2,https://www.youtube.com/watch?v=yvVYwkV1Buo,What is Persistent Diarrhea?,60,1,1,1,1,,,1,,,1,,,1,1,0,,0
3,https://www.youtube.com/watch?v=t9o7jGViy4w,DPP4 inhibitors - Overview,61,1,0,0,0,0,0,1,0,,1,,,1,1,0,,0
4,https://www.youtube.com/watch?v=kvQkyNOv4Rs,Health Update (Beta Mix) - Captain Novolin,61,0,1,1,0,0,0,0,0,,1,,,0,0,0,,0
5,https://www.youtube.com/watch?v=DUlLEmobwys,Active Kids. Active Adults. - Physical Activity,61,1,1,1,1,0,0,1,0,,1,,,1,1,0,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
617,https://www.youtube.com/watch?v=_CUOjdA-VG0,Sugar and the beating heart: the conundrum of ...,3771,1,0,1,1,1,1,1,1,1,1,1,1,0,0,0,,11
618,https://www.youtube.com/watch?v=WpSSVwKYsv0,Mercodia Webinar Dr Ralph DeFronzo Glucagon Re...,4467,1,0,0,1,1,1,1,1,1,1,1,,0,1,0,1,11
619,https://www.youtube.com/watch?v=KYQ2dPW5nEU,20. Human Genetics; SNPs; and Genome Wide Asso...,4677,1,0,1,1,1,1,1,0,1,1,1,,0,1,0,0,10
620,https://www.youtube.com/watch?v=FHjK1dwlpm0,IPPCR 2015: Clinical Research from the Patient...,5498,1,1,1,1,1,1,1,1,1,1,1,,1,1,1,1,15


In [27]:
# Renaming the column
temp = {}
original_col = df_label.columns.tolist()
assert len(original_col) == len(list(PEMAT_dict.keys()))
for i, replacer in enumerate(list(PEMAT_dict.keys())):
    temp[original_col[i]] = replacer
df_label = df_label.rename(columns = temp)

# Obtaining the URL
df_label["URL"] = df_label.apply(lambda row: get_id(row["URL"]), axis = 1)

# Part 1.1: Checking the distribution of PEMAT labels/medical information

In [28]:
# Now, we have two dataframes: df_label (containing 600 labels), and df (containing 11000 labels).
# We want to calculate actionability and understandability.
actionable = [str(i) for i in [20,21,22,25]]
understandable = [str(i) for i in [1,3,4,5,8,9,10,11,13,14,18,19]]

# Change the type from string to int. 
df_label[actionable] = df_label[actionable].apply(lambda x: pd.to_numeric(x, errors='coerce'))
df_label[understandable] = df_label[understandable].apply(lambda x: pd.to_numeric(x, errors='coerce'))

# Calculate the mean
df_label["action"] = df_label[actionable].mean(axis = 1, skipna = True, numeric_only = True)
df_label["understand"] = df_label[understandable].mean(axis = 1, skipna = True, numeric_only = True)

# Apply indicator function whether the mean is greater than 0.5
df_label["action"] = df_label.apply(lambda row: greater05(row["action"]), axis = 1)
df_label["understand"] = df_label.apply(lambda row: greater05(row["understand"]), axis = 1)

In [29]:
# Perform the matching based on 'URL' in df_label and 'id' in df
tempdf = df_label[["URL", "Duration","info", "action", "understand"]]

# Find all videos in the large dataset with labels. 
# Results: The dataset has some duplicate. 
tempdf = tempdf.set_index("URL")
try:
    df = df.set_index("video_id")
except:
    pass
newdf = tempdf.join(df)

# Dropping duplicate entries and storing the cleaned dataset
try:
    newdf.reset_index(inplace = True)
except:
    pass 
newdf = newdf.rename(columns = {"index":"URL"})
newdf = newdf.drop_duplicates("URL")
df = newdf


In [30]:
# Checking if there's any column with only one value (i.e. if every element is the same, cannot be used.)
to_drop = []
for i, col in enumerate(df.columns):
    if df[col].value_counts().shape[0] == 1: # Only one type, so cannot be used for prediction. 
        to_drop.append(col)
df = df.drop(columns = to_drop)
print("Below are the list of variables I dropped")
for col in to_drop:
    print(col)

Below are the list of variables I dropped
has_tags
has_title
language
contentRating.ytRating
privacyStatus
audioTrackType
contentCaption
uploadStatus
favoriteCount
isCC


In [31]:
# Convert every features whose values can be interpreted as numbers.
for col in newdf.columns.tolist():
    newdf[col] = pd.to_numeric(newdf[col], errors = "ignore")
newdf["info"].value_counts()

 1.00    207
 0.00    109
 8.00     39
11.00     36
 9.00     36
10.00     32
 7.00     27
12.00     27
 6.00     20
14.00     18
 4.00     17
 5.00     14
13.00     14
15.00     12
 2.00      6
 3.00      5
16.00      1
Name: info, dtype: int64

In [32]:
newdf.to_csv("merged_and_cleaned.csv")

# List of text and miscellaneous
'captid': ?
'captsLastUpdated': 
'categoryId': Denotes the type of each Youtube video, such as person, blog, science. See Youtube API for more info
'channelCommentCount': Counts how many comments there are. 
'channelDescription': A string of words describing each Youtube video. 
'channelId': Id of the channel posting the Youtube video. Measures how "prolific" a Youtube channel is. 
'channelPublishedat': Time the channel was established. 
'channelSubscriberCount': How many subscribers a channel has.
'channelTitle': Name of the channel (e.g., All about Diabetes and Related). 
'channelVideoCount': How many videos a channel has posted (NOTE: Some channels have 
'channelViewCount': How many views a channel posting that particular video have received. Its value counts denote in this dataset denote the number of videos made by that channel in this dataset. 
'comment': A list of comments made about each video, separated by ,.
'commentCount': How many comments have been made on each video.
'contentCaption': A Boolean variable saying whether a video has caption or not.
'contentDefinition': Consists of two types: hd (high definition) or sd (standard definition). 68% are hd.
'contentDimension': 2d or 3d. Every video except 1 video is in 2d (i.e. DROP).
'contentDuration': Duration of the video, denoted in PT'X'M(Min)'Y'(Sec)
'contentLicensed': Whether the content on that channel is licensed. If a video is licensed, it cannot be used for commercial purposes without the permission of the video creator. 67% of the videos are not licensed.
'description': Description of videos (what you see below the screen). E.g., importance of lipid metabolism. 
'dislikeCount': How many dislikes a video receives.
'embeddable': ? Only 1% false; otherwise all true.
'id': id of each video (copy-paste this onto Youtube). 
'isAutoSynced':  67% false. 1% true. Otherwise NaN.
'keyword': Keyword used to search for that video
'license': Is the video licensed by Youtube or Creative Common? 
'likeCount': How many likes a video receives. 
'publicStatsViewable': 93% True; 6% false.
'publishedAt': Date and time a ivdeo is published. 
'rank': 1-50 cyclic.
'relevantTopicIds': Topic ids created by Youtube based on knowledge graph.
'subtitle': What is being said in the video (i.e., the transcript). Only 500 videos have the caption (unless I'm doing something crazy).
'title': Title of the video
'topicIds':
'trackKind': ASR and standard.
'viewCount':
'word_unique': Number of unique words in the transcript. 
'transition_words':
'video_duration': Duration of the video

# List of understandability index
'summary_words': How many summary words are said. 
 'active_verb', 
 'Kincaid':
 'sentence_count':
 'word_count':
 'FleshReadingEase': 
 'has_description': W
 'ARI', 
 
 # List of Engagement metric
       'comment_title_cosine',
       'postive_comment_count', 'video_id', 'negative_comment_count',
       'comment_description_cosine', 'keyword_title_cosine',
       'comment_unique_words', 'keyword_decription_cosine',
       'comment_total_words', 'neutral_comment_count'],

# Part 2: Building prediction models

In [33]:
# Now, we have cleaned dataset with all the labels.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

X = df[['summary_words', 'active_verb', 'Kincaid','sentence_count','word_count','FleshReadingEase','has_description',
 'ARI']].fillna(method = "bfill")
X = df["ARI"].fillna(method = "bfill")
y = df["understand"]
# info is dubious (it should be Boolean, but it's not). Actionability and understandability are clean.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [34]:
model = sm.Logit(y_train, X_train)
result = model.fit(method = "Newton")
result.summary()

Optimization terminated successfully.
         Current function value: 0.632771
         Iterations 6


0,1,2,3
Dep. Variable:,understand,No. Observations:,496.0
Model:,Logit,Df Residuals:,495.0
Method:,MLE,Df Model:,0.0
Date:,"Thu, 24 Feb 2022",Pseudo R-squ.:,-0.1533
Time:,20:24:06,Log-Likelihood:,-313.85
converged:,True,LL-Null:,-272.13
Covariance Type:,nonrobust,LLR p-value:,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ARI,0.0448,0.007,6.878,0.000,0.032,0.058


In [None]:
# TODO: Find variables to fit with logistic regression
def showHist(df, col_name):
    plt.plot

In [16]:
newdf["Duration"].tolist()

[310,
 602,
 732,
 559,
 246,
 103,
 188,
 81,
 86,
 83,
 311,
 310,
 242,
 806,
 98,
 92,
 71,
 105,
 61,
 1190,
 704,
 781,
 181,
 155,
 176,
 268,
 113,
 732,
 513,
 299,
 707,
 124,
 371,
 94,
 73,
 702,
 275,
 321,
 237,
 297,
 148,
 169,
 899,
 504,
 402,
 602,
 566,
 112,
 141,
 103,
 68,
 77,
 200,
 301,
 273,
 521,
 401,
 3361,
 441,
 362,
 756,
 562,
 548,
 411,
 71,
 517,
 97,
 2715,
 332,
 294,
 284,
 151,
 124,
 155,
 709,
 1674,
 76,
 74,
 205,
 113,
 3372,
 601,
 334,
 201,
 406,
 146,
 2168,
 403,
 297,
 262,
 151,
 228,
 238,
 238,
 292,
 225,
 736,
 561,
 95,
 272,
 288,
 262,
 722,
 208,
 296,
 495,
 610,
 228,
 620,
 312,
 296,
 81,
 317,
 98,
 3607,
 589,
 1864,
 345,
 765,
 162,
 294,
 892,
 610,
 476,
 415,
 123,
 61,
 158,
 988,
 346,
 564,
 81,
 199,
 89,
 166,
 3129,
 3495,
 130,
 183,
 624,
 655,
 136,
 583,
 275,
 5498,
 283,
 104,
 96,
 191,
 684,
 115,
 607,
 262,
 137,
 144,
 779,
 196,
 67,
 329,
 249,
 553,
 457,
 1594,
 142,
 501,
 557,
 210,
 288,
 19

In [35]:
df_label

Unnamed: 0,URL,Title,Duration,1,3,4,5,8,9,10,...,14,18,19,20,21,22,25,info,action,understand
1,sQ_BFNGg0CU,How Glitazones Work,60,1.00,0.00,0.00,0.00,,,1.00,...,1.00,,,0.00,0.00,0.00,,1,0,1
2,yvVYwkV1Buo,What is Persistent Diarrhea?,60,1.00,1.00,1.00,1.00,,,1.00,...,1.00,,,1.00,1.00,0.00,,0,1,1
3,t9o7jGViy4w,DPP4 inhibitors - Overview,61,1.00,0.00,0.00,0.00,0.00,0.00,1.00,...,1.00,,,1.00,1.00,0.00,,0,1,0
4,kvQkyNOv4Rs,Health Update (Beta Mix) - Captain Novolin,61,0.00,1.00,1.00,0.00,0.00,0.00,0.00,...,1.00,,,0.00,0.00,0.00,,0,0,0
5,DUlLEmobwys,Active Kids. Active Adults. - Physical Activity,61,1.00,1.00,1.00,1.00,0.00,0.00,1.00,...,1.00,,,1.00,1.00,0.00,,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
617,_CUOjdA-VG0,Sugar and the beating heart: the conundrum of ...,3771,1.00,0.00,1.00,1.00,1.00,1.00,1.00,...,1.00,1.00,1.00,0.00,0.00,0.00,,11,0,1
618,WpSSVwKYsv0,Mercodia Webinar Dr Ralph DeFronzo Glucagon Re...,4467,1.00,0.00,0.00,1.00,1.00,1.00,1.00,...,1.00,1.00,,0.00,1.00,0.00,1.00,11,1,1
619,KYQ2dPW5nEU,20. Human Genetics; SNPs; and Genome Wide Asso...,4677,1.00,0.00,1.00,1.00,1.00,1.00,1.00,...,1.00,1.00,,0.00,1.00,0.00,0.00,10,0,1
620,FHjK1dwlpm0,IPPCR 2015: Clinical Research from the Patient...,5498,1.00,1.00,1.00,1.00,1.00,1.00,1.00,...,1.00,1.00,,1.00,1.00,1.00,1.00,15,1,1
