In [1]:
# This notebook is for Peem's attempt to fit all models using labelled data.
# Last updated: 18th Feb 2022. 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import altair as alt
pd.options.display.float_format = '{:20,.2f}'.format

# Executive Summary

This Jupyter notebook is separated into three parts:
1. Merging and cleaning multiple datasets. The PEMAT labels are used to calculate the understandability and actionability labels; the medical-information label already exists in the data.
2. Creating a dictionary that maps each PEMAT question (key) to its definition (value).
3. Performing EDA on 600 videos with PEMAT labels (see description in Part 2).

NOTE: I didn't remove entry 275 (URL = 'RQCq-FzeYgA') even though its PEMAT label is entirely NaN. Make sure to remove it in the final analysis. 

# Part 1: Cleaning and merging the dataset

In [2]:
# Load the content features.
df = pd.read_csv("content.csv", sep = ",")
# Load the labelled dataset. Row 0 of the file holds the real column names,
# so transpose, promote it to the index, and transpose back to make it the header.
df_label = pd.read_csv("label600.csv", sep = ",", encoding="ISO-8859-1")
df_label = df_label.T.set_index(0).T
df_label = df_label.dropna(axis = 1, how = "all")
# Load the separate metadata and engagement tables.
df_meta = pd.read_csv("metadata.csv", sep = ",")
df_engagement = pd.read_csv("engagement.csv", sep = ",")

# Merge content, metadata, and engagement without duplicating columns.
# Ordered lists are used instead of sets: indexing a DataFrame with a set is
# rejected by recent pandas, and set iteration order would make the merged
# column order change from run to run.
first = list(df.columns)
seen = set(first)
second = [col for col in df_meta.columns if col not in seen]
seen.update(second)
third = [col for col in df_engagement.columns if col not in seen]
final = first + second + third

# Concatenate side by side. pd.concat(axis=1) aligns rows on the index, which
# here is the default RangeIndex of each file.
# NOTE(review): this assumes the three CSVs list the videos in the same row order — confirm.
dftemp = pd.concat([df, df_meta[second], df_engagement[third]], axis = 1)
df = dftemp

In [3]:
# TODO: Find how many topics there are, do the connection. 
# DO the weighting. Linked by at least one video.
# Display the topic-ID column of the merged frame for inspection.
df["relevantTopicIds"]

0                    ['/m/01k8wb', '/m/098wr', '/m/098wr']
1                                 ['/m/098wr', '/m/098wr']
2                                 ['/m/098wr', '/m/098wr']
3        ['/m/019_rr', '/m/0kt51', '/m/019_rr', '/m/0kt...
4                               ['/m/01k8wb', '/m/01k8wb']
                               ...                        
11138    ['/m/02wbm', '/m/019_rr', '/m/019_rr', '/m/02w...
11139    ['/m/02wbm', '/m/019_rr', '/m/098wr', '/m/098wr']
11140    ['/m/02wbm', '/m/019_rr', '/m/019_rr', '/m/02w...
11141                ['/m/019_rr', '/m/098wr', '/m/098wr']
11142                                         ['/m/098wr']
Name: relevantTopicIds, Length: 11143, dtype: object

In [4]:
# A function to clean the URL in the dataset. 
def get_id(full_link):
    """
    Extract the video ID from a link of the form '...watch?v=<id>'.

    The ``v`` query parameter is used when the link has one, so extra parameters
    (e.g. '&t=10s') or a different parameter order do not leak into the ID.
    Links without a ``v`` parameter fall back to the text between the first and
    second '=' (cut at the first '&'), which is what this function returned before.

    Raises:
        IndexError: if the link contains no '=' at all, so no ID can be extracted.
    """
    from urllib.parse import parse_qs, urlparse

    query = parse_qs(urlparse(full_link).query)
    if query.get("v"):
        return query["v"][0].strip()

    # Legacy fallback for strings that are not a URL with a 'v' parameter.
    pieces = full_link.split("=")
    if len(pieces) < 2:
        raise IndexError("no '=' in link, cannot extract a video ID: %r" % (full_link,))
    return pieces[1].split("&")[0].strip()

# A function to obtain mapping between PEMAT criterion and its value.
def PEMAT_map(df):
    """
    Build a dictionary mapping each PEMAT criterion number to its description.

    Every string column name is cut at its first '.': the text before it becomes
    the key and everything after it (inner periods included) becomes the value.
    A name without a '.' maps to an empty description. Column names that are not
    strings are skipped. The medical-knowledge column is stored under the key
    "info", added last.

    Args:
        df: DataFrame whose column names look like '<number>. <description>'.

    Returns:
        dict of key -> description, in column order, with "info" at the end.
    """
    info_column = 'This video contain high medical knowledge (0: low; 1: High)'
    PEMAT_dict = {}
    for col_name in df.columns.tolist():
        # Only string headers can be split. This replaces a bare `except: pass`
        # that also hid unrelated errors.
        if not isinstance(col_name, str):
            continue
        # partition keeps the rest of the header intact. Splitting on every '.'
        # and re-joining without it stripped all periods from the description.
        key, _, value = col_name.partition(".")
        if key != info_column:
            PEMAT_dict[key] = value
    # NOTE(review): the trailing "':" in this value looks like a paste artifact; left unchanged.
    PEMAT_dict["info"] = "This video contain high medical knowledge (0: low; 1: High)':"
    return PEMAT_dict

# Build the key -> description dictionary from the labelled frame's column names.
# PEMAT_dict is pickled and used for column renaming in later cells.
PEMAT_dict = PEMAT_map(df_label)

def greater05(value):
    """
    Binarise a score: return 1 when ``value`` is at least 0.5, otherwise 0.

    The threshold is inclusive (0.5 maps to 1). NaN compares False against 0.5,
    so a missing score maps to 0.
    """
    return 1 if value >= 0.5 else 0

In [5]:
# Saving the dictionary
import pickle 
# The context manager closes the file even if pickle.dump raises.
# d_file stays bound (closed) afterwards.
with open("PEMAT_dict.pkl", "wb") as d_file:
    pickle.dump(PEMAT_dict, d_file)

In [6]:
# Display the repr of the file object currently bound to d_file.
d_file

<_io.BufferedWriter name='PEMAT_dict.pkl'>

In [7]:
# Read the pickled dictionary back. The context manager closes the read handle,
# which was previously left open.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook wrote itself.
with open("PEMAT_dict.pkl", "rb") as d_file:
    output = pickle.load(d_file)

In [8]:
# Rename the label columns to the short PEMAT keys. Columns are paired with
# keys by position, so both sequences must have the same length.
original_col = df_label.columns.tolist()
pemat_keys = list(PEMAT_dict.keys())
assert len(original_col) == len(pemat_keys), (
    "df_label has %d columns but PEMAT_dict has %d keys; cannot pair them by position"
    % (len(original_col), len(pemat_keys))
)
temp = dict(zip(original_col, pemat_keys))
df_label = df_label.rename(columns = temp)

# Reduce each link in the URL column to its video ID.
# Series.map replaces the row-wise DataFrame.apply, since only one column is read.
# NOTE: not idempotent. Re-running this cell passes already-cleaned IDs to get_id.
df_label["URL"] = df_label["URL"].map(get_id)

# Part 1.1: Checking the distribution of PEMAT labels/medical information

In [9]:
# Two dataframes are in play: df_label (containing 600 labels) and df (containing 11000 labels).
# Derive the actionability and understandability labels from the PEMAT question columns.
actionable = [str(question) for question in (20, 21, 22, 25)]
understandable = [str(question) for question in (1, 3, 4, 5, 8, 9, 10, 11, 13, 14, 18, 19)]

# Convert the answers to numbers; anything unparseable becomes NaN.
for question_group in (actionable, understandable):
    df_label[question_group] = df_label[question_group].apply(pd.to_numeric, errors='coerce')

# Row-wise mean over each group of questions, ignoring missing answers.
action_mean = df_label[actionable].mean(axis = 1, skipna = True, numeric_only = True)
df_label["action"] = action_mean
understand_mean = df_label[understandable].mean(axis = 1, skipna = True, numeric_only = True)
df_label["understand"] = understand_mean

# Indicator: 1 when the mean is at least 0.5, otherwise 0 (a NaN mean also gives 0).
df_label["action"] = df_label["action"].map(greater05)
df_label["understand"] = df_label["understand"].map(greater05)

In [10]:
# Find columns with exactly one distinct non-null value: they cannot help prediction.
# Missing values are not counted, so "one value plus NaNs" is also treated as constant.
to_drop = [col for col in df.columns if df[col].nunique() == 1]
df = df.drop(columns = to_drop)
print("Below are the list of variables I dropped")
for dropped_name in to_drop:
    print(dropped_name)

Below are the list of variables I dropped
has_tags
has_title
audioTrackType
privacyStatus
favoriteCount
isCC
contentRating.ytRating
uploadStatus
language


In [11]:
# Save the full dataset without labels (the "12k" frame) after the single-valued columns were removed.
# to_csv is called with its defaults, so the DataFrame index is written as the first CSV column.
df.to_csv("merged_and_cleaned12k.csv")

# Part 1.3: Merging videos with PEMAT labels with overall 12k.
In this part, I merge df (12k) with df_label (600) based on URL. The result is a dataset of the 600 labelled videos along with all of their information.


In [12]:
# Perform the matching based on 'URL' in df_label and 'id' in df.

tempdf = df_label[["URL", "Duration","info", "action", "understand"]]

# Find all videos in the large dataset with labels. 
# Results: The dataset has some duplicate. 
tempdf = tempdf.set_index("URL")

# Index df by video ID so the join matches on it. The explicit checks replace a
# bare `except: pass`: a re-run (index already set) is still tolerated, but a
# frame with no video ID at all no longer joins silently on the wrong index.
if "video_id" in df.columns:
    df = df.set_index("video_id")
elif df.index.name != "video_id":
    raise KeyError("df has neither a 'video_id' column nor a 'video_id' index; cannot join the labels")

# Left join: every labelled row is kept, with NaN features when no video matches.
newdf = tempdf.join(df)

# Dropping duplicate entries and storing the cleaned dataset
newdf = newdf.reset_index()
# Only matters if the index lost its name, in which case reset_index emits 'index'.
newdf = newdf.rename(columns = {"index":"URL"})
newdf = newdf.drop_duplicates("URL")
# df and newdf are the same object from here on, so later column edits made
# through newdf are also visible through df.
df = newdf


In [13]:
# Convert every column whose values can all be interpreted as numbers; leave the rest untouched.
# pd.to_numeric(errors="ignore") is deprecated, so the "keep the column as-is on failure"
# behaviour is spelled out with an explicit try/except.
for col in newdf.columns.tolist():
    try:
        newdf[col] = pd.to_numeric(newdf[col])
    except (ValueError, TypeError):
        # At least one value is not numeric: keep the original column.
        continue
newdf["info"].value_counts()



1.00    421
0.00    199
Name: info, dtype: int64

In [14]:
# Save the labelled subset joined with its features.
# The final cell of the notebook writes to this same file name again.
df.to_csv("merged_and_cleaned600.csv")

In [24]:
# Display the ARI column of the labelled frame.
# NOTE(review): this cell's execution count (24) is out of sequence with its neighbours,
# so the output below may come from a different kernel state than a top-to-bottom run.
df["ARI"]

0                    14.52
2                     9.06
4                    21.83
5                    15.78
6                    12.51
              ...         
882                  19.30
883                   8.49
884                  16.95
885                   9.27
886                  20.22
Name: ARI, Length: 621, dtype: float64

# Part 2: Checking PEMAT input per questions

In the part above, I merged Xiao's datasets and created two versions: merged_and_cleaned with labels (600) and merged_and_cleaned without labels (11000). The remaining task would be to join them by row_id, if need be. However, it is also interesting to examine df_label itself to see how the answers to each question are distributed.

## Missing values
For most of the columns, there are only a few missing entries. The noteworthy columns with many missing values are 13, 18, 19, and 25, which correspond to the clarity of simple graphs, illustrations, etc. These questions may not be applicable to all diabetes videos. Therefore, there is nothing egregious about the missingness. 

## Strange values
Medical information (i.e. 'info') has a lot of nonsensical values. 

## Observations
1. 75% of the videos are understandable (1-19), 25% are not. A simple rule of outputting 1 (i.e. every video is understandable) will achieve 75% accuracy, so **need to think about alternative metrics**. 47% of videos are actionable (corresponding to Question 20,21,22,25).

2. Because some PEMAT questions have NaN, I cannot calculate the correlation between each of the response questions.
TODO: Ask Larry/Nynke if there's a need to calculate the correlation.

3. Actionability and understandability have the correlation coefficient of 0.128. This value is very low. 

4. Duration (unit: second) is skewed heavily to the right — there are some very lengthy videos. **Need to log-scale if included in the final model.**

5. PEMAT criteria with mostly zero entries are 8,9,19,22 (check PEMAT_dict). They all correspond to not breaking down information into small-chunks/actionable steps or lack of informative headers. **This could inform why videos we classify as zero are not understandable/actionable**. 

6. "11" (summary) and "25" (graphs/charts to take actions) do not receive good scores. 

In [15]:
# Count the missing values in every column of df_label (PEMAT responses and derived labels).
df_label.isna().sum()

0
URL             0
Title           0
Duration        0
1               1
3               1
4               1
5               1
8               4
9               5
10              1
11              3
13            231
14             38
18            200
19            387
20              2
21              2
22              2
25            342
info            1
action          0
understand      0
dtype: int64

In [16]:
# Distribution: min / max / median / mean of every column except the two text columns.
df_label["Duration"] = pd.to_numeric(df_label["Duration"])
fare = ["min", "max", "median", "mean"]
numerical_col = df_label.columns.tolist()
for text_column in ("Title", "URL"):
    numerical_col.remove(text_column)
# Every remaining column gets the same list of aggregations.
temp = dict.fromkeys(numerical_col, fare)
df_label.agg(temp)

Unnamed: 0,Duration,1,3,4,5,8,9,10,11,13,14,18,19,20,21,22,25,info,action,understand
min,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
max,6502.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,1.0
median,317.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
mean,522.88,0.83,0.78,0.63,0.8,0.18,0.16,0.8,0.32,0.77,0.89,0.8,0.22,0.49,0.52,0.18,0.41,,0.47,0.75


In [17]:
# Collect the columns at positions 3-18 of df_label (the PEMAT question columns)
# as rows of a 2-D array, ready for a correlation matrix.
temp = np.array([df_label[col].tolist() for col in df_label.columns.tolist()[3:19]])
# TODO: uncomment. np.corrcoef(temp)

# Correlation between the action and understand labels (row 0 = action, row 1 = understand).
action_values = df_label["action"].tolist()
understand_values = df_label["understand"]
act_under = np.array([action_values, understand_values])
np.corrcoef(act_under)

array([[1.        , 0.12826087],
       [0.12826087, 1.        ]])

In [26]:
# Remove the labelled videos whose medical-information ('info') label is missing.
# The filter uses the label itself rather than a hard-coded row label (275):
# that number was a position found in df_label, while df was re-indexed by the
# join and de-duplication and need not line up with it. Filtering is also safe
# to re-run, whereas dropping a fixed label raises once the row is gone.
df = df[df["info"].notna()]

In [49]:
# Index both frames by video ID so rows can be matched by label.
# NOTE(review): readability_df is not created in any cell visible in this notebook;
# it must be loaded before this cell runs.
# The two checks are independent. Previously both set_index calls shared one bare
# try/except, so a failure in the first silently skipped the second.
if "video_id" in readability_df.columns:
    readability_df = readability_df.set_index("video_id")
if "URL" in df.columns:
    df = df.set_index("URL")

# Re-assign readability indices for every video present in readability_df.
# The set gives constant-time membership tests instead of rebuilding a list per row.
readability_ids = set(readability_df.index)
readability_columns = {"ARI": "ari", "FleshReadingEase": "flesch", "Kincaid": "kincaid"}
for video_id in df.index:
    if video_id not in readability_ids:
        continue
    for target_col, source_col in readability_columns.items():
        df.loc[video_id, target_col] = readability_df.loc[video_id, source_col]

In [67]:
# Save all files: the PEMAT label frame and the labelled feature frame.
# "merged_and_cleaned600.csv" is also written by an earlier cell, so this call overwrites that file.
df_label.to_csv("rawPEMAT.csv")
df.to_csv("merged_and_cleaned600.csv")