# Import Data

In [113]:
import os

import pandas as pd

In [114]:
# Load the checklist table from its CSV export (comma-delimited, which is the read_csv default)
data_path = 'data/table.csv'
data = pd.read_csv(data_path)

In [115]:
# Preview the first five rows of the table as loaded
data.head(n=5)

Unnamed: 0,Dataset,Source of Content,Target Issue,C1,C2,C3,C4,C5-a,C5-b,C5-c,...,C5-g,C5-h,C5-i,C6 - a,C6 - b,C6 - c,C6-d,C6-e,C7-a,C7-b
0,Chinese Multimodal Depression Corpus (CMDC),CI,DP,Yes,No,Yes,Yes,Yes,Yes,Yes,...,No,Yes,No,Yes,Yes,Yes,Yes,No,Yes,Yes
1,EMU,"CS, SR, SS","AX, DP",Yes,Yes,Yes,Yes,No,No,No,...,No,No,Yes,No,Yes,No,No,No,Yes,Yes
2,Moodable,"CS, SR",DP,Yes,Yes,No,No,No,No,No,...,No,No,Yes,No,No,No,No,No,No,No
3,Jiang et al.,CI,DP,No,No,No,Yes,Yes,Yes,Yes,...,No,No,No,No,No,Yes,No,No,Yes,No
4,Lin et al.,CI,"AX, DP",Yes,No,No,No,Yes,Yes,Yes,...,No,Yes,Yes,No,No,No,No,No,Yes,No


In [116]:
# Encode the checklist answers numerically: 'Yes' -> 1 and 'No' -> 0.
# DataFrame.replace with a mapping swaps every matching cell; the frame is updated in place.
yes_no_codes = {'Yes': 1, 'No': 0}

data.replace(to_replace=yes_no_codes, inplace=True)


In [117]:
# Preview the first five rows after the Yes/No encoding
data.head(n=5)

Unnamed: 0,Dataset,Source of Content,Target Issue,C1,C2,C3,C4,C5-a,C5-b,C5-c,...,C5-g,C5-h,C5-i,C6 - a,C6 - b,C6 - c,C6-d,C6-e,C7-a,C7-b
0,Chinese Multimodal Depression Corpus (CMDC),CI,DP,1.0,0.0,1.0,1.0,1,1,1,...,0,1,0.0,1,1,1,1,0,1,1
1,EMU,"CS, SR, SS","AX, DP",1.0,1.0,1.0,1.0,0,0,0,...,0,0,1.0,0,1,0,0,0,1,1
2,Moodable,"CS, SR",DP,1.0,1.0,0.0,0.0,0,0,0,...,0,0,1.0,0,0,0,0,0,0,0
3,Jiang et al.,CI,DP,0.0,0.0,0.0,1.0,1,1,1,...,0,0,0.0,0,0,1,0,0,1,0
4,Lin et al.,CI,"AX, DP",1.0,0.0,0.0,0.0,1,1,1,...,0,1,1.0,0,0,0,0,0,1,0


In [118]:
# Turn every 0 into a missing value (pd.NA) so that count(), which tallies
# non-missing cells, no longer includes the 0 answers.
zero_to_missing = {0: pd.NA}
data.replace(to_replace=zero_to_missing, inplace=True)

data.count()

Dataset              36
Source of Content    36
Target Issue         36
C1                   20
C2                    9
C3                   10
C4                   21
C5-a                 29
C5-b                 25
C5-c                 23
C5-d                 12
C5-e                 24
C5-f                  2
C5-g                  2
C5-h                 13
C5-i                  7
C6 - a               12
C6 - b                8
C6 - c               17
C6-d                  8
C6-e                  1
C7-a                 28
C7-b                 17
dtype: int64

In [36]:
# Same per-column tallies, ordered from the least to the most frequent
column_counts = data.count()
column_counts.sort_values(ascending=True)

C6-e                  1
C5-f                  2
C5-g                  2
C5-i                  7
C6-d                  8
C6 - b                8
C2                    9
C3                   10
C5-d                 12
C6 - a               12
C5-h                 13
C7-b                 17
C6 - c               17
C1                   20
C4                   21
C5-c                 23
C5-e                 24
C5-b                 25
C7-a                 28
C5-a                 29
Target Issue         36
Source of Content    36
Dataset              36
dtype: int64

 # Target Issue Analysis

In [37]:

# Build one row per (dataset, target issue) pair.
#
# 'Target Issue' holds comma-separated codes (e.g. "AX, DP"). The codes are split
# into a list, stripped of surrounding whitespace, and exploded so that a dataset
# with several issues appears once per issue.
#
# The list column is attached with assign(), which returns a new frame, instead of
# being written back into `data`. Overwriting data['Target Issue'] with lists made
# this cell non-repeatable: on a second run astype(str) would stringify the lists
# themselves and the split would yield garbage codes.
target_issue_lists = (
    data['Target Issue']
    .astype(str)  # NOTE: a missing value would become the literal code 'nan'
    .str.split(',')
    .apply(lambda issues: [issue.strip() for issue in issues])
)

data_expanded = (
    data
    .assign(**{'Target Issue': target_issue_lists})
    .explode('Target Issue')
)

data_expanded.head()


Unnamed: 0,Dataset,Source of Content,Target Issue,C1,C2,C3,C4,C5-a,C5-b,C5-c,...,C5-g,C5-h,C5-i,C6 - a,C6 - b,C6 - c,C6-d,C6-e,C7-a,C7-b
0,Chinese Multimodal Depression Corpus (CMDC),CI,DP,1.0,,1.0,1.0,1.0,1.0,1.0,...,,1.0,,1.0,1.0,1.0,1.0,,1.0,1.0
1,EMU,"CS, SR, SS",AX,1.0,1.0,1.0,1.0,,,,...,,,1.0,,1.0,,,,1.0,1.0
1,EMU,"CS, SR, SS",DP,1.0,1.0,1.0,1.0,,,,...,,,1.0,,1.0,,,,1.0,1.0
2,Moodable,"CS, SR",DP,1.0,1.0,,,,,,...,,,1.0,,,,,,,
3,Jiang et al.,CI,DP,,,,1.0,1.0,1.0,1.0,...,,,,,,1.0,,,1.0,


In [38]:
# Per target issue, tally the non-missing entries of every checklist column.
# Positions 2 onward are 'Target Issue' followed by the checklist items.
issue_and_items = data_expanded.iloc[:, 2:]
issue_and_items.groupby('Target Issue').count()

Unnamed: 0_level_0,C1,C2,C3,C4,C5-a,C5-b,C5-c,C5-d,C5-e,C5-f,C5-g,C5-h,C5-i,C6 - a,C6 - b,C6 - c,C6-d,C6-e,C7-a,C7-b
Target Issue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
AL,3,2,0,3,5,4,4,3,2,1,0,1,1,2,2,2,0,1,3,3
AX,2,1,1,2,3,3,3,1,3,0,0,2,4,0,1,1,0,0,3,2
BD,2,1,1,3,4,2,2,1,2,0,0,1,0,1,0,4,2,0,2,1
DM,0,0,0,0,2,2,1,1,0,0,0,1,0,0,0,1,0,0,2,1
DP,11,4,7,9,14,11,9,4,14,1,0,7,6,4,6,8,6,0,13,8
P,2,0,0,3,3,3,3,1,3,0,2,1,0,2,0,2,0,0,3,2
S,1,1,2,2,0,2,3,2,3,0,0,1,0,2,0,0,0,0,4,2
SSD,1,1,0,1,1,1,1,0,1,0,0,1,0,1,0,0,0,0,1,0


In [39]:
# Denominator for turning the counts above into percentages:
# the number of exploded rows that carry each target issue.
target_issue_column = data_expanded['Target Issue']
total_rows = target_issue_column.value_counts()
total_rows

Target Issue
DP     17
AL      5
AX      4
BD      4
S       4
P       4
DM      2
SSD     1
Name: count, dtype: int64

In [52]:
# Checklist counts per target issue, transposed so that checklist items are rows
# and target issues are columns. Despite the variable name these are raw counts,
# not proportions, and no per-issue total row is added here.
proportions = data_expanded.iloc[:, 2:].groupby('Target Issue').count().T

# Save the table as a semicolon-separated CSV.
# - The output folder is created first; to_csv does not create missing folders.
# - The file gets its own name: the Source Analysis section also saves to
#   'output_files/proportions.csv', which overwrote this table.
os.makedirs('output_files', exist_ok=True)
proportions.to_csv('output_files/proportions_target_issue.csv', sep=';')

In [7]:
import os
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator

import numpy as np

# Create a folder for saving figures if it doesn't exist
output_folder = 'output_figures'
os.makedirs(output_folder, exist_ok=True)

# Checklist counts per target issue (rows: checklist items, columns: target issues).
# Despite the variable name these are raw counts, not proportions.
proportions = data_expanded.iloc[:, 2:].groupby('Target Issue').count().T

# Get unique Target Issues
unique_target_issues = data_expanded['Target Issue'].unique()

# Order the target issues by their mean count, largest first
sorted_target_issues = proportions.mean().sort_values(ascending=False).index

# Colormap with one entry per target issue. The bars below are drawn in plain black,
# so it is currently unused. plt.get_cmap replaces plt.cm.get_cmap, which is
# deprecated and no longer available in newer matplotlib releases.
colors = plt.get_cmap('tab20', len(sorted_target_issues))

# One horizontal bar chart per target issue.
# NOTE(review): the next plotting cell saves to the same file names, so its figures
# replace these when both cells are run.
for i, target_issue in enumerate(sorted_target_issues):
    fig, ax = plt.subplots(figsize=(10, 5))

    # Reverse the item order so the first checklist item ends up at the top
    # (barh draws the first entry at the bottom)
    target_issue_data = proportions[target_issue][::-1]
    target_issue_data.plot(kind='barh', width=0.8, ax=ax, color='black')

    # Set title and labels
    ax.set_title(f'Ethical Checklist Items Count for {target_issue}')
    ax.set_xlabel('Counts')
    ax.set_ylabel('Ethical Checklist Items')

    # Counts are integers: put a tick on every whole number
    ax.xaxis.set_major_locator(MultipleLocator(1))

    # Save through the figure handle so the right figure is written regardless of
    # which one pyplot considers current
    output_filename = os.path.join(output_folder, f'figure_{i + 1}_{target_issue}.png')
    fig.savefig(output_filename)

    plt.close(fig)  # Release this figure's resources before creating the next one


  colors = plt.cm.get_cmap('tab20', len(sorted_target_issues))


In [46]:
import os
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator

import numpy as np

# Create a folder for saving figures if it doesn't exist
output_folder = 'output_figures'
os.makedirs(output_folder, exist_ok=True)

# Checklist counts per target issue (rows: checklist items, columns: target issues).
# Despite the variable name these are raw counts, not proportions.
proportions = data_expanded.iloc[:, 2:].groupby('Target Issue').count().T

# Get unique Target Issues
unique_target_issues = data_expanded['Target Issue'].unique()

# Order the target issues by their mean count, largest first
sorted_target_issues = proportions.mean().sort_values(ascending=False).index

# Colormap with one entry per target issue. The bars below are drawn in plain black,
# so it is currently unused. plt.get_cmap replaces plt.cm.get_cmap, which is
# deprecated and no longer available in newer matplotlib releases.
colors = plt.get_cmap('tab20', len(sorted_target_issues))

# Number of exploded rows per target issue; used as the upper x-axis limit so every
# bar is read against the total for that issue. Computed once, outside the loop.
rows_per_target_issue = data_expanded['Target Issue'].value_counts()

# One horizontal bar chart per target issue (no title on these figures)
for i, target_issue in enumerate(sorted_target_issues):
    fig, ax = plt.subplots(figsize=(10, 5))

    # Reverse the item order so the first checklist item ends up at the top
    # (barh draws the first entry at the bottom)
    target_issue_data = proportions[target_issue][::-1]
    target_issue_data.plot(kind='barh', width=0.8, ax=ax, color='black')

    # Axis labels
    ax.set_xlabel('Counts')
    ax.set_ylabel('Checklist Items')

    # Counts are integers: put a tick on every whole number
    ax.xaxis.set_major_locator(MultipleLocator(1))

    # Scale the x-axis from 0 to the number of rows for this target issue
    ax.set_xlim(0, rows_per_target_issue[target_issue])

    # Save through the figure handle so the right figure is written regardless of
    # which one pyplot considers current
    output_filename = os.path.join(output_folder, f'figure_{i + 1}_{target_issue}.png')
    fig.savefig(output_filename)

    plt.close(fig)  # Release this figure's resources before creating the next one


  colors = plt.cm.get_cmap('tab20', len(sorted_target_issues))


# Source Analysis

In [119]:
# Build one row per (dataset, source of content) pair.
#
# 'Source of Content' holds comma-separated codes (e.g. "CS, SR, SS"). The codes
# are split into a list, stripped of surrounding whitespace, and exploded so that
# a dataset with several sources appears once per source.
#
# The list column is attached with assign(), which returns a new frame, instead of
# being written back into `data`. Overwriting data['Source of Content'] with lists
# made this cell non-repeatable: on a second run astype(str) would stringify the
# lists themselves and the split would yield garbage codes.
source_lists = (
    data['Source of Content']
    .astype(str)  # NOTE: a missing value would become the literal code 'nan'
    .str.split(',')
    .apply(lambda sources: [source.strip() for source in sources])
)

# Explode the 'Source of Content' column into separate rows
data_expanded = (
    data
    .assign(**{'Source of Content': source_lists})
    .explode('Source of Content')
)

data_expanded.head()

Unnamed: 0,Dataset,Source of Content,Target Issue,C1,C2,C3,C4,C5-a,C5-b,C5-c,...,C5-g,C5-h,C5-i,C6 - a,C6 - b,C6 - c,C6-d,C6-e,C7-a,C7-b
0,Chinese Multimodal Depression Corpus (CMDC),CI,DP,1.0,,1.0,1.0,1.0,1.0,1.0,...,,1.0,,1.0,1.0,1.0,1.0,,1.0,1.0
1,EMU,CS,"AX, DP",1.0,1.0,1.0,1.0,,,,...,,,1.0,,1.0,,,,1.0,1.0
1,EMU,SR,"AX, DP",1.0,1.0,1.0,1.0,,,,...,,,1.0,,1.0,,,,1.0,1.0
1,EMU,SS,"AX, DP",1.0,1.0,1.0,1.0,,,,...,,,1.0,,1.0,,,,1.0,1.0
2,Moodable,CS,DP,1.0,1.0,,,,,,...,,,1.0,,,,,,,


In [120]:
# Number of exploded rows per source of content
source_column = data_expanded['Source of Content']
source_column.value_counts()

Source of Content
CI     17
ST      8
SR      7
SS      7
CS      5
YT      2
SFT     1
Name: count, dtype: int64

In [121]:
# Remove the 'Target Issue' column before the per-source tallies.
# The result is reassigned instead of dropped in place, and errors='ignore' turns an
# already-missing column into a no-op, so re-running this cell no longer raises KeyError.
data_expanded = data_expanded.drop(columns='Target Issue', errors='ignore')
data_expanded.head()

Unnamed: 0,Dataset,Source of Content,C1,C2,C3,C4,C5-a,C5-b,C5-c,C5-d,...,C5-g,C5-h,C5-i,C6 - a,C6 - b,C6 - c,C6-d,C6-e,C7-a,C7-b
0,Chinese Multimodal Depression Corpus (CMDC),CI,1.0,,1.0,1.0,1.0,1.0,1.0,,...,,1.0,,1.0,1.0,1.0,1.0,,1.0,1.0
1,EMU,CS,1.0,1.0,1.0,1.0,,,,,...,,,1.0,,1.0,,,,1.0,1.0
1,EMU,SR,1.0,1.0,1.0,1.0,,,,,...,,,1.0,,1.0,,,,1.0,1.0
1,EMU,SS,1.0,1.0,1.0,1.0,,,,,...,,,1.0,,1.0,,,,1.0,1.0
2,Moodable,CS,1.0,1.0,,,,,,,...,,,1.0,,,,,,,


In [122]:
# Per source of content, tally the non-missing entries of every checklist column.
# Position 0 ('Dataset') is left out; positions 1 onward are 'Source of Content'
# followed by the checklist items.
source_and_items = data_expanded.iloc[:, 1:]
source_and_items.groupby('Source of Content').count()


Unnamed: 0_level_0,C1,C2,C3,C4,C5-a,C5-b,C5-c,C5-d,C5-e,C5-f,C5-g,C5-h,C5-i,C6 - a,C6 - b,C6 - c,C6-d,C6-e,C7-a,C7-b
Source of Content,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
CI,12,3,6,10,16,13,10,5,14,1,0,7,2,5,5,12,8,0,14,7
CS,3,3,2,3,3,3,2,2,3,1,0,2,5,0,2,1,0,0,3,2
SFT,1,0,0,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,1,0
SR,6,3,2,6,5,5,5,3,3,1,1,2,3,4,3,4,1,1,6,6
SS,2,3,1,4,6,4,3,1,2,0,1,2,2,2,2,3,0,0,5,4
ST,3,2,2,4,4,6,6,3,5,0,1,4,0,4,0,2,0,0,8,4
YT,0,0,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0


In [123]:
# Per-source checklist counts, transposed: checklist items as rows, sources as columns.
# The columns keep the label order produced by groupby; no ordering by frequency is applied.
counts_by_source = data_expanded.iloc[:, 1:].groupby('Source of Content').count()
proportions = counts_by_source.T
proportions.head()

Source of Content,CI,CS,SFT,SR,SS,ST,YT
C1,12,3,1,6,2,3,0
C2,3,3,0,3,3,2,0
C3,6,2,0,2,1,2,1
C4,10,3,1,6,4,4,0
C5-a,16,3,1,5,6,4,1


In [128]:
# Add a '# datasets' row holding the number of exploded rows per source of content.
# Assigning through .loc with a new label places the row at the bottom of the table;
# the counts are matched to the columns by source label.
datasets_per_source = data_expanded['Source of Content'].value_counts()
proportions.loc['# datasets'] = datasets_per_source
proportions.head(30)

Source of Content,CI,CS,SFT,SR,SS,ST,YT
C1,12,3,1,6,2,3,0
C2,3,3,0,3,3,2,0
C3,6,2,0,2,1,2,1
C4,10,3,1,6,4,4,0
C5-a,16,3,1,5,6,4,1
C5-b,13,3,1,5,4,6,0
C5-c,10,2,1,5,3,6,1
C5-d,5,2,1,3,1,3,0
C5-e,14,3,1,3,2,5,1
C5-f,1,1,0,1,0,0,0


In [125]:
# Save the per-source table as a semicolon-separated CSV.
# The output folder is created first because to_csv does not create missing folders.
os.makedirs('output_files', exist_ok=True)
proportions.to_csv('output_files/proportions.csv', sep=';')