# UM Data Exploration

In [1]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm

In [None]:
# Location of the processed UM dataset, relative to this notebook
DATASET_FILE = '../data/processed/um.csv'

# Names of the three subscore columns
subscore_cols = [
    'evidence',
    'suggestion',
    'connection',
]

In [None]:
# Read the dataset and give the unnamed leading column a meaningful name
# (presumably the row index written out when the CSV was saved - TODO confirm).
df = pd.read_csv(DATASET_FILE).rename(columns={'Unnamed: 0': 'file_index'})

# QuAL

## Exploration

Number of items at each QuAL level, including items with a missing (NaN) QuAL score

In [None]:
(
    df['qual']
    .value_counts(dropna=False)  # keep NaN as its own bucket
    .sort_index()                # order by QuAL level rather than by frequency
)

Descriptive statistics

In [None]:
# Summary statistics for QuAL, transposed so they are laid out horizontally
df['qual'].describe().reset_index().transpose()

Bar chart of item counts per QuAL level

In [None]:
# Bar chart of how many items received each QuAL level (NaN values are excluded)
plt.figure(figsize=(5, 3))
(
    df['qual']
    .value_counts()
    .sort_index()
    .plot(kind='bar')
)

Descriptives by clerkship

In [None]:
# QuAL summary statistics computed separately for each clerkship
df['qual'].groupby(df['clerkship']).describe()

Mean QuAL score by clerkship

In [None]:
plt.figure(figsize=(5, 3))
sns.barplot(
    data=df,
    x='clerkship',
    y='qual',
    hue='clerkship',
)
# Tilt the clerkship labels; assigning the result keeps it out of the cell output
_ = plt.xticks(rotation=35)

## Model Performance

In [None]:
# Columns used by the model-performance analysis
target_col = 'qual' # target
text_col = 'text' # text
# Placeholder written over texts that contain nothing but whitespace
white_space_replace = 'blank'

# Work on a copy so the exploration frame `df` is left untouched
dfa = df.copy()
print('DF original shape:', dfa.shape)

# Rows lacking either the target or the text cannot be scored
dfa = dfa.dropna(subset=[target_col, text_col])
print('DF new shape:', dfa.shape)

# Flag texts that are empty once surrounding whitespace is stripped
white_space_only = dfa[text_col].str.strip().str.len().eq(0)
print(f'There are {white_space_only.sum()} ({white_space_only.sum()/len(white_space_only)*100:.2f}%) whitespace-only texts - replacing with "{white_space_replace}"')
dfa.loc[white_space_only, text_col] = white_space_replace

# Renumber rows 0..n-1 so later positional joins against this frame line up
dfa = dfa.reset_index(drop=True)

In [None]:
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint shared by the tokenizer and the classification model
QUAL_MODEL_NAME = 'maxspad/nlp-qual-qual'

# Choose the accelerator at runtime instead of hard-coding Apple's 'mps' backend,
# which raises on machines without Metal support. On a machine where MPS is
# available the pipeline still runs on 'mps', exactly as before.
_mps_backend = getattr(torch.backends, 'mps', None)
if torch.cuda.is_available():
    device = 'cuda'
elif _mps_backend is not None and _mps_backend.is_available():
    device = 'mps'
else:
    device = 'cpu'

tokenizer = AutoTokenizer.from_pretrained(QUAL_MODEL_NAME, model_max_length=512)
model = AutoModelForSequenceClassification.from_pretrained(QUAL_MODEL_NAME)
pipe = pipeline(task='text-classification', model=model, tokenizer=tokenizer, device=device)

In [None]:
# Classify every text in the analysis frame, 16 texts per batch, with
# tokenizer truncation enabled for over-long inputs
res = pipe(
    dfa[text_col].tolist(),
    truncation=True,
    batch_size=16,
)

In [None]:
# The pipeline reports classes as 'LABEL_0' ... 'LABEL_5'; translate them to the
# integer QuAL levels 0-5 so they can be compared with the `qual` column.
label_to_qual = {f'LABEL_{level}': level for level in range(6)}

# Column-wise concat aligns on the index: `dfa` was reset to 0..n-1 and the
# normalised pipeline output carries a default 0..n-1 index, so rows line up.
# Series.map does the conversion explicitly instead of relying on
# DataFrame.replace's object-to-int downcasting, which pandas has deprecated.
# A label outside LABEL_0..LABEL_5 becomes NaN rather than staying a string.
dfr = (pd.concat([dfa, pd.json_normalize(res)], axis=1)
         .assign(label=lambda d: d['label'].map(label_to_qual))
         .rename(columns={
             'label': 'qual_pred_label',
             'score': 'qual_pred_score'
         }))

In [None]:
# Keep only the text, the true QuAL score and the model's prediction/confidence
blah = dfr.loc[:, ['text', 'qual', 'qual_pred_label', 'qual_pred_score']]

In [None]:
# Accuracy: fraction of rows where the predicted label equals the true QuAL score
blah['qual'].eq(blah['qual_pred_label']).mean()

# Subscores

Number of subscores available

In [None]:
# Non-missing values per subscore column; reuses the shared `subscore_cols`
# list from the config cell instead of repeating the column names here.
df[subscore_cols].count()

Number of rated (non-missing) subscores per clerkship

In [None]:
# Per-clerkship number of non-missing ratings for each subscore. The built-in
# groupby count() replaces the per-group lambda, and the column list comes from
# the shared `subscore_cols` constant defined in the config cell.
df.groupby('clerkship')[subscore_cols].count()

## Evidence (Q1)

Distribution of Evidence (Q1) subscore

In [None]:
# Subscore column analysed by the following cells of this section.
# `c` is reassigned in each subscore section, so run the cells in order.
c = 'evidence'

In [None]:
# Number of items at each level of the current subscore (NaN excluded)
(
    df[c]
    .value_counts()
    .sort_index()
)

In [None]:
# Summary statistics for the current subscore, laid out horizontally
df[c].describe().reset_index().transpose()

In [None]:
# Distribution of the current subscore
plt.figure(figsize=(5, 3))
sns.histplot(data=df[c])

In [None]:
# Per-clerkship summary of the current subscore, limited to clerkships that
# have at least one rating. Filtering on `count` (instead of dropna()) keeps
# clerkships with exactly one rating, whose `std` is NaN and which dropna()
# would silently discard along with the unrated ones.
(
    df.groupby('clerkship')[c]
    .describe()
    .loc[lambda stats: stats['count'] > 0]
)

In [None]:
# Bar plot of the current subscore by clerkship, using only the rated rows
plt.figure(figsize=(5, 3))
sns.barplot(
    data=df[df[c].notna()],
    x='clerkship',
    y=c,
    hue='clerkship',
)

## Suggestion (Q2)

Distribution of Suggestion (Q2) subscore

In [None]:
# Subscore column analysed by the following cells of this section.
# `c` is reassigned in each subscore section, so run the cells in order.
c = 'suggestion'

In [None]:
# Number of items at each level of the current subscore (NaN excluded)
(
    df[c]
    .value_counts()
    .sort_index()
)

In [None]:
# Summary statistics for the current subscore, laid out horizontally
df[c].describe().reset_index().transpose()

In [None]:
# Distribution of the current subscore
plt.figure(figsize=(5, 3))
sns.histplot(data=df[c])

In [None]:
# Per-clerkship summary of the current subscore, limited to clerkships that
# have at least one rating. Filtering on `count` (instead of dropna()) keeps
# clerkships with exactly one rating, whose `std` is NaN and which dropna()
# would silently discard along with the unrated ones.
(
    df.groupby('clerkship')[c]
    .describe()
    .loc[lambda stats: stats['count'] > 0]
)

In [None]:
# Bar plot of the current subscore by clerkship, using only the rated rows
plt.figure(figsize=(5, 3))
sns.barplot(
    data=df[df[c].notna()],
    x='clerkship',
    y=c,
    hue='clerkship',
)

## Connection (Q3)

Distribution of Connection (Q3) subscore

In [None]:
# Subscore column analysed by the following cells of this section.
# `c` is reassigned in each subscore section, so run the cells in order.
c = 'connection'

In [None]:
# Number of items at each level of the current subscore (NaN excluded)
(
    df[c]
    .value_counts()
    .sort_index()
)

In [None]:
# Summary statistics for the current subscore, laid out horizontally
df[c].describe().reset_index().transpose()

In [None]:
# Distribution of the current subscore
plt.figure(figsize=(5, 3))
sns.histplot(data=df[c])

In [None]:
# Per-clerkship summary of the current subscore, limited to clerkships that
# have at least one rating. Filtering on `count` (instead of dropna()) keeps
# clerkships with exactly one rating, whose `std` is NaN and which dropna()
# would silently discard along with the unrated ones.
(
    df.groupby('clerkship')[c]
    .describe()
    .loc[lambda stats: stats['count'] > 0]
)

In [None]:
# Bar plot of the current subscore by clerkship, using only the rated rows
plt.figure(figsize=(5, 3))
sns.barplot(
    data=df[df[c].notna()],
    x='clerkship',
    y=c,
    hue='clerkship',
)