# README

### Purpose of this notebook
- Perform EDA on comment data and applicant data.

### Steps
1. Read the comment data and applicant data.
2. Preprocess the data.
    - Calculate the length of the comments.
    - Aggregate talent field for applicants.
3. Generate word cloud for each talent.
4. Plot grade distribution for each year.
5. Calculate the score standard deviation under different grouping methods.
6. Draw the distribution of the grade difference.

# Import Library

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import ImageGrid
from wordcloud import WordCloud

from importlib import reload

import os
import math

# Utility variable
import sys
sys.path.insert(0, '../..')

# var
import var.path as P
import var.var as V

# utils
import utils.data as D
import utils.preprocess as PP

In [None]:
## Configure matplotlib so Chinese (CJK) text and minus signs render correctly
plt.rcParams.update({
    'font.family': 'sans-serif',
    'font.sans-serif': ['Noto Sans CJK TC'],
    # Draw the minus sign with an ASCII hyphen instead of the Unicode minus glyph.
    'axes.unicode_minus': False,
})

## Read Latest Dataframe

In [None]:
# Load the comments dataframe through the project data helper and preview its first rows.
df_comments = D.read_df_comments()
df_comments.head()

In [None]:
# Load the tokenized comments dataframe through the project data helper and preview its first rows.
df_tokenized_comments = D.read_df_tokenized_comments()
df_tokenized_comments.head()

In [None]:
# Load the applicants dataframe through the project data helper and preview its first rows.
df_applicants = D.read_df_applicants()
df_applicants.head()

## Calculate the length of the comment

In [None]:
# Cast every comment to str so the length computation below always receives a string.
# Note: non-string values are replaced by their string form (e.g. a float NaN becomes 'nan').
df_comments['comment'] = df_comments['comment'].apply(str)

In [None]:
# length of application
df_comments['comment_length'] = df_comments['comment'].apply(
    lambda s: len(s) if not PP.is_empty_sent(s) else np.NaN
)

In [None]:
# Summary statistics of the comment lengths (NaN entries are excluded by describe()).
df_comments['comment_length'].describe()

# EDA

## Use talent and field to group students

### Group by year

In [None]:
# Group applicants by year; the next cell iterates over these per-year groups.
g_applicants_year = df_applicants.groupby('year')

In [None]:
# Print the summarized-talent value counts for every (year, group) combination.
for year, applicants_of_year in g_applicants_year:
    print("Year {}: ".format(year))

    for group, applicants_of_group in applicants_of_year.groupby('group'):
        print("Year {} Group {} Summarized Talent Distribution: ".format(year, group))
        print(applicants_of_group['summarized_talent'].value_counts())
        print("-"*50)

### Map talent to predefined field
- Only focus on the summarized talent labels from year 109 onward.

In [None]:
def summarized_talent_matching(s_talent):
    """Map a summarized talent label to its predefined field.

    The first entry of ``V.TALENT_LIST`` contained in ``s_talent`` decides the
    field, looked up through ``V.TALENT_TO_FIELD_MAPPING``.

    Args:
        s_talent: Summarized talent label. Missing values (``None`` or a
            float such as NaN) are tolerated and treated as unmatched.

    Returns:
        The matched field, or ``""`` when nothing matches. Unmatched labels
        are printed so they can be inspected.
    """
    # None and floats (e.g. NaN for a missing label) do not support the `in`
    # operator and used to raise TypeError; treat them as "no match" instead.
    if s_talent is None or isinstance(s_talent, float):
        print(s_talent)
        return ""

    # First talent (in V.TALENT_LIST order) that appears in the label wins.
    field = next(
        (
            V.TALENT_TO_FIELD_MAPPING[talent]
            for talent in V.TALENT_LIST
            if talent in s_talent
        ),
        "",
    )

    if field == "":
        print(s_talent)

    return field

In [None]:
# Derive each applicant's field from the summarized talent label ("" when nothing matches).
df_applicants["talent_field"] = df_applicants.summarized_talent.apply(summarized_talent_matching)

In [None]:
# Count matched (False) versus unmatched (True) applicants, i.e. those whose field is "".
(df_applicants["talent_field"] == "").value_counts()

In [None]:
# Write the applicants dataframe back out in the "csv" and "pkl" formats.
for file_format in ("csv", "pkl"):
    D.write_df_applicants(df_applicants, file=file_format)

## Generate word cloud for comments in each field

### Filter applicants
- Focus on data from year 109 - 111

In [None]:
# Boolean mask selecting applicants from year 109 onward.
# NOTE(review): only a lower bound is applied; the 111 upper bound is presumably
# implied by the data itself — confirm.
applicants_year_109_to_111_filter = df_applicants['year'].ge(109)
df_109_to_111_applicants = df_applicants.loc[applicants_year_109_to_111_filter]
df_109_to_111_applicants.shape

In [None]:
# Number of applicants per talent field within the filtered years.
df_109_to_111_applicants.talent_field.value_counts()

In [None]:
# Attach each applicant's talent_field to their tokenized comments.
# The cell reassigns its own input, so any talent_field column left over from a
# previous run is dropped first; otherwise a re-run would produce
# talent_field_x / talent_field_y and break the later groupby("talent_field").
# validate="many_to_one" makes the merge fail if (year, id, group) is not unique
# on the applicants side.
df_tokenized_comments = pd.merge(
    df_tokenized_comments.drop(columns=['talent_field'], errors='ignore'),
    df_applicants[['year', 'id', 'group', 'talent_field']],
    how='inner', on=['year', 'id', 'group'], validate="many_to_one"
)

In [None]:
# Boolean mask selecting tokenized comments from year 109 onward (lower bound only).
comments_year_109_to_111_filter = df_tokenized_comments['year'].ge(109)
df_109_to_111_tokenized_comments = df_tokenized_comments.loc[comments_year_109_to_111_filter]
df_109_to_111_tokenized_comments.shape

In [None]:
# Group the filtered tokenized comments by talent field; word counts are computed per group below.
g_109_to_111_tokenized_comments = df_109_to_111_tokenized_comments.groupby("talent_field")

### Calculate word count

In [None]:
from collections import Counter
from wordcloud import WordCloud

In [None]:
# Part-of-speech tags whose tokens are kept when counting words for the word clouds.
POS_FILTER = ['ENTITY', 'ACTION', 'MODIFIER', 'KNOWLEDGE', 'Verb']

In [None]:
# For every talent field, count the tokens whose POS tag is in POS_FILTER,
# skipping tokens flagged by PP.is_empty_sent.
field_word_count_dict = {}

for field, gdf in g_109_to_111_tokenized_comments:
    field_word_count_dict[field] = Counter(
        token
        # Each row holds parallel lists: the tokens and their POS tags.
        for token_list, pos_list in zip(gdf['ckip_comment_ws'], gdf['ckip_comment_pos'])
        for token, pos in zip(token_list, pos_list)
        if not PP.is_empty_sent(token) and pos in POS_FILTER
    )

### Generate word cloud from word count

In [None]:
# The output folder is the same for every field: build it once and make sure it
# exists, otherwise savefig fails with FileNotFoundError on a fresh checkout.
folder_name = os.path.join(P.FP_EDA_QUALITATIVE_DIR, 'comment_word_cloud_by_field')
os.makedirs(folder_name, exist_ok=True)

for field, word_count in field_word_count_dict.items():
    print(field)

    # WordCloud cannot be generated from an empty frequency table; skip such fields.
    if not word_count:
        print("No words left after filtering; skipping word cloud.")
        continue

    wordcloud = WordCloud(
        width=1600,
        height=1200,
        font_path=V.FONT_PATH
    ).generate_from_frequencies(word_count)

    fig, ax = plt.subplots(figsize=(20, 15))
    ax.imshow(wordcloud, interpolation="bilinear")
    ax.axis("off")

    ## save image
    fig_name = "{}_comment_word_cloud_from_109_to_111_.png".format(field)
    fig.savefig(os.path.join(folder_name, fig_name))

    # Display the figure, then release it so figures do not accumulate in the loop.
    plt.show()
    plt.close(fig)

## Draw grade distribution

In [None]:
def draw_grade_distribution(df, year, folder_name=None, fig_name=None):
    """Draw a bar chart of how many comments received each grade.

    Args:
        df: DataFrame with a 'grade' column.
        year: Year shown in the figure title.
        folder_name: Directory to save the figure into (optional).
        fig_name: File name of the saved figure (optional).

    The figure is saved only when both ``folder_name`` and ``fig_name`` are given.
    """
    labels = list(V.GRADE_SYMBOLS)

    # Reindex on the full grade list so a grade that never occurs in `df` is
    # drawn as 0 instead of raising KeyError; this also fixes the bar order.
    values = df['grade'].value_counts().reindex(labels, fill_value=0).tolist()

    x = np.arange(len(labels))  # the label locations
    width = 0.7  # the width of the bars

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.bar(x, values, width)

    # Add some text for labels, title and custom x-axis tick labels, etc.
    ax.set_xlabel('Grade')
    ax.set_ylabel('Comment Count')
    ax.set_title('Year {} Grade Distribution'.format(year))
    ax.set_xticks(x)
    ax.set_xticklabels(labels)
    fig.tight_layout()

    if folder_name and fig_name:
        plt.savefig(os.path.join(folder_name, fig_name))

In [None]:
# Group the comments by year. This name was used by the per-year loops but never
# defined, so "Restart & Run All" failed here with NameError.
# NOTE(review): assumes df_comments has a 'year' column — confirm.
g_comments_year = df_comments.groupby('year')

for year, yg in g_comments_year:
    fig_name = '{}_grade_dist.png'.format(year)

    draw_grade_distribution(
        yg,
        year,
        folder_name=os.path.join(P.FP_EDA_QUANTITATIVE_DIR, 'grade_distribution'),
        fig_name=fig_name
    )

## Standard deviation within the same group of students in the same year

#### Experiment 1
- Control Variables: year, group
- Independent Variables: quartile
- Dependent Variables: average standard deviation of the scores each applicant received

In [None]:
def calculate_groups_avg_score_std_by_quartile(df):
    """Average per-applicant score standard deviation for each quartile of each group.

    Within every group, applicants (rows sharing an 'id') are split into four
    quartiles by their mean score; the standard deviations of their scores are
    then averaged per quartile.

    Args:
        df: DataFrame with 'group', 'id' and 'score' columns.

    Returns:
        2d array with shape (num_groups, 4); row order follows the sorted group
        keys and columns are Q1..Q4. A quartile without any usable standard
        deviation is NaN.
    """
    quartile_labels = ['Q1', 'Q2', 'Q3', 'Q4']
    groups_avg_score_std_by_quartile = []

    for _, group in df.groupby('group'):
        id_scores = group.groupby('id')['score']
        score_mean = id_scores.mean()
        score_std = id_scores.std()

        # Bin edges: 0, the three quartile cut points of the mean score, +inf.
        bin_edges = [0] + score_mean.quantile([.25, .5, .75]).tolist() + [float('inf')]

        # include_lowest=True keeps applicants whose mean score equals the lowest
        # edge (0); with the default they fall outside every bin and are dropped.
        quartile_group = pd.cut(
            score_mean,
            bins=bin_edges,
            labels=quartile_labels,
            include_lowest=True
        )

        # observed=False always yields all four quartiles (empty ones as NaN), so
        # every row has length 4 regardless of the pandas default for `observed`.
        group_avg_score_std = score_std.groupby(quartile_group, observed=False).mean().tolist()
        groups_avg_score_std_by_quartile.append(group_avg_score_std)

    return np.array(groups_avg_score_std_by_quartile)

In [None]:
def draw_avg_score_std_by_quartile(df, year, folder_name=None, fig_name=None):
    """Grouped bar chart of the average score std: one cluster per quartile, one bar per group.

    Args:
        df: Comments of a single year, passed on to
            calculate_groups_avg_score_std_by_quartile.
        year: Year used in the title and to look up V.GROUP_LABELS.
        folder_name: Directory to save the figure into (optional).
        fig_name: File name of the saved figure (optional).

    The figure is saved only when both ``folder_name`` and ``fig_name`` are given.
    """
    std_by_group = calculate_groups_avg_score_std_by_quartile(df)

    # x axis: one cluster per quartile, spaced two units apart
    quartile_labels = ['Q1', 'Q2', 'Q3', 'Q4']
    tick_positions = np.arange(len(quartile_labels)) * 2

    # legend entries: one per group
    group_labels = V.GROUP_LABELS[year]

    bar_width = 0.3
    # Offset of the first bar so that the whole cluster is centred on its tick.
    offset = (-len(group_labels) + 1) * bar_width / 2

    fig, ax = plt.subplots(figsize=(8, 6))

    for label_idx, quartile_values in enumerate(std_by_group):
        ax.bar(tick_positions + offset, quartile_values, bar_width, label=group_labels[label_idx])
        offset = offset + bar_width

    ax.set_xlabel('Quartile')
    ax.set_ylabel('Average Standard Deviation w.r.t Score')
    ax.set_title('Year {} Average Standard Deviation w.r.t Score in each quartile'.format(year))
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(quartile_labels)
    ax.set_ylim([0, 10])
    ax.legend()
    fig.tight_layout()

    if folder_name and fig_name:
        plt.savefig(os.path.join(folder_name, fig_name))

In [None]:
# One figure per year, saved as '<year>_avg_score_std_by_quartile.png'.
for year, yg in g_comments_year:
    draw_avg_score_std_by_quartile(
        yg,
        year,
        folder_name=os.path.join(P.FP_EDA_QUANTITATIVE_DIR, 'avg_score_std_by_quartile'),
        fig_name='{}_avg_score_std_by_quartile.png'.format(year),
    )

#### Experiment 2
- Control Variables: year, quartile
- Independent Variables: group
- Dependent Variables: average standard deviation of the scores each applicant received

In [None]:
def draw_avg_score_std_by_group(df, year, folder_name=None, fig_name=None):
    """Grouped bar chart of the average score std: one cluster per group, one bar per quartile.

    Args:
        df: Comments of a single year, passed on to
            calculate_groups_avg_score_std_by_quartile.
        year: Year used in the title and to look up V.GROUP_LABELS.
        folder_name: Directory to save the figure into (optional).
        fig_name: File name of the saved figure (optional).

    The figure is saved only when both ``folder_name`` and ``fig_name`` are given.
    """
    # Transpose (num_groups, 4) -> (4, num_groups): each row is now one quartile.
    std_by_quartile = np.transpose(calculate_groups_avg_score_std_by_quartile(df))

    # x axis: one cluster per group, spaced two units apart
    group_labels = V.GROUP_LABELS[year]
    tick_positions = np.arange(len(group_labels)) * 2

    # legend entries: one per quartile
    quartile_labels = ['Q1', 'Q2', 'Q3', 'Q4']

    bar_width = 0.45
    # Offset of the first bar so that the whole cluster is centred on its tick.
    offset = (-len(quartile_labels) + 1) * bar_width / 2

    fig, ax = plt.subplots(figsize=(8, 6))

    for label_idx, group_values in enumerate(std_by_quartile):
        ax.bar(tick_positions + offset, group_values, bar_width, label=quartile_labels[label_idx])
        offset = offset + bar_width

    ax.set_xlabel('Group')
    ax.set_ylabel('Average Standard Deviation w.r.t Score')
    ax.set_title('Year {} Average Standard Deviation w.r.t Score in each group'.format(year))
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(group_labels)
    ax.set_ylim([0, 10])
    ax.legend()
    fig.tight_layout()

    if folder_name and fig_name:
        plt.savefig(os.path.join(folder_name, fig_name))

In [None]:
# One figure per year, saved as '<year>_avg_score_std_by_group.png'.
for year, yg in g_comments_year:
    draw_avg_score_std_by_group(
        yg,
        year,
        folder_name=os.path.join(P.FP_EDA_QUANTITATIVE_DIR, 'avg_score_std_by_group'),
        fig_name='{}_avg_score_std_by_group.png'.format(year),
    )

#### Experiment 3
- Control Variables: 
- Independent Variables: committee's scoring
- Dependent Variables: L2 norm of the difference between a committee member's scores and the applicants' mean scores

In [None]:
def _fill_missing_score(row):
    """Return the row's score, or the V.NULL_GRADE_FILL value for its grade when the score is NaN."""
    if np.isnan(row['score']):
        return V.NULL_GRADE_FILL[row['grade']]
    return row['score']


# Replace missing scores row by row with the fill value that belongs to the row's grade.
df_comments['score'] = df_comments.apply(_fill_missing_score, axis=1)

In [None]:
def calculate_committee_scoring_difference(df):
    """Measure how far each committee member's scores are from the applicants' mean scores.

    For every group, the mean score of each applicant ('id') is computed over
    all rows of that group. For every committee member, the L2 norm of
    (member's score - applicant's mean score) over the rows that member scored
    is reported.

    Args:
        df: DataFrame with 'group', 'id', 'committee_number' and 'score' columns.

    Returns:
        A list with one entry per group (sorted group keys); each entry is a
        list with one norm per committee member (sorted committee numbers).
    """
    groups_committee_scoring_diff = []

    for _, group in df.groupby('group'):
        # Mean score per applicant; it does not depend on the committee member,
        # so compute it once per group.
        score_mean = group.groupby('id')['score'].mean()

        group_committee_scoring_diff = []
        for _, committee_group in group.groupby('committee_number'):
            score_committee = committee_group['score'].to_numpy()
            # Look the mean up by applicant id for each row. Subtracting the
            # id-sorted means positionally mis-pairs scores whenever the rows
            # are not sorted by id or a member did not score every applicant.
            score_expected = score_mean.reindex(committee_group['id'].to_numpy()).to_numpy()

            committee_scoring_diff = np.linalg.norm(score_committee - score_expected)
            group_committee_scoring_diff.append(committee_scoring_diff)

        groups_committee_scoring_diff.append(group_committee_scoring_diff)

    return groups_committee_scoring_diff

In [None]:
def draw_committee_scoring_difference(df, year, folder_name=None, fig_name=None):
    """Draw one bar chart per group showing each committee member's scoring difference.

    Args:
        df: Comments of a single year, passed on to
            calculate_committee_scoring_difference.
        year: Year used in the titles and to look up V.GROUP_LABELS.
        folder_name: Directory to save the figures into (optional).
        fig_name: File name template with two '{}' placeholders, filled with
            the year and the group label (optional).

    Figures are saved only when both ``folder_name`` and ``fig_name`` are given.
    """
    per_group_differences = calculate_committee_scoring_difference(df)
    group_labels = V.GROUP_LABELS[year]
    bar_width = 0.45

    for group_idx, differences in enumerate(per_group_differences):
        # Bars are placed at 1..n, one per committee member in the group.
        committee_positions = np.arange(len(differences)) + 1

        fig, ax = plt.subplots(figsize=(8, 6))
        ax.bar(committee_positions, differences, bar_width)

        ax.set_xlabel('Committee Member Number')
        ax.set_ylabel('Scoring Difference')
        ax.set_title(
            'Year {} Group {} Committee Scoring Difference'.format(year, group_labels[group_idx])
        )
        ax.set_xticks(committee_positions)
        fig.tight_layout()

        if folder_name and fig_name:
            target_name = fig_name.format(year, group_labels[group_idx])
            plt.savefig(os.path.join(folder_name, target_name))

In [None]:
# Only years from 111 onward are plotted; the file name template is filled in
# with the year and group label inside draw_committee_scoring_difference.
for year, yg in g_comments_year:
    if year >= 111:
        draw_committee_scoring_difference(
            yg,
            year,
            folder_name=os.path.join(P.FP_EDA_QUANTITATIVE_DIR, 'committee_scoring_difference'),
            fig_name='{}_{}_committee_scoring_difference.png',
        )

#### Experiment 4
- Control Variables:
- Independent Variables:
- Dependent Variables:

In [None]:
def calculate_groups_score_std(df):
    """Per-applicant score standard deviation, computed separately for every group.

    Args:
        df: DataFrame with 'group', 'id' and 'score' columns.

    Returns:
        A list with one Series per group (sorted group keys). Each Series is
        indexed by 'id', named "std", and holds the standard deviation of that
        applicant's scores.
    """
    return [
        group.groupby('id')['score'].std().rename("std")
        for _, group in df.groupby('group')
    ]

In [None]:
def draw_groups_score_std_bin(df, year, folder_name=None, fig_name=None):
    """Draw one histogram per group of the per-applicant score standard deviation.

    Args:
        df: Comments of a single year, passed on to calculate_groups_score_std.
        year: Year used in the titles and to look up V.GROUP_LABELS.
        folder_name: Directory to save the figure into (optional).
        fig_name: File name of the saved figure (optional).

    The figure is saved only when both ``folder_name`` and ``fig_name`` are given.
    """
    groups_score_std = calculate_groups_score_std(df)
    group_labels = V.GROUP_LABELS[year]

    # Three histograms per row. Keep the original 2x3 layout for up to six
    # groups and add rows beyond that instead of failing with IndexError.
    cols = 3
    rows = max(2, math.ceil(len(groups_score_std) / cols))
    fig, axes = plt.subplots(nrows=rows, ncols=cols, figsize=(18, 6 * rows))

    # Fixed bin edges so the histograms are comparable across groups.
    bins = [0, 2, 4, 6, 8, 10, 12]

    for i, group_score_std in enumerate(groups_score_std):
        ax = axes[divmod(i, cols)]

        # Applicants with a single score have a NaN std; drop them explicitly.
        ax.hist(group_score_std.dropna(), bins)

        ax.set_xlabel('Score Standard Deviation')
        ax.set_ylabel('Bin Count')
        # NOTE(review): the title says "Committee Scoring Difference" although the
        # plot shows the score standard deviation — confirm the intended wording.
        ax.set_title('Year {} Group {} Committee Scoring Difference'.format(year, group_labels[i]))

    if folder_name and fig_name:
        plt.savefig(os.path.join(folder_name, fig_name))

In [None]:
# One figure per year, saved as '<year>_groups_score_std_bin.png'.
for year, yg in g_comments_year:
    draw_groups_score_std_bin(
        yg,
        year,
        folder_name=os.path.join(P.FP_EDA_QUANTITATIVE_DIR, 'groups_score_std_bin'),
        fig_name='{}_groups_score_std_bin.png'.format(year),
    )

## Draw distribution of the grade difference

In [None]:
def calculate_grade_diff(grade, original_grade):
    """Return the numeric bin of ``original_grade`` minus the numeric bin of ``grade``.

    Both grades are looked up in V.GRADE_SYMBOLS_NUM_BIN.
    """
    grade_bins = V.GRADE_SYMBOLS_NUM_BIN
    original_bin = grade_bins[original_grade]
    current_bin = grade_bins[grade]
    return original_bin - current_bin

In [None]:
# Row-wise difference between the original grade and the (final) grade, as numeric bins.
df_comments['grade_diff'] = df_comments.apply(
    lambda row: calculate_grade_diff(grade=row['grade'], original_grade=row['original_grade']),
    axis=1,
)

In [None]:
## exclude nan
# value_counts() skips NaN, so the percentages are relative to the valid differences only.
grade_diff_distribution = df_comments['grade_diff'].value_counts().sort_index()
_COUNT = grade_diff_distribution.sum()
grade_diff_distribution_persent = grade_diff_distribution.div(_COUNT).mul(100)
grade_diff_distribution_persent

In [None]:
## include nan
# Add the number of missing grade differences as an extra 'nan' bucket.
grade_diff_distribution['nan'] = df_comments.grade_diff.isna().sum()
# The original divided by an undefined name (COUNT), raising NameError. Divide by
# the total including the 'nan' bucket so that the percentages sum to 100.
grade_diff_distribution_persent = grade_diff_distribution / grade_diff_distribution.sum() * 100
grade_diff_distribution_persent