# README

### Purpose of this notebook
- Create a dataframe of the committee members' grades and comments.

### Steps
1. Concatenate all raw csv files into one csv file.
2. Extract columns and create dataframe from the merged data sheet.
3. Run a simple EDA and check the dataframe for null values.
    - Convert 6 level grade (ABCDEF) to 4 level grade (ABCF)
    - Handle outliers

To preprocess the comments, go to the `comment_preprocess_split_sentences` and `comment_preprocess_tokenization` notebooks.

# Import Library

In [None]:
import pandas as pd
import numpy as np

from importlib import reload
from pprint import pprint

# Utility variable
import sys
sys.path.insert(0, '../..')

# var
import var.path as P
import var.var as V

# utils
import utils.data as D

## Merge data from different years

In [None]:
# Read every CSV listed in P.FP_COMMENT_CSV and stack them into a single frame.
csvs = []
skipped_files = []  # (path, error) pairs for files that could not be read

for fp in P.FP_COMMENT_CSV:
    try:
        csvs.append(pd.read_csv(fp))
    except Exception as err:
        # Best-effort merge: an unreadable file is skipped, but it is reported so
        # that its data cannot go missing unnoticed. `Exception` (rather than a
        # bare `except`) lets KeyboardInterrupt / SystemExit propagate.
        skipped_files.append((fp, err))
        print(f"[skipped] {fp}: {type(err).__name__}: {err}")

if not csvs:
    # pd.concat would raise a ValueError here as well; this message names the cause.
    raise ValueError("None of the files in P.FP_COMMENT_CSV could be read.")

df = pd.concat(csvs)

In [None]:
# (rows, columns) of the concatenated frame.
df.shape

In [None]:
# Column labels of the concatenated frame.
df.columns

In [None]:
# Persist the merged sheet; the row index is not written to the file.
df.to_csv(P.FP_ALL_COMMENT_CSV, index=False)

# Read all data and preprocess

### Utilities

In [None]:
# Reload the merged sheet from disk, replacing the in-memory `df`, and show its shape.
df = pd.read_csv(P.FP_ALL_COMMENT_CSV)
df.shape

In [None]:
# For each column of the merged sheet, print how many cells are null (True)
# versus non-null (False), followed by a separator line.
separator = '-' * 50
for column_name in df.columns:
    null_flags = df[column_name].isna()
    print(null_flags.value_counts())
    print(separator)

In [None]:
# Column labels of the reloaded sheet.
df.columns

In [None]:
# Names of the columns in `df` that are copied as-is into every output row.
# (Actual values are redacted in this copy of the notebook.)
col_year = "# The content is removed due to confidential concerns."
col_id = "# The content is removed due to confidential concerns."
col_group = "# The content is removed due to confidential concerns."

# Templates for the per-committee-member column names. Each one is expanded with
# `.format(member_num)` when the rows are built, so it is expected to contain a
# positional `{}` placeholder for the member number.
fstr_committee_member_grade = "# The content is removed due to confidential concerns."
fstr_committee_member_original_grade = "# The content is removed due to confidential concerns."
fstr_committee_member_score = "# The content is removed due to confidential concerns."
fstr_committee_member_original_score = "# The content is removed due to confidential concerns."
fstr_committee_member_comment = "# The content is removed due to confidential concerns."

## Create dataframe with committee member's grade and comment

In [None]:
# Turn every row of `df` into up to V.MAX_NUM_COMMITTEE_MEMBER output rows, one per
# committee member number that has a grade recorded.
#
# The rows are collected as a list of dicts and the dataframe is created once at the
# end. Never append rows to a dataframe inside the loop: each append would create a
# brand-new dataframe, which is a common cause of poor performance.

# The per-member column names depend only on the member number, so they are resolved
# once here instead of being re-formatted for every row of `df`.
member_columns = {
    member_num: {
        'grade': fstr_committee_member_grade.format(member_num),
        'original_grade': fstr_committee_member_original_grade.format(member_num),
        'score': fstr_committee_member_score.format(member_num),
        'original_score': fstr_committee_member_original_score.format(member_num),
        'comment': fstr_committee_member_comment.format(member_num),
    }
    for member_num in range(1, V.MAX_NUM_COMMITTEE_MEMBER + 1)
}

# Output column order; also keeps the columns present if no row qualifies.
comment_columns = [
    'year', 'id', 'group', 'committee_number',
    'grade', 'original_grade', 'score', 'original_score', 'comment',
]

df_grade_comment_data_list = []

for _, row in df.iterrows():
    for member_num, cols in member_columns.items():
        grade = row[cols['grade']]

        # A member slot without a grade produces no output row.
        if pd.isna(grade):
            continue

        df_grade_comment_data_list.append({
            'year': row[col_year],
            'id': row[col_id],
            'group': row[col_group],
            'committee_number': member_num,
            'grade': grade,
            'original_grade': row[cols['original_grade']],
            'score': row[cols['score']],
            'original_score': row[cols['original_score']],
            'comment': row[cols['comment']],
        })

df_comments = pd.DataFrame(df_grade_comment_data_list, columns=comment_columns)

In [None]:
# Replace missing comments with 0 and store ids as 64-bit integers.
# Assign the results back to the columns explicitly: an in-place `fillna` on a
# column selected from the frame (`df_comments.comment.fillna(..., inplace=True)`)
# acts on an intermediate object and does not update `df_comments` when pandas
# Copy-on-Write is active.
df_comments['comment'] = df_comments['comment'].fillna(0)
df_comments['id'] = df_comments['id'].astype('int64')

In [None]:
# Preview the first rows of the reshaped frame.
df_comments.head()

### Check for null value in each column

In [None]:
# For each column of df_comments, print how many cells are null (True)
# versus non-null (False), followed by a separator line.
divider = '-' * 50
for field in df_comments.columns:
    is_null = df_comments[field].isna()
    print(is_null.value_counts())
    print(divider)

### Deal with group outliers

In [None]:
# Number of rows per group label, before the outlier labels are remapped.
df_comments.group.value_counts()

In [None]:
# Group labels to inspect as outliers (actual values are redacted in this copy).
group_outliers = ["# The content is removed due to confidential concerns."]

In [None]:
# Print the rows of df_comments that carry each outlier group label.
for outlier_label in group_outliers:
    has_label = df_comments['group'] == outlier_label
    pprint(df_comments.loc[has_label])
    print("---------------")

In [None]:
# Outlier group labels and their replacements (actual values are redacted in this copy).
# `group_outliers` is also assigned in an earlier cell; this assignment overrides it.
group_outliers = ["# The content is removed due to confidential concerns."]
# NOTE(review): the remapping cell indexes `group_outlier_dict[g]`, so this must be a
# dict with one entry per label in `group_outliers`. As redacted here, the braces
# around a single string form a set literal, which is not subscriptable.
group_outlier_dict = {
    "# The content is removed due to confidential concerns."
}

In [None]:
# Labels listed in group_outliers are swapped for their entry in group_outlier_dict;
# every other label is kept unchanged.
df_comments['group'] = df_comments['group'].map(
    lambda label: group_outlier_dict[label] if label in group_outliers else label
)

### Count the number of students in each group

The content is removed due to confidential concerns.

In [None]:
# Number of rows per group label, after the outlier labels were remapped.
df_comments.group.value_counts()

### Merge the two different grading systems
The content is removed due to confidential concerns.

In [None]:
# Lookup from a grade in the older system to its grade in the newer system
# (actual entries are redacted in this copy).
# NOTE(review): map_grade_systems indexes `grading_system_mapping[_grade]`, so this
# must be a dict. As redacted here, the braces around a single string form a set literal.
grading_system_mapping = {
    "# The content is removed due to confidential concerns."
}

In [None]:
def map_grade_systems(row, column, mapping=None, last_legacy_year=108):
    """Translate a grade from the older grading system into the newer one.

    Parameters
    ----------
    row : pandas.Series or dict
        Record providing ``row['year']`` and ``row[column]``.
    column : str
        Key of the grade value to translate.
    mapping : dict, optional
        Old-grade -> new-grade lookup. Defaults to the notebook-level
        ``grading_system_mapping``.
    last_legacy_year : int, optional
        Only rows whose year is less than or equal to this value are translated
        (default 108).

    Returns
    -------
    The mapped grade when the row is from a legacy year and the grade is present in
    ``mapping``; otherwise the grade exactly as stored in the row (including NaN).
    """
    grade = row[column]

    # Rows after the legacy period already use the newer system.
    if not (row['year'] <= last_legacy_year):
        return grade

    if mapping is None:
        mapping = grading_system_mapping

    try:
        return mapping[grade]
    except (KeyError, TypeError):
        # Grade has no entry (e.g. NaN) or cannot be looked up: keep it unchanged.
        # Only lookup failures are handled, so unrelated errors are not hidden.
        return grade

In [None]:
# Run map_grade_systems row by row over both grade columns, overwriting each in turn.
for grade_column in ('grade', 'original_grade'):
    df_comments[grade_column] = df_comments.apply(
        map_grade_systems, axis=1, args=(grade_column,)
    )

In [None]:
# Distribution of grades after the grade mapping was applied.
df_comments['grade'].value_counts()

In [None]:
# Distribution of original grades after the grade mapping was applied.
df_comments['original_grade'].value_counts()

### Fill in null scores based on the grade

In [None]:
# Rows without a score receive the value that V.NULL_GRADE_FILL holds for their grade.
# `isna` is used instead of `np.isnan` so a non-numeric score value does not raise a
# TypeError, and the fill is done on the affected rows only rather than via a
# row-by-row `apply` over the whole frame.
score_missing = df_comments['score'].isna()
df_comments.loc[score_missing, 'score'] = (
    df_comments.loc[score_missing, 'grade'].map(V.NULL_GRADE_FILL)
)

# Stay fail-loud: a grade without an entry in V.NULL_GRADE_FILL is an error rather
# than a score that silently remains null.
still_missing = df_comments['score'].isna()
if still_missing.any():
    raise KeyError(
        "No V.NULL_GRADE_FILL entry for grade(s): "
        f"{sorted(df_comments.loc[still_missing, 'grade'].astype(str).unique())}"
    )

### Sort the dataframe and write the file

In [None]:
# Order rows by year, then id, then committee member number, and renumber the index
# from 0 (the old index is dropped). The last line previews the result.
sort_keys = ['year', 'id', 'committee_number']
df_comments = df_comments.sort_values(by=sort_keys).reset_index(drop=True)
df_comments.head()

In [None]:
# (rows, columns) of the final frame.
df_comments.shape

In [None]:
# Save the final frame through the project helper, once per output format.
for output_format in ('csv', 'pkl'):
    D.write_df_comments(df_comments, file=output_format)