# README

### Purpose of this notebook
- Create dataframe for the recommendation letters.

### Steps
1. Concatenate all csv files into one csv file.
2. Extract columns and create dataframe from the merged csv file.
3. Run a simple EDA and check the dataframe for null values.

To preprocess the recommendation letters, see the `recommendation_letter_preprocess` notebook.

# Import Library

In [None]:
import pandas as pd
from importlib import reload
from collections import defaultdict

# Utility variable
import sys
sys.path.insert(0, '../..')

## var
import var.path as P
import var.var as V

## utils
import utils.data as D

## Read Latest Dataframe

## Merge data from different years

In [None]:
# Merge the per-year recommendation-letter CSV files into one dataframe.
# The first three characters of each file name are parsed as an integer and
# stored in the '年份' column so every row keeps track of its source file.
csvs = []

for fp in P.FP_RECOMMENDATION_LETTER_CSV:
    try:
        year = int(fp.split('/')[-1][:3])
        csv = pd.read_csv(fp)
    except (OSError, ValueError) as err:
        # Best effort: a missing/unreadable file (OSError) or a file whose
        # name prefix or content cannot be parsed (ValueError, which also
        # covers pandas' EmptyDataError/ParserError and decode errors) is
        # skipped, but reported so the gap in the merged data is visible.
        print(f'Skipped {fp}: {err!r}')
        continue

    csv['年份'] = year
    csvs.append(csv)

if not csvs:
    # pd.concat([]) would raise a generic ValueError; fail with the cause.
    raise ValueError('No recommendation letter CSV could be loaded.')

df = pd.concat(csvs)

In [None]:
# Display the (rows, columns) size of the merged dataframe.
df.shape

In [None]:
# Display the column labels of the merged dataframe as a plain list.
df.columns.to_list()

### Check for duplicates

In [None]:
"# The content is removed due to confidential concerns."

# Read raw data and preprocess

### Utilities

In [None]:
# Load the CSV at P.FP_ALL_RECOMMENDATION_LETTER_CSV into `df`, replacing any
# earlier `df`, then display its (rows, columns) size.
# NOTE(review): presumably this is the merged file produced from the per-year
# CSVs above — confirm against var.path.
df = pd.read_csv(P.FP_ALL_RECOMMENDATION_LETTER_CSV)
df.shape

In [None]:
# Per-column null report for the raw dataframe: for each column, print how
# many entries are null (True) versus present (False), then a divider line.
divider = '-' * 50
for column_name in df.columns:
    null_flags = df[column_name].isna()
    print(null_flags.value_counts())
    print(divider)

In [None]:
# Display the column labels of the loaded dataframe as a plain list.
df.columns.to_list()

## Create recommendation letter dataframe

In [None]:
# Labels of the source columns read from `df` when building the
# recommendation letter dataframe. The actual header strings are redacted
# in this copy of the notebook.

# Columns that together identify whose letter a row is (used as a pair key).
col_year = "# The content is removed due to confidential concerns."
col_id = "# The content is removed due to confidential concerns."

# Columns whose values are copied into the output dataframe unchanged.
col_talent_or_achievement = "# The content is removed due to confidential concerns."
col_talent_or_achievement_reason = "# The content is removed due to confidential concerns."
col_learning_attitude = "# The content is removed due to confidential concerns."
col_strength = "# The content is removed due to confidential concerns."
col_weakness = "# The content is removed due to confidential concerns."
col_recommend_reason = "# The content is removed due to confidential concerns."

In [None]:
# Collect one plain dict per source row and build the dataframe once at the
# end. Appending rows to a dataframe inside the loop would create a new
# dataframe on every iteration, a common cause of poor performance.
df_recommendation_letter_list = []

# Running count of rows seen so far for each (year, id) pair; the count at
# the time a row is visited becomes that row's 'letter_num'.
recommendation_letter_counter = defaultdict(int)

# Output field -> source column for values copied over unchanged, listed in
# the order the columns should appear in the result.
passthrough_columns = {
    'talent_or_achievement': col_talent_or_achievement,
    'talent_or_achievement_reason': col_talent_or_achievement_reason,
    'learning_attitude': col_learning_attitude,
    'strength': col_strength,
    'weakness': col_weakness,
    'recommend_reason': col_recommend_reason,
}

for _, letter in df.iterrows():
    letter_key = (letter[col_year], letter[col_id])
    recommendation_letter_counter[letter_key] += 1

    record = {
        'year': letter_key[0],
        'id': letter_key[1],
        'letter_num': recommendation_letter_counter[letter_key],
    }
    for field, source_column in passthrough_columns.items():
        record[field] = letter[source_column]

    df_recommendation_letter_list.append(record)

df_recommendation_letters = pd.DataFrame(df_recommendation_letter_list)

In [None]:
# Display the assembled recommendation letter dataframe.
df_recommendation_letters

### Check for null values in each column

In [None]:
# Per-column null report: for each column, print how many entries are null
# (True) versus present (False), followed by a divider line.
divider = '-' * 50
for column_name in df_recommendation_letters.columns:
    null_flags = df_recommendation_letters[column_name].isna()
    print(null_flags.value_counts())
    print(divider)

In [None]:
# Replace every null value with an empty string, modifying the dataframe in
# place (the call itself returns None).
# NOTE(review): this applies to all columns, including 'year' and 'id';
# confirm those never contain nulls, otherwise they end up with mixed types.
df_recommendation_letters.fillna("", inplace=True)

In [None]:
# Order the rows by year, then id, then letter number, modifying the
# dataframe in place. The existing index labels are kept (no reset).
df_recommendation_letters.sort_values(by=['year', 'id', 'letter_num'], inplace=True)

In [None]:
# Re-run the per-column null report to verify the result of the fill step:
# each column prints its null (True) / present (False) counts and a divider.
divider = '-' * 50
for column_name in df_recommendation_letters.columns:
    null_flags = df_recommendation_letters[column_name].isna()
    print(null_flags.value_counts())
    print(divider)

In [None]:
# Display how many rows fall under each distinct 'year' value.
df_recommendation_letters['year'].value_counts()

In [None]:
# Display how many rows carry each 'letter_num' value, i.e. how often a
# first, second, ... row occurs for the same (year, id) pair.
df_recommendation_letters['letter_num'].value_counts()

In [None]:
# Hand the final dataframe to the project writer helper twice, once with
# file='csv' and once with file='pkl'.
# NOTE(review): presumably `file` selects the output format and the helper
# decides the destination path — confirm in utils.data.
D.write_df_recommendation_letters(df_recommendation_letters, file='csv')
D.write_df_recommendation_letters(df_recommendation_letters, file='pkl')