# Feature Engineering : Condensed Features

### Dependencies

In [18]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import datetime
from utils import *

### Load Data

In [76]:
# Training input is tab-separated, hence the explicit separator.
train_filepath = "data/agg_train.csv"
train_df = pd.read_csv(train_filepath, sep="\t")

  


In [77]:
# Test input is tab-separated, hence the explicit separator.
test_filepath = "data/test.csv"
test_df = pd.read_csv(test_filepath, sep="\t")

  


## Temporal Features

### generate date for each row

In [11]:
# convert a timestamp string into a datetime
def date_trans(str_time):
    """Parse a timestamp string into a ``datetime.datetime``.

    Parameters
    ----------
    str_time : str or any
        Timestamp text laid out as ``YYYY-MM-DD HH:MM:SS.0``. Any value that
        is not a string (for example a float NaN) is treated as missing.

    Returns
    -------
    datetime.datetime
        The parsed timestamp, or the placeholder 2000-01-01 00:00:00 when
        ``str_time`` is not a string.

    Raises
    ------
    ValueError
        If ``str_time`` is a string that does not match the expected layout.
    """
    # isinstance (not an exact type comparison) so that str subclasses are
    # parsed instead of being silently mapped to the placeholder date.
    if not isinstance(str_time, str):
        return datetime.datetime(2000, 1, 1)
    # Local name chosen so the built-in ``format`` is not shadowed.
    time_layout = "%Y-%m-%d %H:%M:%S.0"
    return datetime.datetime.strptime(str_time, time_layout)

# create series of pandas date and time
def trans_date_time_series(data):
    """Build date and datetime indexes from the "Step Start Time" column.

    Every value of ``data["Step Start Time"]`` is passed through
    ``date_trans``; the results are returned both as full timestamps and
    reduced to their calendar date.

    Returns
    -------
    tuple
        ``(date_s, datetime_s)`` - the ``pd.to_datetime`` conversion of the
        per-row dates and of the per-row timestamps, in that order.
    """
    raw_times = list(data["Step Start Time"])
    parsed = []
    days = []
    for raw in raw_times:
        moment = date_trans(raw)
        parsed.append(moment)
        days.append(moment.date())
    datetime_s = pd.to_datetime(parsed)
    date_s = pd.to_datetime(days)
    return (date_s, datetime_s)
  

### helper functions

In [12]:
# to wash Knowledge Component
def wash_KC(x):
    """Return ``x`` unchanged when it is exactly a ``str``, otherwise 'Null'."""
    return x if type(x) == str else 'Null'


def split_unit(row):
    """Return the text before the first comma of "Problem Hierarchy", stripped."""
    hierarchy = row["Problem Hierarchy"]
    leading_part, _, _ = hierarchy.partition(',')
    return leading_part.strip()


### find related Knowledge Component in History

In [159]:
# generate a dataframe for the relations of Sid, KC and Time
def gen_history_df(data):
    """Count rows per (student, KC, date).

    Side effect: adds the columns 'Date and Time' and 'Date' to ``data``
    in place, filled from ``trans_date_time_series``.

    Returns
    -------
    pandas.Series
        Group sizes indexed by 'Anon Student Id', 'KC(Default)' and 'Date'.
    """
    days, stamps = trans_date_time_series(data)
    data['Date and Time'] = stamps
    data['Date'] = days

    group_keys = ['Anon Student Id', 'KC(Default)', 'Date']
    return data.groupby(group_keys).size()
    


def get_history_range(row,df,rg):
    """Sum a student's per-day counts for one KC over a trailing window.

    Parameters
    ----------
    row : mapping
        Must provide "Anon Student Id", "KC(Default)" and "Date" (a value
        ``pd.Timestamp`` can interpret).
    df : pandas.Series
        Counts indexed by (student id, KC, date) with datetime-valued dates,
        as built by ``gen_history_df``.
    rg : str or timedelta-like
        Window length understood by ``pd.Timedelta``. "0 days" covers only
        the row's own day; "7 days" covers that day and the 7 days before.

    Returns
    -------
    int
        Sum of the counts whose date falls inside the window, or 0 when the
        student/KC pair has no entry in ``df``.
    """
    Sid = row["Anon Student Id"]
    KC = row["KC(Default)"]
    # Work with midnight Timestamps so comparisons match the datetime index.
    last_day = pd.Timestamp(row["Date"]).normalize()
    first_day = last_day - pd.Timedelta(rg)
    try:
        per_day = df.loc[Sid].loc[KC]
    except KeyError:
        return 0
    # Select by range instead of by an explicit list of days: a list lookup
    # raises KeyError as soon as one day of the window has no record, which
    # would turn almost every multi-day window into 0.
    days = per_day.index
    in_window = (days >= first_day) & (days <= last_day)
    return per_day[in_window].sum()
    
# allocate temporal features to test data
def find_temporal(row,train_df,kc_his):
    """Look up a history feature for a test row from the training frame.

    Parameters
    ----------
    row : mapping
        Must provide 'Unit' and 'Anon Student Id'.
    train_df : pandas.DataFrame
        Training rows with 'Unit', 'Anon Student Id' and the ``kc_his`` column.
    kc_his : str
        Name of the column to read.

    Returns
    -------
    The ``kc_his`` value of the last training row (in frame order) sharing
    the row's unit and student, or 0 when there is no such row.
    """
    # Single combined mask; no per-row debug output, since this runs once for
    # every row under DataFrame.apply.
    same_unit_and_student = (
        (train_df['Unit'] == row['Unit'])
        & (train_df['Anon Student Id'] == row['Anon Student Id'])
    )
    matches = train_df.loc[same_unit_and_student]
    if len(matches) == 0:
        return 0
    return matches.iloc[-1][kc_his]


# Add temporal features to train    
def train_temp(train):
    """Add the three KC-history columns to the training frame.

    ``train`` is modified in place (and also returned). Columns added:

    * 'KC History Today'     - count over the "0 days" window.
    * 'KC History Yesterday' - count over the "1 days" window minus today's.
    * 'KC History Week'      - count over the "7 days" window minus today's.

    Counts come from ``get_history_range`` applied row by row against the
    lookup table built by ``gen_history_df``.
    """
    # Replace non-string KC values with 'Null' (see wash_KC) before grouping.
    train['KC(Default)'] = train['KC(Default)'].apply(wash_KC)
    # Lookup table of counts per (student, KC, date).
    historytable = gen_history_df(train)

    def _window_count(window):
        # Per-row count of the row's student/KC within the given window.
        return train.apply(get_history_range, args=(historytable, window), axis=1)

    train['KC History Today'] = _window_count("0 days")
    train['KC History Yesterday'] = _window_count("1 days") - train['KC History Today']
    train['KC History Week'] = _window_count("7 days") - train['KC History Today']

    return train

# Add temporal features to test data
def test_temp(train,test):
    """Copy the KC-history columns onto the test frame.

    ``test`` is modified in place and returned. Its KC column is first
    cleaned with ``wash_KC``; each history column is then filled row by row
    via ``find_temporal`` against ``train``. The column name is printed
    before each pass.
    """
    test['KC(Default)'] = test['KC(Default)'].apply(wash_KC)

    history_columns = ('KC History Today', 'KC History Yesterday', 'KC History Week')
    for column in history_columns:
        print(column)
        test[column] = test.apply(find_temporal, axis=1, args=(train, column))

    return test
    
# main function to generate temporal features
def temp_main(train,test):
    """Add the 'Unit' column and the KC-history columns to both frames.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with a "Problem Hierarchy" column plus whatever ``train_temp``
        and ``test_temp`` require. Both are modified in place.

    Returns
    -------
    tuple
        ``(train, test)`` after ``train_temp`` and ``test_temp`` have run.
    """
    # Operate on the arguments rather than on notebook-level globals, so the
    # function works for whichever pair of frames it is given.
    train["Unit"] = train.apply(split_unit,axis=1)
    test['Unit'] = test.apply(split_unit,axis=1)

    train = train_temp(train)
    test = test_temp(train,test)
    return train,test

## Correct First Attempt Rate

In [122]:

def gen_count_table(train,requested_list):
    """Count rows of ``train`` per key combination and first-attempt outcome.

    Note: 'Correct First Attempt' is appended to ``requested_list`` in place,
    so the caller's list is one element longer after the call.

    Returns
    -------
    pandas.Series
        Group sizes indexed by the (extended) ``requested_list`` columns.
    """
    outcome_column = 'Correct First Attempt'
    requested_list.append(outcome_column)
    selected = train[requested_list]
    return selected.groupby(requested_list).size()

def cfa_for_test_2(row,table,request_2):
    """Correct-first-attempt rate for a row, keyed on two columns.

    Parameters
    ----------
    row : mapping
        Must provide the columns named by ``request_2[0]`` and
        ``request_2[1]``; no other column is read.
    table : pandas.Series
        Counts indexed by (first key, second key, outcome) where outcome is
        1 for correct and 0 for incorrect, as built by ``gen_count_table``.
    request_2 : sequence of str
        Column names; only the first two entries are used.

    Returns
    -------
    float or int
        ``correct / (correct + incorrect)``; 0 when the key pair is unknown
        or has no correct attempts.
    """
    key1 = row[request_2[0]]
    key2 = row[request_2[1]]
    try:
        # One lookup gives the counts for this key pair, indexed by outcome.
        outcome_counts = table.loc[key1].loc[key2]
    except KeyError:
        return 0
    if 1 not in outcome_counts.index:
        return 0
    c = outcome_counts.loc[1]
    ic = outcome_counts.loc[0] if 0 in outcome_counts.index else 0
    return c / (c + ic)

def cfa_for_train_2(row,table,request_2):
    """Correct-first-attempt rate for a training row, keyed on two columns.

    Parameters
    ----------
    row : mapping
        Must provide the columns named by ``request_2[0]`` and
        ``request_2[1]``; no other column is read.
    table : pandas.Series
        Counts indexed by (first key, second key, outcome) where outcome is
        1 for correct and 0 for incorrect, as built by ``gen_count_table``.
    request_2 : sequence of str
        Column names; only the first two entries are used.

    Returns
    -------
    float or int
        ``correct / (correct + incorrect)``; 0 when the key pair has no
        correct attempts.

    Raises
    ------
    KeyError
        If the key pair is not present in ``table``.
    """
    key1 = row[request_2[0]]
    key2 = row[request_2[1]]
    # Counts for this key pair, indexed by outcome; fetched once.
    outcome_counts = table.loc[key1,key2]
    if 1 not in outcome_counts.index:
        return 0
    c = outcome_counts.loc[1]
    ic = outcome_counts.loc[0] if 0 in outcome_counts.index else 0
    return c / (c + ic)

def cfa_for_test_1(row,table,request):
    """Correct-first-attempt rate for a row, keyed on a single column.

    ``table`` holds counts indexed by (key, outcome) with outcome 1 for
    correct and 0 for incorrect. Returns ``correct / (correct + incorrect)``,
    or 0 when the key is unknown or has no correct attempts.
    """
    lookup_key = row[request[0]]
    try:
        per_outcome = table.loc[lookup_key]
    except KeyError:
        return 0
    if 1 not in per_outcome.index:
        return 0
    correct = per_outcome.loc[1]
    if 0 in per_outcome.index:
        wrong = per_outcome.loc[0]
    else:
        wrong = 0
    return correct / (correct + wrong)

def cfa_for_train_1(row,table,request):
    """Correct-first-attempt rate for a training row, keyed on one column.

    Parameters
    ----------
    row : mapping
        Must provide the column named by ``request[0]``; no other column is
        read.
    table : pandas.Series
        Counts indexed by (key, outcome) where outcome is 1 for correct and
        0 for incorrect, as built by ``gen_count_table``.
    request : sequence of str
        Column names; only the first entry is used.

    Returns
    -------
    float or int
        ``correct / (correct + incorrect)``; 0 when the key has no correct
        attempts.

    Raises
    ------
    KeyError
        If the key is not present in ``table``.
    """
    key1 = row[request[0]]
    # Counts for this key, indexed by outcome; fetched once.
    outcome_counts = table.loc[key1]
    if 1 not in outcome_counts.index:
        return 0
    c = outcome_counts.loc[1]
    ic = outcome_counts.loc[0] if 0 in outcome_counts.index else 0
    return c / (c + ic)


def get_cfa_both(train,test,request):
    """Compute the correct-first-attempt rate for train and test rows.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing the key columns; ``train`` must also contain
        'Correct First Attempt' (required by ``gen_count_table``).
    request : list of str
        Key column names (one or two). The list is left unmodified.

    Returns
    -------
    tuple
        ``(train_s, test_s)`` - one rate per row of each frame.
    """
    # Decide on the number of key columns before building the table, and hand
    # gen_count_table a copy: it appends the outcome column to the list it
    # receives, which must not leak into the caller's list.
    single_key = len(request) == 1
    cfa_table = gen_count_table(train,list(request))

    if single_key:
        train_s = train.apply(cfa_for_train_1,args= (cfa_table,request),axis=1)
        test_s = test.apply(cfa_for_test_1,args=(cfa_table,request),axis=1)
    else:
        train_s = train.apply(cfa_for_train_2,args= (cfa_table,request),axis=1)
        test_s = test.apply(cfa_for_test_2,args=(cfa_table,request),axis=1)

    return train_s,test_s

def get_cfa_test(train,test,request):
    """Compute the correct-first-attempt rate for test rows only.

    Parameters
    ----------
    train : pandas.DataFrame
        Source of the counts; must contain the key columns and
        'Correct First Attempt' (required by ``gen_count_table``).
    test : pandas.DataFrame
        Rows to score; must contain the key columns.
    request : list of str
        Key column names (one or two). The list is left unmodified.

    Returns
    -------
    pandas.Series
        One rate per row of ``test``.
    """
    # Decide on the number of key columns before building the table, and hand
    # gen_count_table a copy: it appends the outcome column to the list it
    # receives, which must not leak into the caller's list.
    single_key = len(request) == 1
    cfa_table = gen_count_table(train,list(request))

    if single_key:
        test_s = test.apply(cfa_for_test_1,args=(cfa_table,request),axis=1)
    else:
        test_s = test.apply(cfa_for_test_2,args=(cfa_table,request),axis=1)
    return test_s

def main_cfa(train,test):
    """Add the correct-first-attempt-rate columns to both frames.

    One column is added per key set: (student, unit) and (student). Column
    names come from ``nameOfCFAR``, which is defined outside this section.
    Both frames are modified in place and returned as ``(train, test)``.
    """
    key_sets = [["Anon Student Id", "Unit"], ["Anon Student Id"]]
    # Resolve every column name before any key list is passed on.
    column_names = [nameOfCFAR(keys) for keys in key_sets]
    for column, keys in zip(column_names, key_sets):
        train[column], test[column] = get_cfa_both(train, test, keys)
    return train, test

def cfa_test(train,test):
    """Add the correct-first-attempt-rate columns to the test frame only.

    One column is added per key set: (student, unit) and (student). Column
    names come from ``nameOfCFAR``, which is defined outside this section.
    ``test`` is modified in place and returned.
    """
    key_sets = [["Anon Student Id", "Unit"], ["Anon Student Id"]]
    # Resolve every column name before any key list is passed on.
    column_names = [nameOfCFAR(keys) for keys in key_sets]
    for column, keys in zip(column_names, key_sets):
        test[column] = get_cfa_test(train, test, keys)
    return test
    

## Ability Feature

In [172]:
# the relation between correct answers and the number of hints
def student_ability_KC_hint(row):
    """Return "Corrects" divided by e raised to "Hints"."""
    hint_penalty = np.exp(row["Hints"])
    return row["Corrects"] / hint_penalty

# the relation between correct answers and the KC's frequency
def student_ability_KC_frequency(row):
    """Return "Corrects" divided by (minimum opportunity + today's KC count + 1).

    "Opportunity(Default)" is a "~~"-separated string of integers; its
    smallest entry is used. Any non-string value counts as 0.
    """
    corrects = row["Corrects"]
    raw_opportunity = row["Opportunity(Default)"]
    if type(raw_opportunity) == str:
        least_opportunity = min([int(part) for part in raw_opportunity.split("~~")])
    else:
        least_opportunity = 0
    today_plus_one = int(row["KC History Today"] + 1)
    return corrects / (least_opportunity + today_plus_one)


def get_avg_KCF(row,table):
    """Fetch 'AVG_KCF' for the row's student and KC from ``table``.

    ``table`` is indexed by (student id, KC). Returns 0 when the lookup
    raises ``KeyError``.
    """
    student, component = row["Anon Student Id"], row["KC(Default)"]
    try:
        per_student = table.loc[student]
        return per_student.loc[component]['AVG_KCF']
    except KeyError:
        return 0
        

def get_avg_KCH(row,table):
    """Fetch 'AVG_KCH' for the row's student and KC from ``table``.

    ``table`` is indexed by (student id, KC). Returns 0 when the lookup
    raises ``KeyError``.
    """
    student, component = row["Anon Student Id"], row["KC(Default)"]
    try:
        per_student = table.loc[student]
        return per_student.loc[component]['AVG_KCH']
    except KeyError:
        return 0
    
def gen_ability_feature(data):
    """Build per-(student, KC) average ability scores.

    Side effect: adds the per-row columns "KC_F" and "KC_H" to ``data`` in
    place (from ``student_ability_KC_frequency`` and
    ``student_ability_KC_hint``).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain "Anon Student Id", "KC(Default)", "Corrects",
        "Incorrects" and whatever the two per-row helpers read.

    Returns
    -------
    pandas.DataFrame
        Indexed by ("Anon Student Id", "KC(Default)") with columns "AVG_KCH"
        and "AVG_KCF": the summed per-row scores divided by the summed
        number of attempts (corrects + incorrects).
    """
    data["KC_F"] = data.apply(student_ability_KC_frequency,axis = 1)
    data["KC_H"] = data.apply(student_ability_KC_hint,axis = 1)

    # Sum only the columns that are used below. Summing every column would
    # also try to add up string and datetime columns, which newer pandas
    # versions refuse to do.
    summed_columns = ["Corrects", "Incorrects", "KC_F", "KC_H"]
    sum_KCH_KCF = data.groupby(["Anon Student Id","KC(Default)"])[summed_columns].sum()

    attempts = sum_KCH_KCF["Corrects"] + sum_KCH_KCF["Incorrects"]
    sum_KCH_KCF["AVG_KCF"] = sum_KCH_KCF["KC_F"] / attempts
    sum_KCH_KCF["AVG_KCH"] = sum_KCH_KCF["KC_H"] / attempts

    return sum_KCH_KCF[["AVG_KCH","AVG_KCF"]]

def main_ability(train,test):
    """Add the two ability columns to both frames.

    The lookup table is built from ``train`` only (``gen_ability_feature``)
    and then applied row by row to ``train`` and ``test``. Both frames are
    modified in place and returned as ``(train, test)``.
    """
    ability_table = gen_ability_feature(train)
    lookup_args = (ability_table,)
    for frame in (train, test):
        frame["ability from KC and Hints"] = frame.apply(get_avg_KCH, args=lookup_args, axis=1)
        frame["ability from KC and Frequency"] = frame.apply(get_avg_KCF, args=lookup_args, axis=1)
    return train, test

In [186]:
def pv_norm(row):
    """Squash "Problem View" with x / (x + 1)."""
    views = row["Problem View"]
    return views / (views + 1)

def opp_norm(row):
    """Squash the smallest "Opportunity(Default)" entry with m / (m + 1).

    The cell is converted to text and split on "~~"; a token equal to "nan"
    (case-insensitive) counts as 0, every other token is parsed as an int.
    """
    tokens = str(row["Opportunity(Default)"]).split("~~")
    counts = []
    for token in tokens:
        counts.append(0 if token.lower() == "nan" else int(token))
    smallest = np.asarray(counts).min()
    return smallest / (smallest + 1)
    


In [185]:
# Add x / (x + 1) versions of problem view and opportunity, row by row.
train_df["Problem View(Norm)"] = train_df.apply(pv_norm, axis=1)
train_df["Opportunity(Norm)"] = train_df.apply(opp_norm, axis=1)

# Same two columns for the test frame.
test_df["Problem View(Norm)"] = test_df.apply(pv_norm, axis=1)
test_df["Opportunity(Norm)"] = test_df.apply(opp_norm, axis=1)

In [None]:
# Attach the correct-first-attempt-rate columns to both frames.
train_df, test_df = main_cfa(train=train_df, test=test_df)

In [None]:
# Attach the unit and KC-history columns to both frames.
train_df, test_df = temp_main(train=train_df, test=test_df)

In [None]:
# add ability features
# Bind the result back to train_df (not a misspelled, otherwise unused name)
# so this cell follows the same pattern as the other feature cells.
train_df,test_df = main_ability(train_df,test_df)

In [187]:
# save data
# Both frames are written tab-separated; to_csv also writes the DataFrame
# index as a leading column by default.
# NOTE(review): the train frame goes to "data/agg_train.csv", the same path
# train_filepath reads from at the top of the notebook, so a re-run will load
# this output as its input - confirm that overwriting the source is intended.
train_df.to_csv("data/agg_train.csv",sep='\t')
test_df.to_csv("data/agg_test.csv",sep='\t')