# Rule-based model

In [None]:
# !python -m spacy download es_core_news_md

In [2]:
# import
import json
import pandas as pd
import altair as alt
from collections import defaultdict

from sklearn.metrics import accuracy_score

## Load data

In [16]:
# read train, val and test data
# The encoding is passed explicitly so the JSON files are decoded the same way on
# every platform instead of with the locale-dependent default.
# NOTE(review): assumes the feature files are UTF-8 encoded — confirm.
with open('../data/train_features.json', 'r', encoding='utf-8') as f:
    train_feat = json.load(f)
with open('../data/val_features.json', 'r', encoding='utf-8') as f:
    val_feat = json.load(f)
with open('../data/test_features.json', 'r', encoding='utf-8') as f:
    test_feat = json.load(f)

# tabular view of each split (one column per key of the loaded dict)
train_feat_df = pd.DataFrame(train_feat)
val_feat_df = pd.DataFrame(val_feat)
test_feat_df = pd.DataFrame(test_feat)


# features: every column except the 'level' target
X_train = train_feat_df.drop(['level'], axis=1)
X_val = val_feat_df.drop(['level'], axis=1)
X_test = test_feat_df.drop(['level'], axis=1)

# targets: the 'level' column as a plain list
y_train = train_feat_df['level'].tolist()
y_val = val_feat_df['level'].tolist()
y_test = test_feat_df['level'].tolist()

## Functions

In [4]:
def get_feature_average(feature_name, feat_dict=None):
    '''
    Given the name of a feature, print the average value of that feature for A1, A2, and B level texts respectively.
    
    Input:
    feature_name: (str) A key in the feature matrix
    feat_dict: (dict, optional) A feature matrix of the format
        {feature1: [value1, value2, ...], ..., 'level': [label1, label2, ...]}.
        Defaults to the notebook-level training features (train_feat).
    
    Return:
    (None) The averages are printed, not returned. A level without any texts is
    reported as "n/a" instead of raising ZeroDivisionError. Labels other than
    A1, A2 and B are reported with an "Error at" line and skipped.
    '''
    if feat_dict is None:
        # Keep the original behaviour for existing one-argument calls.
        feat_dict = train_feat

    levels = ('A1', 'A2', 'B')
    values_by_level = {level: [] for level in levels}
    feature_values = feat_dict[feature_name]

    for i, label in enumerate(feat_dict['level']):
        if label in values_by_level:
            values_by_level[label].append(feature_values[i])
        else:
            print(f'Error at: {i}, label {label}')

    for level in levels:
        level_values = values_by_level[level]
        if level_values:
            print(f'Average {feature_name} for {level} texts: {sum(level_values)/len(level_values)}')
        else:
            # No text carries this label, so there is nothing to average.
            print(f'Average {feature_name} for {level} texts: n/a (no texts with this level)')

In [5]:
def predict(feat_dict, threshold_dict):
    '''
    Apply the rule-based classifier to every text in a feature matrix.

    A text is labelled 'A1' when both its type proportion and its token proportion
    are above the respective A1_A2 thresholds, 'B' when both are below the
    respective A2_B thresholds, and 'A2' in every other case.

    Input:
    feat_dict: (dict) A feature matrix of the format {feature1: [value1, value2, ...], feature2: [value1, value2, ...], ...}.
        The number of predictions equals the length of feat_dict['preprocessed_text'].
    threshold_dict: (dict) Thresholds for the rule-based prediction, structured as
        {feature1: {A1_A2: 0.5, A2_B: 0.5}, feature2: {A1_A2: 0.3, A2_B: 0.3}, ...},
        where A1_A2 is the threshold that separates A1 from A2 and A2_B the one that separates A2 from B.

    Return:
    (list) A list of predictions, one per text
    '''
    types_key = 'proportion_of_A_level_types'
    tokens_key = 'proportion_of_A_level_tokens'

    def classify(idx):
        # Both proportions must clear the upper cut-off for the easiest level.
        if (feat_dict[types_key][idx] > threshold_dict[types_key]['A1_A2']
                and feat_dict[tokens_key][idx] > threshold_dict[tokens_key]['A1_A2']):
            return 'A1'
        # Both proportions must fall under the lower cut-off for the hardest level.
        if (feat_dict[types_key][idx] < threshold_dict[types_key]['A2_B']
                and feat_dict[tokens_key][idx] < threshold_dict[tokens_key]['A2_B']):
            return 'B'
        # Everything in between (or with mixed signals) is the middle level.
        return 'A2'

    n_texts = len(feat_dict['preprocessed_text'])
    return [classify(idx) for idx in range(n_texts)]

## Proportion of A level types

In [6]:
# print the average proportion_of_A_level_types of the training texts for each level (A1, A2, B)
get_feature_average('proportion_of_A_level_types')

Average proportion_of_A_level_types for A1 texts: 0.3984681007030331
Average proportion_of_A_level_types for A2 texts: 0.30988777298143
Average proportion_of_A_level_types for B texts: 0.240087423883735


## Proportion of A level tokens

In [7]:
# print the average proportion_of_A_level_tokens of the training texts for each level (A1, A2, B)
get_feature_average('proportion_of_A_level_tokens')

Average proportion_of_A_level_tokens for A1 texts: 0.4496423698117422
Average proportion_of_A_level_tokens for A2 texts: 0.4053670055676591
Average proportion_of_A_level_tokens for B texts: 0.3039061349513798


## Proportion of tenses (not used)

In [8]:
# future tense: print the per-level (A1, A2, B) training average of the 'Fut' feature
get_feature_average('Fut')

Average Fut for A1 texts: 0.018207738779754044
Average Fut for A2 texts: 0.038297377547088694
Average Fut for B texts: 0.03598815179880061


In [9]:
# past tense: print the per-level (A1, A2, B) training average of the 'Past' feature
get_feature_average('Past')

Average Past for A1 texts: 0.11515991489032376
Average Past for A2 texts: 0.2746258069339504
Average Past for B texts: 0.22553401711675003


In [10]:
# present tense: print the per-level (A1, A2, B) training average of the 'Pres' feature
get_feature_average('Pres')

Average Pres for A1 texts: 0.825997669319173
Average Pres for A2 texts: 0.5621140348279573
Average Pres for B texts: 0.5883038516116565


In [11]:
# 'Imp' feature (described as "imperative tense" in the original notes): print its
# per-level (A1, A2, B) training average
# NOTE(review): if these keys are Universal Dependencies Tense values, 'Imp' denotes the
# imperfect tense rather than the imperative — confirm against the feature extraction code
get_feature_average('Imp')

Average Imp for A1 texts: 0.04063467701074908
Average Imp for A2 texts: 0.12496278069100375
Average Imp for B texts: 0.14197725816131687


## Predict

In [12]:
# try all combinations of thresholds
from itertools import product

# Candidate cut-offs:
#   a / b -> A1_A2 / A2_B thresholds on proportion_of_A_level_types
#   c / d -> A1_A2 / A2_B thresholds on proportion_of_A_level_tokens
a = [0.30, 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39]
b = [0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.30]
c = [0.40, 0.41, 0.42, 0.43, 0.44]
d = [0.30, 0.32, 0.34, 0.36, 0.38, 0.40]

best_score = 0
best_comb = []

# product() visits the combinations in the same order as four nested loops
# (the last iterable varies fastest), so ties are resolved exactly as before.
for a_, b_, c_, d_ in product(a, b, c, d):
    threshold_dict = {'proportion_of_A_level_types': {'A1_A2': a_, 'A2_B': b_},
                      'proportion_of_A_level_tokens': {'A1_A2': c_, 'A2_B': d_}}
    train_preds = predict(train_feat, threshold_dict)
    # accuracy_score takes the ground-truth labels first, then the predictions
    score = accuracy_score(y_train, train_preds)
    # strict '>' keeps the first combination that reaches the best training accuracy
    if score > best_score:
        best_score = score
        best_comb = [a_, b_, c_, d_]

In [13]:
# report the best training accuracy found by the threshold search and the combination
# that produced it, ordered as [types A1_A2, types A2_B, tokens A1_A2, tokens A2_B]
print(f'Best score: {best_score}')
print(f'Best combination of thresholds: {best_comb}')

Best score: 0.622568093385214
Best combination of thresholds: [0.3, 0.3, 0.4, 0.4]


In [17]:
# Fixed thresholds for the final model; these are the values printed as the best
# combination by the threshold search on the training data.
threshold_dict = {
    'proportion_of_A_level_types': dict(A1_A2=0.3, A2_B=0.3),
    'proportion_of_A_level_tokens': dict(A1_A2=0.4, A2_B=0.4),
}

# Run the rule-based classifier on each split with the same thresholds.
train_preds, val_preds, test_preds = [
    predict(split_feat, threshold_dict)
    for split_feat in (train_feat, val_feat, test_feat)
]

In [18]:
# Report accuracy on every split.
# accuracy_score takes the ground-truth labels first, then the predictions; the value is
# the same either way for accuracy, but the documented order keeps the calls consistent
# with other sklearn metrics where the order matters.
print('Train accuracy:', accuracy_score(y_train, train_preds))
print('Validation accuracy:', accuracy_score(y_val, val_preds))
print('Test accuracy:', accuracy_score(y_test, test_preds))

Train accuracy: 0.622568093385214
Validation accuracy: 0.59375
Test accuracy: 0.53125
