## Welcome

This notebook was used for the LTRC 2019 demo. 

It loads and analyzes various datasets and then creates a number of statistical models to see which ones perform better. Unsurprisingly, with more data, the combined model performed best. An additional model excluding two variables (age and L1) was output to be used publicly.

In [1]:
import re
import pandas as pd

In [2]:
from AES_function import analyze

In [3]:
raw_CFE = pd.read_excel('Kyle\'s model/CFE_dataset.xlsx')

# Load and Analyze FCE Data

Append the two essays into one and use the overall score. 

In [14]:
fce_analyzed = []
for i in range(0, len(raw_CFE)):
    row = raw_CFE.iloc[i]
    
    data = {}
#     data['essay1'] = row['1-just_text']
#     data['essay2'] = row['2-just_text']
    
    essays = row['1-just_text'] + ' ' + row['2-just_text']
    essays = re.sub('\n', ' ', essays)  # replace whitespace

#     data['combined_essays'] = essays
    data['age_code'] = row['age_code']
    data['language_id'] = row['language_id']
    data['score'] = row['score']  # Score is out of 40
    # Combine the row data with the feature_set data
    combined = {**data, **analyze(essays)}

    fce_analyzed.append(combined)
    
    if i % 100 == 0:
        print('completed', i)

completed 0
completed 100
completed 200
completed 300
completed 400
completed 500
completed 600
completed 700
completed 800
completed 900
completed 1000
completed 1100
completed 1200


# Load and Analyce ELC Data

In [7]:
elc_df = pd.read_excel('kyle_certified_clean_train_set.xlsx', sheet_name='Sheet1')

In [8]:
elc_analyzed = []
print(f'{len(elc_df)} to analyze!')

for i in range(0, len(elc_df)):
    row = elc_df.iloc[i]
    
    data = {}
    
    data['age_code'] = row['age_code']
    data['language_id'] = row['language_id']
    data['score'] = row['score']
    combined = {**data, **analyze(row['essay'])}

    elc_analyzed.append(combined)
    
    if i % 100 == 0:
        print('completed', i)

3616 to analyze!
completed 0
completed 100
completed 200
completed 300
completed 400
completed 500
completed 600
completed 700
completed 800
completed 900
completed 1000
completed 1100
completed 1200
completed 1300
completed 1400
completed 1500
completed 1600
completed 1700
completed 1800
completed 1900
completed 2000
completed 2100
completed 2200
completed 2300
completed 2400
completed 2500
completed 2600
completed 2700
completed 2800
completed 2900
completed 3000
completed 3100
completed 3200
completed 3300
completed 3400
completed 3500
completed 3600


# Model Building Function

In [33]:
from sklearn.model_selection import train_test_split
from sklearn import ensemble, metrics
from sklearn.metrics import mean_absolute_error
from sklearn.externals import joblib
from sklearn.model_selection import KFold

In [34]:
def build_model(df, score_column, output='model'):
    """ `df` is the dataframe to be passed in
        `scores` is the name of the column you want to analyze as scores
        
        returns a model and prints general statistical data at time of calling"""
    
    scores = df[score_column]
    # make sure to only select the score column
    
    voulu = ['age_code', 'ari', 'avg_len_word', 'cli', 'conjunctions', 'cttr',
             'dcrs', 'determiners', 'dw', 'english_usage', 'fkg', 'fre',
             'function_ttr', 'gf', 'grammar_chk', 'language_id', 'lwf',
             'n_bigram_lemma_types', 'n_bigram_lemmas', 'n_trigram_lemma_types',
             'n_trigram_lemmas', 'ncontent_words', 'nfunction_words', 'nlemma_types',
             'nlemmas', 'noun_ttr', 'num_tokens', 'num_types', 'pct_rel_trigrams',
             'pct_transitions', 'rank_avg', 'rank_total', 's1', 's1a', 's1b', 's1c',
             's2', 's2a', 's2b', 's2c', 's3', 's3a', 's3b', 's3c', 's4', 's4a',
             's4b', 's4c', 'sent_density', 'spelling_perc', 'ttr']
    
    features = df[voulu]
    # make sure to only select the feature columns
    
    # there are a few responses with weird scores
    bad = []
    for i, value in enumerate(scores.values):
        try:
            float(value)
        except (ValueError, TypeError):
            bad.append(i)
    
    features = features.drop(index=bad)
    
    scores = scores.drop(index=bad)
    
    
    # create numpy arrays of values
    X = features.values
    y = scores.values
    
    kfolds = KFold(n_splits=12, shuffle=True)
    
    for train_index, test_index in kfolds.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
    
    model = ensemble.GradientBoostingRegressor(
        n_estimators=500,
        learning_rate=0.02,
        max_depth=4,
        min_samples_leaf=9,
        max_features=0.3,
        loss='lad',
        random_state=0)
    
    # prints a bunch of data about the model
    print('`model.fit` results:')
    print(model.fit(X, y))
    
    train_error = mean_absolute_error(y_train, model.predict(X_train))
    test_error = mean_absolute_error(y_test, model.predict(X_test))

    print('Mean Absolute Error:')
    print('Train error:', train_error, sep='\t')
    print('Test error:', test_error, sep='\t')
    print()
    
    r2_train = metrics.r2_score(y_train, model.predict(X_train))
    r2_test = metrics.r2_score(y_test, model.predict(X_test))

    # r2 is the proportion of the variance in the scores that is predictable from the features
    print('r2 scores of both train/test:')
    print('r2_train:', r2_train, sep='\t')
    print('r2_test:', r2_test, sep='\t')
    print()
    
    if output == 'model':
        return model
    elif output == 'pickle':
        name = input("INPUT `{name}.pkl` and press [ENTER]")
        joblib.dump(model, name)
    

In [22]:
fce_dataframe = pd.DataFrame(fce_analyzed)

## CLC Data Test/Train Results

In [35]:
fce_gbr_model = build_model(fce_dataframe, 'score')

`model.fit` results:
GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.02, loss='lad', max_depth=4, max_features=0.3,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=9,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=500, n_iter_no_change=None, presort='auto',
             random_state=0, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)
Mean Absolute Error:
Train error:	2.2845839249248145
Test error:	2.8586375357788802

r2 scores of both train/test:
r2_train:	0.6351009332989177
r2_test:	0.5993034515626057



In [36]:
def print_model_info(model, column_names):
#     print(len(model.feature_importances_))
    results = sorted(list(zip(column_names,
                          model.feature_importances_)),
                     key=lambda x: x[1], reverse=True)
    for k, v in results:
        print(k, v, sep='\t')

In [40]:
elc_df = pd.DataFrame(elc_analyzed)

In [48]:
elc_actual_scores = elc_df['score']

In [42]:
voulu = ['age_code', 'ari', 'avg_len_word', 'cli', 'conjunctions', 'cttr',
       'dcrs', 'determiners', 'dw', 'english_usage', 'fkg', 'fre',
       'function_ttr', 'gf', 'grammar_chk', 'language_id', 'lwf',
       'n_bigram_lemma_types', 'n_bigram_lemmas', 'n_trigram_lemma_types',
       'n_trigram_lemmas', 'ncontent_words', 'nfunction_words', 'nlemma_types',
       'nlemmas', 'noun_ttr', 'num_tokens', 'num_types', 'pct_rel_trigrams',
       'pct_transitions', 'rank_avg', 'rank_total', 's1', 's1a', 's1b', 's1c',
       's2', 's2a', 's2b', 's2c', 's3', 's3a', 's3b', 's3c', 's4', 's4a',
       's4b', 's4c', 'sent_density', 'spelling_perc', 'ttr']

In [43]:
elc_features = elc_df[voulu]

## Predicted ELC scores 

(predicted as iLexiR, which are out of 40)

In [44]:
fce_gbr_model.predict(elc_features.values)

array([25.89785554, 34.76541296, 33.77101161, ..., 15.18876042,
       15.48425496, 29.32852078])

In [54]:
elc_pred_scores = fce_gbr_model.predict(elc_features.values)
print('len', len(pred_scores))

len 3616


## Import regression metrics

In [47]:
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error,\
median_absolute_error, r2_score, mean_squared_log_error



In [92]:
feedback = {}
feedback['explained_variance_score'] = explained_variance_score(elc_actual_scores.values * 8, elc_pred_scores)
feedback['mean_absolute_error'] = mean_absolute_error(elc_actual_scores.values * 8, elc_pred_scores)
feedback['mean_squared_error'] = mean_squared_error(elc_actual_scores.values * 8, elc_pred_scores)
feedback['mean_squared_log_error'] = mean_squared_log_error(elc_actual_scores.values * 8, elc_pred_scores)
feedback['median_absolute_error'] = median_absolute_error(elc_actual_scores.values * 8, elc_pred_scores)





# Output Feedback Metrics

(How well does the CFE `iLexIR` data predict the BYU data?)

In [93]:
for k, v in feedback.items():
    print('{:25} {}'.format(k, v))

explained_variance_score  0.10406482613832191
mean_absolute_error       121.27637931234035
mean_squared_error        16979.74118458355
mean_squared_log_error    3.2062134157092905
median_absolute_error     122.61835951115071


## Correlation of actual scores to predicted scores

In [51]:
from numpy import corrcoef

In [57]:
corrcoef(elc_actual_scores.values, pred_scores)

array([[1.        , 0.70191912],
       [0.70191912, 1.        ]])

# Output Data for Visualization

In [58]:
d = {'actual_scores': elc_actual_scores.values, 'pred_scores': elc_pred_scores}
scores = pd.DataFrame(d)
scores.describe()

Unnamed: 0,actual_scores,pred_scores
count,3616.0,3616.0
mean,17.968833,22.613985
std,6.342027,3.983842
min,0.0,14.485268
25%,14.4,19.709239
50%,18.15,22.387664
75%,22.1,25.316536
max,40.0,35.786499


In [59]:
scores.corr()

Unnamed: 0,actual_scores,pred_scores
actual_scores,1.0,0.701919
pred_scores,0.701919,1.0


In [60]:
scores.to_csv('output_scores.csv')

# Try a `LinearRegression` model

In [71]:
from sklearn.linear_model import LinearRegression

Our two datasets as dataframes are stored as:
- `elc_df` and
- `fce_dataframe`

In [76]:
def linear_model(df, score_column, output='model'):
    """ `df` is the dataframe to be passed in
        `scores` is the name of the column you want to analyze as scores
        
        returns a model and prints general statistical data at time of calling"""
    
    scores = df[score_column]
    # make sure to only select the score column
    
    voulu = ['age_code', 'ari', 'avg_len_word', 'cli', 'conjunctions', 'cttr',
             'dcrs', 'determiners', 'dw', 'english_usage', 'fkg', 'fre',
             'function_ttr', 'gf', 'grammar_chk', 'language_id', 'lwf',
             'n_bigram_lemma_types', 'n_bigram_lemmas', 'n_trigram_lemma_types',
             'n_trigram_lemmas', 'ncontent_words', 'nfunction_words', 'nlemma_types',
             'nlemmas', 'noun_ttr', 'num_tokens', 'num_types', 'pct_rel_trigrams',
             'pct_transitions', 'rank_avg', 'rank_total', 's1', 's1a', 's1b', 's1c',
             's2', 's2a', 's2b', 's2c', 's3', 's3a', 's3b', 's3c', 's4', 's4a',
             's4b', 's4c', 'sent_density', 'spelling_perc', 'ttr']
    
    features = df[voulu]  # means `wanted` in french
    # make sure to only select the feature columns
    
    # there are a few responses with weird scores
    bad = []
    for i, value in enumerate(scores.values):
        try:
            float(value)
        except (ValueError, TypeError):
            bad.append(i)
    
    features = features.drop(index=bad)
    
    scores = scores.drop(index=bad)
    
    
    # create numpy arrays of values
    X = features.values
    y = scores.values
    
    kfolds = KFold(n_splits=12, shuffle=True)
    
    for train_index, test_index in kfolds.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
    
    print('`model.fit` results:')
    # prints a bunch of data about the model
    model = LinearRegression().fit(X, y)
    print(model.fit(X, y))
    
    train_error = mean_absolute_error(y_train, model.predict(X_train))
    test_error = mean_absolute_error(y_test, model.predict(X_test))

    print('Mean Absolute Error:')
    print('Train error:', train_error, sep='\t')
    print('Test error:', test_error, sep='\t')
    print()
    
    r2_train = metrics.r2_score(y_train, model.predict(X_train))
    r2_test = metrics.r2_score(y_test, model.predict(X_test))

    # r2 is the proportion of the variance in the scores that is predictable from the features
    print('r2 scores of both train/test:')
    print('r2_train:', r2_train, sep='\t')
    print('r2_test:', r2_test, sep='\t')
    print()
    
    if output == 'model':
        return model
    elif output == 'pickle':
        name = input("INPUT `{name}.pkl` and press [ENTER]")
        joblib.dump(model, name)
    

## FCE data predicting ELC data (`LinearRegression`)

In [78]:
fce_linear = linear_model(fce_dataframe, 'score')

`model.fit` results:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)
Mean Absolute Error:
Train error:	3.262040211073515
Test error:	3.3490489823818477

r2 scores of both train/test:
r2_train:	0.4244652793649637
r2_test:	0.3900985843158298



In [79]:
elc_values = elc_features
elc_targets = elc_actual_scores

fce_linear_preds = fce_linear.predict(elc_values)
fce_linear_preds

array([ 24.112961  ,  27.18838204,  31.36013423, ..., -22.90866452,
       -19.95565726,  28.45143507])

In [80]:
corrcoef(elc_targets, fce_linear_preds)

array([[1.        , 0.63506871],
       [0.63506871, 1.        ]])

# Build a combined model and test it

combine the analyzed CFE and BYU_ELC data and build a few models with it (using kfold validation):
- `GradientBoostingRegressor`

In [64]:
combined_model = elc_analyzed + fce_analyzed
len(combined_model)

4853

In [65]:
comb_analyzed = pd.DataFrame(combined_model)

In [66]:
comb_model = build_model(comb_analyzed, 'score')

`model.fit` results:
GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.02, loss='lad', max_depth=4, max_features=0.3,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=9,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=500, n_iter_no_change=None, presort='auto',
             random_state=0, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)
Mean Absolute Error:
Train error:	2.403447835908283
Test error:	2.5535642426314715

r2 scores of both train/test:
r2_train:	0.8124452471700421
r2_test:	0.7852482046513279



## Combined Model 

Using linear regression

In [176]:
def combined_build_model_without_age_and_lang_id(df, score_column, output='model'):
    """ `df` is the dataframe to be passed in
        `scores` is the name of the column you want to analyze as scores
        
        returns a model and prints general statistical data at time of calling"""
    
    scores = df[score_column]
    # make sure to only select the score column
    
    voulu = ['ari', 'avg_len_word', 'cli', 'conjunctions', 'cttr',
             'dcrs', 'determiners', 'dw', 'english_usage', 'fkg', 'fre',
             'function_ttr', 'gf', 'grammar_chk', 'lwf',
             'n_bigram_lemma_types', 'n_bigram_lemmas', 'n_trigram_lemma_types',
             'n_trigram_lemmas', 'ncontent_words', 'nfunction_words', 'nlemma_types',
             'nlemmas', 'noun_ttr', 'num_tokens', 'num_types', 'pct_rel_trigrams',
             'pct_transitions', 'rank_avg', 'rank_total', 's1', 's1a', 's1b', 's1c',
             's2', 's2a', 's2b', 's2c', 's3', 's3a', 's3b', 's3c', 's4', 's4a',
             's4b', 's4c', 'sent_density', 'spelling_perc', 'ttr']
    
    features = df[voulu]
    # make sure to only select the feature columns
    
    # there are a few responses with weird scores
    bad = []
    for i, value in enumerate(scores.values):
        try:
            float(value)
        except (ValueError, TypeError):
            bad.append(i)
    
    features = features.drop(index=bad)
    
    scores = scores.drop(index=bad)
    
    
    # create numpy arrays of values
    X = features.values
    y = scores.values
    
    kfolds = KFold(n_splits=12, shuffle=True)
    
    for train_index, test_index in kfolds.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
    
    model = ensemble.GradientBoostingRegressor(
        n_estimators=500,
        learning_rate=0.02,
        max_depth=4,
        min_samples_leaf=9,
        max_features=0.3,
        loss='ls',
        random_state=0)
    
    # prints a bunch of data about the model
    print('`model.fit` results:')
    print(model.fit(X, y))
    
    train_error = mean_absolute_error(y_train, model.predict(X_train))
    test_error = mean_absolute_error(y_test, model.predict(X_test))

    print('Mean Absolute Error:')
    print('Train error:', train_error, sep='\t')
    print('Test error:', test_error, sep='\t')
    print()
    
    r2_train = metrics.r2_score(y_train, model.predict(X_train))
    r2_test = metrics.r2_score(y_test, model.predict(X_test))

    # r2 is the proportion of the variance in the scores that is predictable from the features
    print('r2 scores of both train/test:')
    print('r2_train:', r2_train, sep='\t')
    print('r2_test:', r2_test, sep='\t')
    print()

    output = {}
    output['predictions'] = model.predict(X)
    output['actual_scores'] = y
    output['model'] = model
    
    return output
    

In [180]:
c_ls_model = combined_build_model_without_age_and_lang_id(comb_analyzed, 'score')

`model.fit` results:
GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.02, loss='ls', max_depth=4, max_features=0.3,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=9,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=500, n_iter_no_change=None, presort='auto',
             random_state=0, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)
Mean Absolute Error:
Train error:	2.379653312177093
Test error:	2.4853308176068967

r2 scores of both train/test:
r2_train:	0.834073647432878
r2_test:	0.8002512288445411



In [178]:
import pickle

In [181]:
with open('elc_clc_combined_LRTC_model_13_march.pkl', 'wb') as f:
    pickle.dump(c_ls_model, f)

In [81]:
def combined_build_model(df, score_column, output='model'):
    """ `df` is the dataframe to be passed in
        `scores` is the name of the column you want to analyze as scores
        
        returns a model and prints general statistical data at time of calling"""
    
    scores = df[score_column]
    # make sure to only select the score column
    
    voulu = ['age_code', 'ari', 'avg_len_word', 'cli', 'conjunctions', 'cttr',
             'dcrs', 'determiners', 'dw', 'english_usage', 'fkg', 'fre',
             'function_ttr', 'gf', 'grammar_chk', 'language_id', 'lwf',
             'n_bigram_lemma_types', 'n_bigram_lemmas', 'n_trigram_lemma_types',
             'n_trigram_lemmas', 'ncontent_words', 'nfunction_words', 'nlemma_types',
             'nlemmas', 'noun_ttr', 'num_tokens', 'num_types', 'pct_rel_trigrams',
             'pct_transitions', 'rank_avg', 'rank_total', 's1', 's1a', 's1b', 's1c',
             's2', 's2a', 's2b', 's2c', 's3', 's3a', 's3b', 's3c', 's4', 's4a',
             's4b', 's4c', 'sent_density', 'spelling_perc', 'ttr']
    
    features = df[voulu]
    # make sure to only select the feature columns
    
    # there are a few responses with weird scores
    bad = []
    for i, value in enumerate(scores.values):
        try:
            float(value)
        except (ValueError, TypeError):
            bad.append(i)
    
    features = features.drop(index=bad)
    
    scores = scores.drop(index=bad)
    
    
    # create numpy arrays of values
    X = features.values
    y = scores.values
    
    kfolds = KFold(n_splits=12, shuffle=True)
    
    for train_index, test_index in kfolds.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
    
    model = ensemble.GradientBoostingRegressor(
        n_estimators=500,
        learning_rate=0.02,
        max_depth=4,
        min_samples_leaf=9,
        max_features=0.3,
        loss='ls',
        random_state=0)
    
    # prints a bunch of data about the model
    print('`model.fit` results:')
    print(model.fit(X, y))
    
    train_error = mean_absolute_error(y_train, model.predict(X_train))
    test_error = mean_absolute_error(y_test, model.predict(X_test))

    print('Mean Absolute Error:')
    print('Train error:', train_error, sep='\t')
    print('Test error:', test_error, sep='\t')
    print()
    
    r2_train = metrics.r2_score(y_train, model.predict(X_train))
    r2_test = metrics.r2_score(y_test, model.predict(X_test))

    # r2 is the proportion of the variance in the scores that is predictable from the features
    print('r2 scores of both train/test:')
    print('r2_train:', r2_train, sep='\t')
    print('r2_test:', r2_test, sep='\t')
    print()

    output = {}
    output['predictions'] = model.predict(X)
    output['actual_scores'] = y
    output['model'] = model
    
    return output
    

In [83]:
# combined, least-squares model
c_ls_model = combined_build_model(comb_analyzed, 'score')

`model.fit` results:
GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.02, loss='ls', max_depth=4, max_features=0.3,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=9,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=500, n_iter_no_change=None, presort='auto',
             random_state=0, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)
Mean Absolute Error:
Train error:	2.360482423992901
Test error:	2.32169958721344

r2 scores of both train/test:
r2_train:	0.8341340712062068
r2_test:	0.8575492864872207



In [84]:
corrcoef(c_ls_model['predictions'], c_ls_model['actual_scores'])

array([[1.        , 0.91516276],
       [0.91516276, 1.        ]])

In [85]:
d = {'c_ls_actual_scores': c_ls_model['actual_scores'], 
     'c_ls_pred_scores': c_ls_model['predictions']}
c_ls_scores = pd.DataFrame(d)
c_ls_scores.describe()

Unnamed: 0,c_ls_actual_scores,c_ls_pred_scores
count,4853.0,4853.0
mean,20.504492,20.504492
std,7.50194,6.603465
min,0.0,1.724722
25%,15.0,15.893082
50%,20.1,20.999084
75%,25.4,25.222517
max,40.0,39.386296


In [86]:
c_ls_scores.to_csv('6mar_c_ls_model_output_scores.csv')

# Demo of Combined Model

In [123]:
demo_set = pd.read_excel('demo_set.xlsx')

In [143]:
elc_demo = demo_set[demo_set['which'] == 'elc']

In [144]:
elc_demo_feats = []
for i in range(0, len(elc_demo)):
    row = elc_demo.iloc[i]
    
    data = {}
    
    data['age_code'] = row['age_code']
    data['language_id'] = row['language_id']
    data['score'] = row['score']
    combined = {**data, **analyze(row['essay'])}

    elc_demo_feats.append(combined)

elc_demo_df = pd.DataFrame(elc_demo_feats)
wanted_features = ['age_code', 'ari', 'avg_len_word', 'cli', 'conjunctions', 'cttr',
                   'dcrs', 'determiners', 'dw', 'english_usage', 'fkg', 'fre',
                   'function_ttr', 'gf', 'grammar_chk', 'language_id', 'lwf',
                   'n_bigram_lemma_types', 'n_bigram_lemmas', 'n_trigram_lemma_types',
                   'n_trigram_lemmas', 'ncontent_words', 'nfunction_words', 'nlemma_types',
                   'nlemmas', 'noun_ttr', 'num_tokens', 'num_types', 'pct_rel_trigrams',
                   'pct_transitions', 'rank_avg', 'rank_total', 's1', 's1a', 's1b', 's1c',
                   's2', 's2a', 's2b', 's2c', 's3', 's3a', 's3b', 's3c', 's4', 's4a',
                   's4b', 's4c', 'sent_density', 'spelling_perc', 'ttr']

elc_demo_features = elc_demo_df[wanted_features]
elc_demo_scores = elc_demo_df['score'].values

In [156]:
print('predict', 'actual', sep='\t')
for p, t in zip(c_ls_model['model'].predict(elc_demo_features), elc_demo_scores):
    print(round(p, 2), t, sep='\t')

predict	actual
9.95	5.0
3.77	5.05
16.48	20.0
15.93	20.0
29.91	33.25
31.83	33.25


In [127]:
clc_demo = demo_set[demo_set['which'] == 'clc']

In [154]:
clc_demo_feats = []
for i in range(0, len(elc_demo)):
    row = clc_demo.iloc[i]
    
    data = {}
    
    data['age_code'] = row['age_code']
    data['language_id'] = row['language_id']
    data['score'] = row['score']
    combined = {**data, **analyze(row['essay'])}

    clc_demo_feats.append(combined)

clc_demo_df = pd.DataFrame(clc_demo_feats)
wanted_features = ['age_code', 'ari', 'avg_len_word', 'cli', 'conjunctions', 'cttr',
                   'dcrs', 'determiners', 'dw', 'english_usage', 'fkg', 'fre',
                   'function_ttr', 'gf', 'grammar_chk', 'language_id', 'lwf',
                   'n_bigram_lemma_types', 'n_bigram_lemmas', 'n_trigram_lemma_types',
                   'n_trigram_lemmas', 'ncontent_words', 'nfunction_words', 'nlemma_types',
                   'nlemmas', 'noun_ttr', 'num_tokens', 'num_types', 'pct_rel_trigrams',
                   'pct_transitions', 'rank_avg', 'rank_total', 's1', 's1a', 's1b', 's1c',
                   's2', 's2a', 's2b', 's2c', 's3', 's3a', 's3b', 's3c', 's4', 's4a',
                   's4b', 's4c', 'sent_density', 'spelling_perc', 'ttr']

clc_demo_features = clc_demo_df[wanted_features]
clc_demo_scores = clc_demo_df['score'].values

In [157]:
print('predict', 'actual', sep='\t')
for p, t in zip(c_ls_model['model'].predict(clc_demo_features), clc_demo_scores):
    print(round(p, 2), t, sep='\t')

predict	actual
20.13	9.0
18.47	11.0
24.37	20.0
20.53	21.0
31.73	37.0
32.92	37.0
