In [1]:
import json
import csv
from collections import Counter 
from textstat.textstat import textstat
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from wordfreq import word_frequency

In [2]:
tokenizer = RegexpTokenizer(r'\w+')
with open('z_reading_data.json') as json_data:
    reading_data = json.load(json_data)

In [3]:
def get_word_freq(word):
    return word_frequency(word, 'en') * 1e6
get_word_freq('hello')

47.863009232263806

In [4]:
reading_data_hash = {}
i = 0
for item in reading_data:
    unique_word_list = list(map(lambda x: x.lower(), tokenizer.tokenize(item['text'])))
    word_freq_list = []
    for word in sorted(list(set(unique_word_list))):
        word_freq = word_frequency(word, 'en') * 1e6
        word_freq_list.append([word, word_freq])
    reading = {
        'text': item['text'],
        'flesch_kincaid': textstat.flesch_reading_ease(item['text']),
        'unique_word_count': len(set(unique_word_list)),
        'word_freq_list': word_freq_list
    }
    reading_data_hash[item['_id']] = reading
    i+=1
    if i>6:
        break

In [5]:
print(len(reading_data_hash))

7


In [6]:
reading_data_dfs = []
for key, item in reading_data_hash.items():
    reading_data_df = pd.DataFrame(item['word_freq_list'])
    reading_data_dfs.append(reading_data_df)

In [7]:
def generate_reading_data(_max, features_count):
    increment = math.ceil(_max/features_count)
    reading_label_row = []
    i = 0
    while i < _max:
        reading_label_row.append('reading-' + str(i) + '-' + str(i+increment))
        i += increment
    reading_data_rows = []

    for key, item in reading_data_hash.items():
        base_row_data = [0 for _ in range(0, _max, increment)]
        for item in item['word_freq_list']:
            for index, _range in enumerate(reading_label_row):
                low = int(_range.split('-')[1])
                high = int(_range.split('-')[2])
                if (item[1] <= high and item[1] > low):
                    base_row_data[index] += 1
        reading_data_rows.append(base_row_data)
    return reading_label_row, reading_data_rows

In [8]:
_max = 56235
features_count = 100
b, a = generate_reading_data(_max, features_count)
pd.DataFrame(a).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,12,5,2,2,2,3,0,2,0,1,...,0,0,0,0,0,0,0,0,0,1
1,32,8,6,3,5,4,2,0,0,2,...,0,0,0,0,0,0,0,0,0,1
2,81,14,8,5,6,3,3,2,0,2,...,0,0,0,0,0,0,0,0,0,1
3,112,8,8,4,7,3,3,1,1,3,...,0,0,0,0,0,0,0,0,0,1
4,115,15,8,6,6,5,4,2,2,2,...,0,0,0,0,0,0,0,0,0,1


In [9]:
def generate_user_data(_max, features_count):
    with open('data-usable-1.json') as json_data:
        user_data = json.load(json_data)
    increment = math.ceil(_max/features_count)
    user_label_row = []
    i = 0
    while i < _max:
        user_label_row.append('user-' + str(i) + '-' + str(i+increment))
        i += increment
    user_data_rows = []
    i = 0
    for user in user_data['users']:
        user_id = i
        i = i + 1
        j = 0
        base_row_data = [0 for _ in range(0, _max, increment)]
        for performance in user['performances']:
            reading_id = j
            performance_id = performance['performance']['_id']
            knowns = list(map(lambda x: x['content'].lower(),  performance['knonws']))
            unknown_list = list(set(knowns))
            for item in unknown_list:
                for index, _range in enumerate(user_label_row):
                    low = int(_range.split('-')[1])
                    high = int(_range.split('-')[2])
                    word_freq = get_word_freq(item)
                    if ( word_freq <= high and word_freq > low):
                        base_row_data[index] += 1
            j = j + 1
        user_data_rows.append(base_row_data)
    return user_label_row, user_data_rows

In [10]:
_max = 56235
features_count = 100
b, a = generate_user_data(_max, features_count)
pd.DataFrame(a).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,31,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,42,2,3,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,16,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
def generate_final_data(_max, features_count):
    with open('data-usable-1.json') as json_data:
        user_data = json.load(json_data)
    user_label_row, user_data_rows = generate_user_data(_max, features_count)
    reading_label_row, reading_data_rows = generate_reading_data(_max, features_count)
    final_label_row = reading_label_row + user_label_row + ['relative_difficulty']
    final_data_rows = []
    i = 0 
    for user in user_data['users']:
        user_id = i
        i = i + 1
        j = 0
        for performance in user['performances']:
            reading_id = j
            performance_id = performance['performance']['_id']
            knowns = list(map(lambda x: x['content'].lower(),  performance['knonws']))
            unknown_count = len(list(set(knowns)))
            unique_word_count = reading_data_hash[performance['performance']['readingId']]['unique_word_count']
            difficulty = reading_data_hash[performance['performance']['readingId']]['flesch_kincaid']
            relative_difficulty = unknown_count/unique_word_count
            data_row = reading_data_rows[reading_id] + user_data_rows[user_id] + [relative_difficulty]
    #         data_row = [relative_difficulty, unknown_count, unique_word_count, total_counts[user_id], difficulty]
            final_data_rows.append(data_row)
            j = j + 1
    return final_label_row, final_data_rows

In [12]:
from sklearn import datasets
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

_max = 56235
features_count = 100
max_r2_train = 0
max_r2_test = 0
for features_count in range(5, 50):
    final_label_row, final_data_rows = generate_final_data(_max, features_count)
    data = pd.DataFrame(final_data_rows)
    data.columns = final_label_row
    data = data.loc[:, (data != 0).any(axis=0)]
    features = data.columns[:-1]
    target = 'relative_difficulty'

    reg = LinearRegression()
    for i in range(5,features_count) :
        rfe = RFE(reg, i)
        rfe = rfe.fit(data[features], data[target])
        filtered = data[features][data[features].columns[rfe.support_]].columns
        X_train, X_test, y_train, y_test = train_test_split(data[filtered], data[target], test_size=0.3, random_state=0)
        reg.fit(X_train, y_train)
        y_pred = reg.predict(X_train)
        r2_train = reg.score(X_train, y_train)
        if r2_train > max_r2_train:
            max_r2_train = r2_train
            print('train set max r2: ' + str(max_r2_train) + ' at ' + str(i) + ' features: ' + str(features_count))
        y_pred = reg.predict(X_test)
        r2_test = reg.score(X_test, y_test)
        if r2_test > max_r2_test:
            max_r2_train = r2_train
            print('test set max r2: ' + str(max_r2_test) + ' at ' + str(i) + ' features: ' + str(features_count))

train set max r2: 0.518842426931 at 5 features: 6
test set max r2: 0 at 5 features: 6
test set max r2: 0 at 5 features: 7
train set max r2: 0.518933877676 at 6 features: 7
test set max r2: 0 at 6 features: 7
test set max r2: 0 at 5 features: 8
train set max r2: 0.535761737008 at 6 features: 8
test set max r2: 0 at 6 features: 8
train set max r2: 0.535776174974 at 7 features: 8
test set max r2: 0 at 7 features: 8
test set max r2: 0 at 5 features: 9
train set max r2: 0.536417791805 at 6 features: 9
test set max r2: 0 at 6 features: 9
train set max r2: 0.536435881761 at 7 features: 9
test set max r2: 0 at 7 features: 9
train set max r2: 0.536581475756 at 8 features: 9
test set max r2: 0 at 8 features: 9
test set max r2: 0 at 5 features: 10
train set max r2: 0.0919933207208 at 6 features: 10
test set max r2: 0 at 6 features: 10
train set max r2: 0.536137000913 at 7 features: 10
test set max r2: 0 at 7 features: 10
train set max r2: 0.53628121314 at 8 features: 10
test set max r2: 0 at 8 fe

train set max r2: 0.540438251332 at 12 features: 20
test set max r2: 0 at 12 features: 20
train set max r2: 0.540448543639 at 13 features: 20
test set max r2: 0 at 13 features: 20
train set max r2: 0.540473564214 at 14 features: 20
test set max r2: 0 at 14 features: 20
test set max r2: 0 at 15 features: 20
train set max r2: 0.540473564214 at 16 features: 20
test set max r2: 0 at 16 features: 20
test set max r2: 0 at 17 features: 20
test set max r2: 0 at 18 features: 20
test set max r2: 0 at 19 features: 20
test set max r2: 0 at 5 features: 21
train set max r2: 0.098283361749 at 6 features: 21
test set max r2: 0 at 6 features: 21
train set max r2: 0.0987507099723 at 7 features: 21
test set max r2: 0 at 7 features: 21
test set max r2: 0 at 8 features: 21
train set max r2: 0.0988981416851 at 9 features: 21
test set max r2: 0 at 9 features: 21
train set max r2: 0.0997671624332 at 10 features: 21
test set max r2: 0 at 10 features: 21
test set max r2: 0 at 11 features: 21
train set max r2: 0

train set max r2: 0.540195011308 at 14 features: 27
test set max r2: 0 at 14 features: 27
train set max r2: 0.540204679142 at 15 features: 27
test set max r2: 0 at 15 features: 27
train set max r2: 0.540215692528 at 16 features: 27
test set max r2: 0 at 16 features: 27
test set max r2: 0 at 17 features: 27
test set max r2: 0 at 18 features: 27
test set max r2: 0 at 19 features: 27
test set max r2: 0 at 20 features: 27
test set max r2: 0 at 21 features: 27
test set max r2: 0 at 22 features: 27
test set max r2: 0 at 23 features: 27
test set max r2: 0 at 24 features: 27
test set max r2: 0 at 25 features: 27
test set max r2: 0 at 26 features: 27
test set max r2: 0 at 5 features: 28
test set max r2: 0 at 6 features: 28
train set max r2: 0.059791458678 at 7 features: 28
test set max r2: 0 at 7 features: 28
train set max r2: 0.127875586872 at 8 features: 28
test set max r2: 0 at 8 features: 28
train set max r2: 0.130899306669 at 9 features: 28
test set max r2: 0 at 9 features: 28
train set ma

test set max r2: 0 at 5 features: 33
train set max r2: 0.104691711475 at 6 features: 33
test set max r2: 0 at 6 features: 33
train set max r2: 0.107891676542 at 7 features: 33
test set max r2: 0 at 7 features: 33
test set max r2: 0 at 8 features: 33
train set max r2: 0.109416645036 at 9 features: 33
test set max r2: 0 at 9 features: 33
train set max r2: 0.182502182716 at 10 features: 33
test set max r2: 0 at 10 features: 33
train set max r2: 0.202831498794 at 11 features: 33
test set max r2: 0 at 11 features: 33
train set max r2: 0.2029738663 at 12 features: 33
test set max r2: 0 at 12 features: 33
train set max r2: 0.203326305448 at 13 features: 33
test set max r2: 0 at 13 features: 33
train set max r2: 0.203793233223 at 14 features: 33
test set max r2: 0 at 14 features: 33
train set max r2: 0.205701948246 at 15 features: 33
test set max r2: 0 at 15 features: 33
train set max r2: 0.540972777439 at 16 features: 33
test set max r2: 0 at 16 features: 33
test set max r2: 0 at 17 features:

test set max r2: 0 at 21 features: 37
test set max r2: 0 at 22 features: 37
test set max r2: 0 at 23 features: 37
test set max r2: 0 at 24 features: 37
test set max r2: 0 at 25 features: 37
test set max r2: 0 at 26 features: 37
test set max r2: 0 at 27 features: 37
test set max r2: 0 at 28 features: 37
test set max r2: 0 at 29 features: 37
test set max r2: 0 at 30 features: 37
test set max r2: 0 at 31 features: 37
test set max r2: 0 at 32 features: 37
test set max r2: 0 at 33 features: 37
test set max r2: 0 at 34 features: 37
test set max r2: 0 at 35 features: 37
test set max r2: 0 at 36 features: 37
test set max r2: 0 at 5 features: 38
test set max r2: 0 at 6 features: 38
train set max r2: 0.218063766022 at 7 features: 38
test set max r2: 0 at 7 features: 38
train set max r2: 0.218177564796 at 8 features: 38
test set max r2: 0 at 8 features: 38
train set max r2: 0.218598582795 at 9 features: 38
test set max r2: 0 at 9 features: 38
train set max r2: 0.218616589044 at 10 features: 38
te

test set max r2: 0 at 5 features: 42
test set max r2: 0 at 6 features: 42
train set max r2: 0.162772881727 at 7 features: 42
test set max r2: 0 at 7 features: 42
train set max r2: 0.163322593822 at 8 features: 42
test set max r2: 0 at 8 features: 42
test set max r2: 0 at 9 features: 42
train set max r2: 0.220001419269 at 10 features: 42
test set max r2: 0 at 10 features: 42
train set max r2: 0.220001422277 at 11 features: 42
test set max r2: 0 at 11 features: 42
train set max r2: 0.223298703313 at 12 features: 42
test set max r2: 0 at 12 features: 42
test set max r2: 0 at 13 features: 42
train set max r2: 0.22359907366 at 14 features: 42
test set max r2: 0 at 14 features: 42
train set max r2: 0.224346743398 at 15 features: 42
test set max r2: 0 at 15 features: 42
train set max r2: 0.540790845491 at 16 features: 42
test set max r2: 0 at 16 features: 42
train set max r2: 0.540795190725 at 17 features: 42
test set max r2: 0 at 17 features: 42
train set max r2: 0.540821197303 at 18 feature

test set max r2: 0 at 43 features: 45
test set max r2: 0 at 44 features: 45
test set max r2: 0 at 5 features: 46
train set max r2: 0.153276792577 at 6 features: 46
test set max r2: 0 at 6 features: 46
train set max r2: 0.239923673918 at 7 features: 46
test set max r2: 0 at 7 features: 46
test set max r2: 0 at 8 features: 46
train set max r2: 0.240964745683 at 9 features: 46
test set max r2: 0 at 9 features: 46
train set max r2: 0.242475353785 at 10 features: 46
test set max r2: 0 at 10 features: 46
train set max r2: 0.243323715135 at 11 features: 46
test set max r2: 0 at 11 features: 46
train set max r2: 0.244910048757 at 12 features: 46
test set max r2: 0 at 12 features: 46
train set max r2: 0.245402112764 at 13 features: 46
test set max r2: 0 at 13 features: 46
train set max r2: 0.249193105037 at 14 features: 46
test set max r2: 0 at 14 features: 46
train set max r2: 0.249734218866 at 15 features: 46
test set max r2: 0 at 15 features: 46
test set max r2: 0 at 16 features: 46
train se

train set max r2: 0.270254503383 at 17 features: 49
test set max r2: 0 at 17 features: 49
test set max r2: 0 at 18 features: 49
train set max r2: 0.541594866848 at 19 features: 49
test set max r2: 0 at 19 features: 49
train set max r2: 0.541968445632 at 20 features: 49
test set max r2: 0 at 20 features: 49
test set max r2: 0 at 21 features: 49
test set max r2: 0 at 22 features: 49
train set max r2: 0.541968445632 at 23 features: 49
test set max r2: 0 at 23 features: 49
test set max r2: 0 at 24 features: 49
test set max r2: 0 at 25 features: 49
test set max r2: 0 at 26 features: 49
test set max r2: 0 at 27 features: 49
test set max r2: 0 at 28 features: 49
test set max r2: 0 at 29 features: 49
test set max r2: 0 at 30 features: 49
test set max r2: 0 at 31 features: 49
test set max r2: 0 at 32 features: 49
test set max r2: 0 at 33 features: 49
test set max r2: 0 at 34 features: 49
test set max r2: 0 at 35 features: 49
test set max r2: 0 at 36 features: 49
test set max r2: 0 at 37 feature

In [13]:
final_label_row, final_data_rows = generate_final_data(_max, 49)
data = pd.DataFrame(final_data_rows)
data.columns = final_label_row
data = data.loc[:, (data != 0).any(axis=0)]
features = data.columns[:-1]
target = 'relative_difficulty'
rfe = RFE(reg, 20)
rfe = rfe.fit(data[features], data[target])
filtered = data[features][data[features].columns[rfe.support_]].columns
print(filtered)

Index(['reading-1148-2296', 'reading-2296-3444', 'reading-4592-5740',
       'reading-5740-6888', 'reading-10332-11480', 'reading-12628-13776',
       'reading-26404-27552', 'user-0-1148', 'user-1148-2296',
       'user-2296-3444', 'user-3444-4592', 'user-5740-6888', 'user-6888-8036',
       'user-9184-10332', 'user-10332-11480', 'user-11480-12628',
       'user-22960-24108', 'user-25256-26404', 'user-26404-27552',
       'user-55104-56252'],
      dtype='object')


In [14]:
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from math import sqrt
from sklearn.metrics import mean_squared_error
kfold = model_selection.KFold(n_splits=10, random_state=7)
modelCV = LinearRegression()
scoring = 'neg_mean_squared_error'
results = model_selection.cross_val_score(modelCV, data[filtered], data[target], cv=kfold, scoring=scoring)
rmse = sqrt(abs(results.mean()))
print("10-fold cross validation average accuracy: %.3f" % (rmse))

10-fold cross validation average accuracy: 0.014


In [187]:
X_train, X_test, y_train, y_test = train_test_split(data[filtered], data[target], test_size=0.25)
reg = LinearRegression()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
y_truth = y_test.values
rmse = sqrt(mean_squared_error(y_truth, y_pred))
print(rmse)

0.01204245610578532


In [176]:
def percentCorrect(prediction, ground_truth, bound_percent):
    in_bound = 0
    total_non_zero = 0
    for index, x in enumerate(prediction):
        if ground_truth[index] != 0:
            if (x <= ground_truth[index] * (1 + bound_percent) and x >= ground_truth[index] * (1 - bound_percent)):
                in_bound += 1
            total_non_zero += 1
    return in_bound/total_non_zero

def percentCorrectNumeric(prediction, ground_truth, bound):
    in_bound = 0
    for index, x in enumerate(prediction):
        if (x <= prediction[index] + bound and x >= ground_truth[index] - bound):
            in_bound = in_bound + 1
    return in_bound/len(prediction)

In [190]:
interval10 = "{0:.2f}%".format(percentCorrect(y_pred, y_truth, 0.5) * 100)
print(interval10)

67.16%


In [None]:
from sklearn.preprocessing import PolynomialFeatures
polynomial_features = PolynomialFeatures(degree=degrees[2], include_bias=False)