In [35]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score
from surprise import SVD
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import KNNBaseline
from surprise import NormalPredictor
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import train_test_split

# Recommender algorithms compared in the prediction cells below. Each instance
# is created with its default hyper-parameters and is re-fitted by every cell
# that uses it.
algos = [
    SVD(),
    SVDpp(),
    KNNBasic(),
    KNNWithMeans(),
    KNNWithZScore(),
    KNNBaseline(),
    NormalPredictor(),
    BaselineOnly(),
    SlopeOne(),
]

# Display names, derived from the instances so the two lists cannot drift apart
# when an algorithm is added, removed or reordered.
algo_names = [type(algo).__name__ for algo in algos]

### Predictions: is reading 6 harder than reading 5?

In [39]:
# Fit every recommender on the precomputed relative-difficulty ratings and check,
# for each test user, whether it ranks the second reading of READING_PAIR as
# harder than the first, compared against the labels in z_truth.csv.
SEED = 0                         # fixed so the shuffle and the model fits are repeatable
TEST_USER_IDS = range(239, 249)  # users whose reading pair is evaluated
READING_PAIR = (5, 6)            # (first reading, reading tested as "harder")

# Algorithms created without an explicit random_state draw from NumPy's global
# RNG, so seed it once before any fit.
np.random.seed(SEED)

rec_df = pd.read_csv('z_rec_pred.csv')
rec_df = rec_df.sample(frac=1, random_state=SEED).reset_index(drop=True)
truth_df = pd.read_csv('z_truth.csv')
# One label per test user, in the same order as TEST_USER_IDS.
truth_df = truth_df['reading_6_is_harder'].values

reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(rec_df[['user_id', 'reading_id', 'relative_difficulty']], reader)
# NOTE(review): the models are fitted on the full frame; confirm that the test
# users' ratings for READING_PAIR are not part of it, otherwise the evaluation leaks.
trainset = data.build_full_trainset()  # built once and shared by every algorithm

for index, algo in enumerate(algos):
    algo.fit(trainset)
    print(algo_names[index])
    right = 0
    for offset, uid in enumerate(TEST_USER_IDS):
        truth = truth_df[offset]
        # r_ui is only stored on the returned prediction; the estimate is what matters here.
        est_first, est_second = (algo.predict(uid, iid, r_ui=0).est for iid in READING_PAIR)
        predicted = est_first < est_second
        print(predicted, truth)
        if predicted == truth:
            right += 1

    # Printed as a fraction of correctly ordered pairs (label kept for output compatibility).
    print('percentage: ', right / len(TEST_USER_IDS))

SVD
True True
True False
True True
False True
True False
False True
True True
True True
True True
True True
percentage:  0.6
SVDpp
False True
False False
False True
False True
False False
True True
False True
False True
False True
False True
percentage:  0.3
Computing the msd similarity matrix...
Done computing similarity matrix.
KNNBasic
True True
True False
True True
True True
True False
True True
True True
True True
True True
True True
percentage:  0.8
Computing the msd similarity matrix...
Done computing similarity matrix.
KNNWithMeans
False True
False False
False True
False True
False False
False True
False True
False True
False True
False True
percentage:  0.2
Computing the msd similarity matrix...
Done computing similarity matrix.
KNNWithZScore
False True
False False
False True
False True
False False
False True
False True
False True
False True
False True
percentage:  0.2
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
KNNBa

In [26]:
import json
import csv
from collections import Counter 
from textstat.textstat import textstat
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from wordfreq import word_frequency
def get_word_freq(word):
    """Return the English frequency of `word` from wordfreq, multiplied by 1e4."""
    scale = 1e4
    return word_frequency(word, 'en') * scale
get_word_freq('hello')

0.47863009232263803

In [28]:
# Build a per-reading index keyed by the reading's '_id': the raw text, a
# readability score, the number of distinct lower-cased words and the frequency
# of each of those words.
MAX_READINGS = 7  # only the first seven readings of the file are indexed

tokenizer = RegexpTokenizer(r'\w+')
with open('z_reading_data.json') as json_data:
    reading_data = json.load(json_data)

reading_data_hash = {}
for position, item in enumerate(reading_data):
    if position >= MAX_READINGS:
        break
    text = item['text']
    # Distinct lower-cased tokens, sorted so word_freq_list has a stable order.
    unique_words = sorted({token.lower() for token in tokenizer.tokenize(text)})
    reading_data_hash[item['_id']] = {
        'text': text,
        # NOTE: despite the key name, the stored value is the Flesch reading-ease
        # score; the key is kept unchanged for existing consumers.
        'flesch_kincaid': textstat.flesch_reading_ease(text),
        'unique_word_count': len(unique_words),
        'word_freq_list': [[word, get_word_freq(word)] for word in unique_words],
    }

# One (word, frequency) DataFrame per indexed reading.
reading_data_dfs = [
    pd.DataFrame(reading['word_freq_list'])
    for reading in reading_data_hash.values()
]

with open('z_user_data.json') as json_data:
    user_data = json.load(json_data)

# Column names for the rows collected below.
label_row = [ 'user_id', 'reading_id', 'relative_difficulty', 'weighted_r_d', 'unknown_count', 'unique_word_count' ]
data_rows = []
for user_id, user in enumerate(user_data['users']):
    # NOTE(review): reading_id is the position of the performance within this
    # user's list, not the reading's own id; this presumably relies on every
    # user listing performances in the same reading order -- confirm.
    for reading_id, performance in enumerate(user['performances']):
        # NOTE(review): the key is spelled 'knonws' in the data and its words are
        # counted as *unknown* words below -- confirm that this is the intended field.
        unknown_list = list(set(performance['knonws']))
        unknown_count = len(unknown_list)
        unique_word_count = reading_data_hash[performance['performance']['readingId']]['unique_word_count']
        relative_difficulty = unknown_count/unique_word_count
        # Same ratio, but each unknown word contributes its scaled corpus
        # frequency instead of 1.
        weighted_unknown_count = sum(get_word_freq(word) for word in unknown_list)
        weighted_relative_difficulty = weighted_unknown_count/unique_word_count
        data_rows.append([
            user_id,
            reading_id,
            relative_difficulty,
            weighted_relative_difficulty,
            unknown_count,
            unique_word_count,
        ])

In [29]:
# Sanity check: number of readings stored in reading_data_hash.
print(len(reading_data_hash))

7


In [40]:
# Same pairwise evaluation as for the precomputed ratings, but trained on the
# frequency-weighted relative difficulty ('weighted_r_d') computed in this notebook.
SEED = 0                         # fixed so the shuffle and the model fits are repeatable
TEST_USER_IDS = range(239, 249)  # users whose reading pair is evaluated
READING_PAIR = (5, 6)            # (first reading, reading tested as "harder")

# Algorithms created without an explicit random_state draw from NumPy's global
# RNG, so seed it once before any fit.
np.random.seed(SEED)

rec_df = pd.DataFrame(data_rows, columns=label_row)
rec_df = rec_df.sample(frac=1, random_state=SEED).reset_index(drop=True)
truth_df = pd.read_csv('z_truth.csv')
# One label per test user, in the same order as TEST_USER_IDS.
truth_df = truth_df['reading_6_is_harder'].values

# NOTE(review): weighted_r_d is a sum of scaled word frequencies divided by a
# word count, so it is not obviously bounded by 1 -- confirm the (0, 1) scale fits.
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(rec_df[['user_id', 'reading_id', 'weighted_r_d']], reader)
# NOTE(review): the models are fitted on the full frame; confirm that the test
# users' ratings for READING_PAIR are not part of it, otherwise the evaluation leaks.
trainset = data.build_full_trainset()  # built once and shared by every algorithm

for index, algo in enumerate(algos):
    algo.fit(trainset)
    print(algo_names[index])
    right = 0
    for offset, uid in enumerate(TEST_USER_IDS):
        truth = truth_df[offset]
        # r_ui is only stored on the returned prediction; the estimate is what matters here.
        est_first, est_second = (algo.predict(uid, iid, r_ui=0).est for iid in READING_PAIR)
        predicted = est_first < est_second
        print(predicted, truth)
        if predicted == truth:
            right += 1

    # Printed as a fraction of correctly ordered pairs (label kept for output compatibility).
    print('percentage: ', right / len(TEST_USER_IDS))

SVD
True True
False False
False True
False True
False False
True True
True True
True True
False True
True True
percentage:  0.7
SVDpp
False True
False False
False True
False True
False False
False True
True True
False True
False True
False True
percentage:  0.3
Computing the msd similarity matrix...
Done computing similarity matrix.
KNNBasic
True True
True False
True True
True True
False False
True True
True True
True True
True True
True True
percentage:  0.9
Computing the msd similarity matrix...
Done computing similarity matrix.
KNNWithMeans
True True
True False
True True
True True
False False
True True
True True
True True
True True
True True
percentage:  0.9
Computing the msd similarity matrix...
Done computing similarity matrix.
KNNWithZScore
False True
False False
False True
False True
False False
False True
False True
False True
False True
False True
percentage:  0.2
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
KNNBaselin