### Load Libraries

In [18]:
import json
import csv
from collections import Counter 
from textstat.textstat import textstat
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Reading-wise Relative Difficulty

In [20]:
# Build `reading_data`: a lookup table keyed by each reading's '_id' that
# stores the raw text, a textstat readability score and the number of
# distinct (case-insensitive) words in the text.
tokenizer = RegexpTokenizer(r'\w+')  # tokens are the matches of \w+

with open('reading-1.json') as json_data:
    data = json.load(json_data)

# Column names for one reading record; not referenced again in this block.
label_row = ['reading_id', 'text', 'flesch_kincaid', 'unique_word_count']

# NOTE(review): the field is called 'flesch_kincaid' but is filled with
# textstat.flesch_reading_ease -- confirm which metric is intended.
reading_data = {
    entry['_id']: {
        'text': entry['text'],
        'flesch_kincaid': textstat.flesch_reading_ease(entry['text']),
        # Lowercase before deduplicating so "The" and "the" count once.
        'unique_word_count': len(
            {token.lower() for token in tokenizer.tokenize(entry['text'])}
        ),
    }
    for entry in data
}
# Bare expression: only displays the table if it ends a notebook cell.
reading_data

# Export one CSV row per (user, performance) pair giving the fraction of the
# reading's distinct words that appear under the performance's 'knonws' key.
#
# Requires `reading_data` (reading '_id' -> {'unique_word_count': ...}) to
# have been built beforehand.
with open('data-usable-1.json') as json_data:
    data = json.load(json_data)

label_row = [ 'user_id', 'reading_id', 'performance_id', 'relative_difficulty', 'unknown_count', 'unique_word_count' ]
data_rows = [label_row]
# user_id is the 0-based position of the user in data['users'].
for user_id, user in enumerate(data['users']):
    # reading_id is the 0-based position of the performance in this user's
    # list; it is NOT the reading's '_id'.
    for reading_id, performance in enumerate(user['performances']):
        performance_id = performance['performance']['_id']
        # 'knonws' is the key exactly as spelled in the data file. Each entry
        # is counted once, case-insensitively, instead of counting raw
        # entries (hence "no-dup" in the output file name).
        unknown_count = len(
            {entry['content'].lower() for entry in performance['knonws']}
        )
        unique_word_count = reading_data[performance['performance']['readingId']]['unique_word_count']
        # A reading with no tokens has no defined ratio; emit NaN rather than
        # aborting the whole export with ZeroDivisionError.
        if unique_word_count:
            relative_difficulty = unknown_count / unique_word_count
        else:
            relative_difficulty = float('nan')
        data_rows.append([user_id, reading_id, performance_id, relative_difficulty, unknown_count, unique_word_count])

# newline="" lets the csv module control line endings itself; without it
# every row is followed by a blank line on Windows.
with open("rec-with-percentage-no-dup.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(data_rows)


### Sentence-wise Relative Difficulty

In [6]:
# Export one CSV row per (user, sentence) pair giving the fraction of the
# sentence's distinct words that the user has under any 'knonws' entry.
#
# Words are compared case-insensitively and counted once each, the same way
# the reading-wise export computes its counts, so 'unique_word_count' really
# is a distinct-word count and the ratio cannot exceed 1.
sent_df = pd.read_csv('sentences.csv')

with open('data-usable-1.json') as json_data:
    data = json.load(json_data)

# Tokenize every sentence once up front: the result does not depend on the
# user, so there is no reason to redo it inside the per-user loop.
tokenizer = RegexpTokenizer(r'\w+')
sentence_words = [
    (row['sentence_id'], {token.lower() for token in tokenizer.tokenize(row['text'])})
    for _, row in sent_df.iterrows()
]

label_row = [ 'user_id', 'sentence_id', 'unknown_count', 'relative_difficulty', 'unique_word_count' ]
data_rows = [label_row]
# The user's 0-based position in data['users'] is written as the user id.
for user_index, user in enumerate(data['users']):
    username = user_index
    # Pool the words from all of this user's performances ('knonws' is the
    # key exactly as spelled in the data file).
    knowns = {
        known['content'].lower()
        for performance in user['performances']
        for known in performance['knonws']
    }

    for sentence_id, words in sentence_words:
        unique_word_count = len(words)
        unknown_count = len(knowns & words)
        # A sentence with no tokens has no defined ratio; emit NaN rather
        # than aborting the whole export with ZeroDivisionError.
        if unique_word_count:
            relative_difficulty = unknown_count / unique_word_count
        else:
            relative_difficulty = float('nan')
        data_rows.append([username, sentence_id, unknown_count, relative_difficulty, unique_word_count])

# newline="" lets the csv module control line endings itself; without it
# every row is followed by a blank line on Windows.
with open("sentences-user-actions.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(data_rows)


### Process reading.json

In [None]:
# Collect, for every entry in reading.json, its '_id', its 'lexileDifficulty'
# field and a textstat readability score computed from its text.
with open('reading.json') as json_data:
    data = json.load(json_data)

label_row = ['reading_id', 'lexile', 'flesch_kincaid']
# NOTE(review): the column is called 'flesch_kincaid' but is filled with
# textstat.flesch_reading_ease -- confirm which metric is intended.
data_rows = [label_row] + [
    [entry['_id'], entry['lexileDifficulty'], textstat.flesch_reading_ease(entry['text'])]
    for entry in data
]

# The CSV export is currently disabled; the rows are only built in memory.
# with open("reading.csv", "w") as f:
#     writer = csv.writer(f)
#     writer.writerows(data_rows)


### process user actions

In [None]:
# Export one CSV row per (user, performance) pair: the username, the id of
# the reading, the performance id and the number of 'knonws' entries.
with open('data-usable.json') as json_data:
    data = json.load(json_data)

label_row = [ 'username', 'reading_id', 'performance_id', 'unknown_count' ]
data_rows = [label_row]
for user in data['users']:
    username = user['user']['username']
    data_rows.extend(
        [
            username,
            performance['performance']['readingId'],
            performance['performance']['_id'],
            # Raw entry count: duplicates are not collapsed here. 'knonws'
            # is the key exactly as spelled in the data file.
            len(performance['knonws']),
        ]
        for performance in user['performances']
    )

# newline="" lets the csv module control line endings itself; without it
# every row is followed by a blank line on Windows.
with open("user-actions.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(data_rows)


### For recommendation model

In [None]:
# Build rows of (user_id, reading_id, performance_id, unknown_count) in which
# both ids are 1-based positions: the user's position in data['users'] and
# the performance's position in that user's list.
with open('data-usable.json') as json_data:
    data = json.load(json_data)

label_row = ['user_id', 'reading_id', 'performance_id', 'unknown_count']
data_rows = [label_row]
for user_id, user in enumerate(data['users'], start=1):
    for reading_id, performance in enumerate(user['performances'], start=1):
        data_rows.append([
            user_id,
            reading_id,
            performance['performance']['_id'],
            # Raw number of entries under 'knonws' (key spelled as in the data).
            len(performance['knonws']),
        ])
# Bare expression: displays the rows when this ends a notebook cell.
data_rows

# The CSV export is currently disabled.
# with open("recommandations.csv", "w") as f:
#     writer = csv.writer(f)
#     writer.writerows(data_rows)
