# Import packages

In [1]:
import pickle
import re
import warnings
from collections import defaultdict

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
import textacy
from spacy.attrs import ORTH

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


# Read in raw data

In [2]:
# Location of the raw essay spreadsheet, relative to the working directory.
raw_data_path = "long_answer/training_set_rel3.xls"
raw_data = pd.read_excel(raw_data_path)

# Load spacy model

In [3]:
# The named-entity recogniser is switched off when loading the model.
disabled_pipes = ['ner']
nlp = spacy.load('en_core_web_lg', disable=disabled_pipes)

# Apply spacy model to each essay

In [19]:
def _parse_lowercased(essay):
    """Run the spaCy pipeline on the lower-cased text of one essay."""
    return nlp(essay.lower())


raw_data['doc'] = raw_data['essay'].apply(_parse_lowercased)

# Define functions to extract features

In [20]:
def tree_height(root):
    """Return the height of the tree rooted at `root`.

    A node without children has height 1; otherwise the height is one more
    than the height of its tallest child subtree. Children are read from
    ``root.children``.
    """
    child_heights = [tree_height(child) for child in root.children]
    if child_heights:
        return 1 + max(child_heights)
    return 1
    
def get_average_heights(paragraph):
    """Return the mean tree height over the sentences of `paragraph`.

    `paragraph` may be a plain string (it is then parsed with `nlp`) or an
    already-parsed document exposing ``.sents``. The per-sentence heights are
    the ones produced by `get_tree_heights`; their `np.mean` is returned.
    """
    heights = get_tree_heights(paragraph)
    return np.mean(heights)

def get_variance_heights(paragraph):
    """Return the spread of tree heights over the sentences of `paragraph`.

    `paragraph` may be a plain string (it is then parsed with `nlp`) or an
    already-parsed document exposing ``.sents``.

    Note: despite the name, the value returned is `np.std` of the heights,
    i.e. the standard deviation rather than the variance.
    """
    heights = get_tree_heights(paragraph)
    return np.std(heights)

def get_tree_heights(paragraph):
    """Return a list with the tree height of every sentence in `paragraph`.

    A plain string is parsed with `nlp` first; anything else is used as-is
    and must expose ``.sents``, each sentence providing a ``.root``.
    """
    doc = nlp(paragraph) if type(paragraph) == str else paragraph
    return [tree_height(sentence.root) for sentence in doc.sents]

def get_sentences(doc):
    """Collect the items of ``doc.sents`` into a list and return it."""
    return [sentence for sentence in doc.sents]

def get_sentence_count(sentences):
    """Return the number of items in `sentences` as a float."""
    n_sentences = len(sentences)
    return float(n_sentences)

def get_word_counts(doc):
    """Return the result of ``doc.count_by(ORTH)`` for the given document."""
    counts_by_orth = doc.count_by(ORTH)
    return counts_by_orth

# Connective words and phrases counted by get_connectives.
_CONNECTIVES = (
    'after',
    'earlier',
    'before',
    'during',
    'while',
    'later',
    'because',
    'consequently',
    'thus',
    'both',
    'additionally',
    'furthermore',
    'moreover',
    'actually',
    'as a result',
    'due to',
    'but',
    'yet',
    'however',
    'although',
    'nevertheless',
)

# One alternation anchored on word boundaries, so that e.g. 'but' is not
# counted inside 'button' or 'attribute'. Longer entries come first so a
# phrase is preferred over any shorter alternative starting at the same spot.
_CONNECTIVE_PATTERN = re.compile(
    r'\b(?:'
    + '|'.join(re.escape(item) for item in sorted(_CONNECTIVES, key=len, reverse=True))
    + r')\b'
)


def get_connectives(doc):
    """Return the number of connectives in `doc` per token.

    The lower-cased ``doc.text`` is searched for whole-word occurrences of
    the entries in `_CONNECTIVES`; the number of matches is divided by
    ``len(doc)``.

    Returns:
        float: matches / len(doc), or 0.0 when the document is empty
        (instead of raising ZeroDivisionError).
    """
    n_tokens = len(doc)
    if n_tokens == 0:
        return 0.0
    text = doc.text.lower()
    total = len(_CONNECTIVE_PATTERN.findall(text))
    return float(total / n_tokens)

def get_pos(doc):
    """Return the ``pos_`` attribute of every token in `doc`, in order."""
    tags = []
    for token in doc:
        tags.append(token.pos_)
    return tags


def get_posngrams(poslist, n):
    """Return every run of `n` consecutive items of `poslist` as a tuple.

    The result is a list of tuples in order of their start position; it is
    empty when `poslist` has fewer than `n` items.
    """
    n_windows = len(poslist) - n + 1
    return [
        tuple(poslist[start + offset] for offset in range(n))
        for start in range(n_windows)
    ]

def get_posgrams_counts(list_grams):
    """Count how often each item occurs in `list_grams`.

    Returns a ``defaultdict(int)`` mapping each distinct item to its number
    of occurrences.
    """
    tally = defaultdict(int)
    for ngram in list_grams:
        tally[ngram] = tally[ngram] + 1
    return tally

def get_TF(list_dicts):
    """Sum per-key counts across an iterable of count dictionaries.

    Returns a ``defaultdict(int)`` in which every key maps to the total of
    its values over all dictionaries in `list_dicts`.
    """
    totals = defaultdict(int)
    for counts in list_dicts:
        for gram, count in counts.items():
            totals[gram] += count
    return totals

def get_mean_tfTF(posgram_counts, TF):
    """Return the mean of count / TF[key] over the entries of `posgram_counts`.

    `TF` is indexed with every key of `posgram_counts`. The mean is taken
    with `np.mean`.
    """
    ratios = [count / TF[gram] for gram, count in posgram_counts.items()]
    return np.mean(ratios)

def get_posngram_ratio(posngrams):
    """Return the share of distinct items in `posngrams`.

    The result is ``len(set(posngrams)) / len(posngrams)`` as a float, or the
    integer 0 when `posngrams` is empty.
    """
    total = len(posngrams)
    if total == 0:
        return 0
    distinct = len(set(posngrams))
    return float(distinct / total)

def get_reading_scores(doc):
    """Return textacy's readability statistics for `doc` without the SMOG index.

    The mapping comes from ``textacy.TextStats(doc).readability_stats``; its
    'smog_index' entry is deleted before the mapping is returned.
    """
    stats = textacy.TextStats(doc)
    scores = stats.readability_stats
    del scores['smog_index']
    return scores

def get_word_lengths(doc):
    """Return the length of every token in `doc` whose ``is_alpha`` is true.

    Lengths are ``len(token)`` converted to float, in document order.
    """
    return [float(len(token)) for token in doc if token.is_alpha]

def get_words_of_length(lengths, n, p):
    """Count the entries of `lengths` that fall in the half-open range [n, p).

    The lower bound is inclusive so that adjacent bins such as (4, 6) and
    (6, 8) partition the lengths without gaps: with both bounds exclusive,
    values equal to a bin edge (4, 6, 8, ...) were never counted by any bin.

    Args:
        lengths: iterable of numeric word lengths.
        n: inclusive lower bound.
        p: exclusive upper bound.

    Returns:
        float: number of entries with ``n <= length < p``.
    """
    return float(sum(1 for length in lengths if n <= length < p))

def get_similarity_scores(doc):
    """Return the mean similarity between each pair of adjacent sentences.

    For every sentence after the first, ``previous.similarity(current)`` is
    computed; the `np.mean` of those scores is returned. With fewer than two
    sentences the list of scores is empty and `np.mean` is applied to it
    as-is.
    """
    sentences = list(doc.sents)
    pair_scores = [
        previous.similarity(current)
        for previous, current in zip(sentences, sentences[1:])
    ]
    return np.mean(pair_scores)

def nth_root(x, n):
    """Return `x` raised to the power ``1 / n`` (the n-th root of `x`)."""
    exponent = 1 / float(n)
    return x ** exponent

def get_yules_k(word_counts):
    """Return Yule's K computed from a mapping of item -> frequency.

    With ``m1 = sum(freq)`` and ``m2 = sum(freq ** 2)`` the value is
    ``10000 * (m2 - m1) / m1 ** 2``.

    When ``m1 == m2`` (every item occurs exactly once, or the mapping is
    empty) the result is 0.0; this guard also keeps the division safe.

    Args:
        word_counts: mapping whose values are occurrence counts.

    Returns:
        float: Yule's K.
    """
    frequencies = list(word_counts.values())
    m1 = sum(frequencies)
    m2 = sum(freq ** 2 for freq in frequencies)
    if m1 == m2:
        return 0.0
    return float(10000 * (m2 - m1) / (m1 * m1))

# Feature preengineering

In [21]:
# Preengineering: intermediate columns that the feature cells below build on.
raw_data['sentences'] = raw_data.doc.apply(get_sentences)
raw_data['word_counts'] = raw_data.doc.apply(get_word_counts)
raw_data['word_lengths'] = raw_data.doc.apply(get_word_lengths)
raw_data['pos'] = raw_data.doc.apply(get_pos)

# POS n-grams per essay, their per-essay counts, and the counts summed over all essays.
raw_data['pos_trigrams'] = raw_data.pos.apply(get_posngrams, n=3)
raw_data['pos_fourgrams'] = raw_data.pos.apply(get_posngrams, n=4)
raw_data['pos_trigram_counts'] = raw_data.pos_trigrams.apply(get_posgrams_counts)
raw_data['pos_fourgram_counts'] = raw_data.pos_fourgrams.apply(get_posgrams_counts)
tri_pos_TF = get_TF(raw_data.pos_trigram_counts)
four_pos_TF = get_TF(raw_data.pos_fourgram_counts)

raw_data['tree_heights'] = raw_data.doc.apply(get_tree_heights)

# Silence warnings only while the readability scores are computed, so that
# warnings raised by later cells stay visible.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    raw_data['reading_scores'] = raw_data.doc.apply(get_reading_scores)

# Feature engineering

In [22]:
# Lexical Features
# (column, lower, upper): each column holds get_words_of_length(lengths, lower, upper).
word_length_bins = [
    ('words_length_4', 4, 6),
    ('words_length_6', 6, 8),
    ('words_length_8', 8, 10),
    ('words_length_10', 10, 12),
    ('words_length_12', 12, 100),
]
for column, lower, upper in word_length_bins:
    # Bounds are bound as defaults so each lambda keeps its own pair.
    raw_data[column] = raw_data.word_lengths.apply(
        lambda lengths, lo=lower, hi=upper: get_words_of_length(lengths, lo, hi)
    )
raw_data['mean_word_length'] = raw_data.word_lengths.apply(np.mean)
# Named "variance" but computed with np.std (standard deviation).
raw_data['variance_word_length'] = raw_data.word_lengths.apply(np.std)


# Length Features
# essay_length is len(doc); num_words counts only tokens with is_alpha set.
raw_data['essay_length'] = raw_data['doc'].apply(len)
raw_data['num_words'] = raw_data['doc'].apply(
    lambda doc: float(sum(1 for word in doc if word.is_alpha))
)
raw_data['num_sentences'] = raw_data['sentences'].apply(get_sentence_count)
raw_data['mean_sentence_length'] = raw_data['num_words'] / raw_data['num_sentences']
raw_data['num_characters'] = raw_data['essay'].apply(len)
raw_data['fourth_root_num_characters'] = raw_data['num_characters'].apply(nth_root, n=4)

# # Occurrence Features
# (column, character): each column holds how often the character occurs in the raw essay text.
punctuation_columns = [
    ('num_commas', ','),
    ('num_periods', '.'),
    ('num_exclaim', '!'),
    ('num_question', '?'),
    ('num_semicolon', ';'),
    ('num_colon', ':'),
]
for column, mark in punctuation_columns:
    # `mark` is bound as a default so each lambda keeps its own character.
    raw_data[column] = raw_data.essay.apply(
        lambda essay, mark=mark: float(essay.count(mark))
    )

# # Style Features
# FIX raw_data['vocabulary'] = raw_data.word_tokens.apply(lambda word_tokens: set(word.lower() for word in word_tokens if word.isalpha()))
# vocab_size is the number of distinct keys in word_counts; the type/token
# ratio divides it by essay_length (len(doc)).
raw_data['vocab_size'] = raw_data['word_counts'].apply(len)
raw_data['type_token_ratio'] = raw_data['vocab_size'] / raw_data['essay_length']
raw_data['yules_k'] = raw_data['word_counts'].apply(get_yules_k)

# # Syntactical Features
# # the number for these lengths comes from Chen and He 2013
raw_data['sentence_lengths'] = raw_data.sentences.apply(lambda sentences: [len(sent) for sent in sentences])
# The four bins are contiguous and upper-inclusive, matching the `<= 10` of the
# first bin: (.., 10], (10, 18], (18, 25], (25, ..). Previously sentences of
# length exactly 18 or 25 fell into no bin at all.
raw_data['very_short_sentences'] = raw_data.sentence_lengths.apply(lambda sentence_lengths: float(sum([length <= 10 for length in sentence_lengths])))
raw_data['short_sentences'] = raw_data.sentence_lengths.apply(lambda sentence_lengths: float(sum([10 < length <= 18 for length in sentence_lengths])))
raw_data['medium_sentences'] = raw_data.sentence_lengths.apply(lambda sentence_lengths: float(sum([18 < length <= 25 for length in sentence_lengths])))
raw_data['long_sentences'] = raw_data.sentence_lengths.apply(lambda sentence_lengths: float(sum([length > 25 for length in sentence_lengths])))
# Named "variance" but computed with np.std (standard deviation).
raw_data['variance_sentence_length'] = raw_data.sentence_lengths.apply(lambda sentence_lengths: np.std(sentence_lengths))

# Summary statistics of the per-sentence tree heights.
raw_data['max_height'] = raw_data.tree_heights.apply(lambda heights: float(max(heights)))
raw_data['sum_heights'] = raw_data.tree_heights.apply(sum)
raw_data['mean_heights'] = raw_data.tree_heights.apply(np.mean)

# raw_data['mean_sentence_similarity'] = raw_data.doc.apply(get_similarity_scores)

# # POS Ngrams
# Ratio of distinct to total n-grams per essay, and the mean of each essay's
# n-gram count divided by the corpus-wide count for that n-gram.
raw_data['pos_trigram_ratio'] = raw_data['pos_trigrams'].apply(get_posngram_ratio)
raw_data['pos_fourgram_ratio'] = raw_data['pos_fourgrams'].apply(get_posngram_ratio)
raw_data['mean_trigram_tfTF'] = raw_data['pos_trigram_counts'].apply(get_mean_tfTF, TF=tri_pos_TF)
raw_data['mean_fourgram_tfTF'] = raw_data['pos_fourgram_counts'].apply(get_mean_tfTF, TF=four_pos_TF)

# # Cohesion Features
raw_data['connectives'] = raw_data['doc'].apply(get_connectives)

# Readability Features
# Each key of the per-essay reading_scores dict becomes a column of the same name.
readability_columns = [
    'flesch_kincaid_grade_level',
    'flesch_reading_ease',
    'gunning_fog_index',
    'coleman_liau_index',
    'automated_readability_index',
    'lix',
    'gulpease_index',
    'wiener_sachtextformel',
]
for score_name in readability_columns:
    # `key` is bound as a default so each lambda keeps its own score name.
    raw_data[score_name] = raw_data.reading_scores.apply(
        lambda score_dict, key=score_name: score_dict[key]
    )

# Assign correct score scale to each essay

In [23]:
# One divisor per essay set; domain1_score is divided by it to give `score`.
score_divisors = [12, 5, 3, 3, 4, 4, 25, 50]
essay_set_ids = [1, 2, 3, 4, 5, 6, 7, 8]
DivSeries = pd.DataFrame({'div': score_divisors, 'essay_set': essay_set_ids})

# Attach each essay's divisor by joining on essay_set, then rescale.
eng_data = pd.merge(raw_data, DivSeries, on='essay_set')
eng_data['score'] = eng_data['domain1_score'] / eng_data['div']

# Create and pickle training data dataframe

In [24]:
# Keep the columns from position 37 onward. The slice has to contain
# 'tree_heights', 'reading_scores', 'sentence_lengths' and 'div', since they are
# dropped by name right after.
# NOTE(review): the offset 37 depends on the column count of the raw
# spreadsheet plus the intermediate columns — confirm if either changes.
trainingdata = eng_data.iloc[:, 37:]
trainingdata = trainingdata.drop(['sentence_lengths', 'div', 'reading_scores', 'tree_heights'], axis=1)

# Take the identifiers from eng_data, which shares trainingdata's index.
# Assigning from raw_data aligns on index labels and is only correct while the
# merge that produced eng_data kept every row in its original order.
trainingdata['essay_set'] = eng_data['essay_set']
trainingdata['essay_id'] = eng_data['essay_id']
trainingdata.to_pickle("./engineered_data.pkl")