# Supervised Learning Text Complexity Classifier - Predict

## Prediction notes

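This version of the notebook uses the test data to make predictions from the model generated in the dev notebook. It requires seven pickle files (`concrete.pkl`, `aoa.pkl`, `misc.pkl`, `vectorizer.pkl`, `log_reg.pkl`, `w2v.pkl` and `knn.pkl`) to be in the working directory. The test data is currently read from the hard-coded path `WikiLarge_Test.csv`; the command-line variant, which takes the test data path as `sys.argv[1]`, is commented out.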


## Data import and cleaning

The primary goals here are getting data from the training data set and creating metrics that will convey the complexity of the text to our classifiers. We also used three additional data sources that are part of the Kaggle set. The age of acquisition data set contains information gathered on around 50,000 words: each word's lemmatized root, the average age at which a person learns that word, and the frequency of its use. The concreteness ratings data set contains a smaller number of words, but gives an impression of how much a word is associated with a particular idea. Finally, the dale_chall data set contains a list of words that are considered 'basic English'.


#### Dependencies

Below are the data-cleaning dependencies only. A longer list of imports for modeling is at the start of that section.

In [2]:
import pandas as pd
import numpy as np
import re
import statistics as stats
import pickle as pkl
import sys
from nltk.corpus import stopwords  
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from gensim.models.word2vec import Word2Vec


In [3]:
# Load the data to be labelled into a DataFrame. The command-line variant below
# is disabled; the input path is currently hard-coded.
#df = pd.read_csv(sys.argv[1])
df = pd.read_csv('WikiLarge_Test.csv')
def predict(df):
    """Predict a label for every row of ``df`` using the pickled models.

    Builds per-row text features from the ``original_text`` column, adds the
    first-column probability of the pickled logistic-regression model and a
    scalar word2vec summary, standardises the feature matrix and passes it to
    the pickled KNN classifier.

    The following files are read from the current working directory:
    ``concrete.pkl``, ``aoa.pkl``, ``misc.pkl``, ``vectorizer.pkl``,
    ``log_reg.pkl``, ``w2v.pkl`` and ``knn.pkl``.

    Args:
        df: DataFrame with an ``original_text`` column; each value is passed
            to ``word_tokenize`` (presumably plain strings - confirm).

    Returns:
        A DataFrame with two columns: ``id`` (the row index) and ``label``
        (the KNN prediction). The input frame is left unmodified.

    Raises:
        FileNotFoundError: if any of the pickle files is missing.
    """

    def load_pickle(path):
        """Return the object unpickled from ``path``."""
        # NOTE: unpickling can execute arbitrary code; only load artefacts
        # that were produced by the trusted dev notebook.
        with open(path, 'rb') as handle:
            return pkl.load(handle)

    # Load every artefact up front so a missing file fails before the
    # (comparatively slow) tokenising work starts.
    conc_dict = load_pickle('concrete.pkl')
    age_dict = load_pickle('aoa.pkl')
    misc_dict = load_pickle('misc.pkl')
    vectorizer = load_pickle('vectorizer.pkl')
    log_vec = load_pickle('log_reg.pkl')
    w2v = load_pickle('w2v.pkl')
    knn_clf = load_pickle('knn.pkl')

    # Fallback values used when a word is missing from the lookup tables.
    mean_conc = misc_dict['mean_conc']
    mean_aoa = misc_dict['mean_aoa']
    mean_perc_known = misc_dict['mean_perc_known']
    med_freq = misc_dict['med_freq']

    # Built once instead of once per row.
    stop_word_set = set(stopwords.words('english'))
    alpha_only = re.compile(r'^[a-z]+$')
    lemmatizer = WordNetLemmatizer()

    # A missing word (or an unexpected table shape) falls back to the default;
    # anything else, e.g. KeyboardInterrupt, is allowed to propagate.
    lookup_errors = (LookupError, TypeError)

    def tokenize_and_remove_stops(text):
        """Return the lower-cased, purely alphabetic, non-stop-word tokens."""
        lowered = (token.lower() for token in word_tokenize(text))
        return [
            word for word in lowered
            if word not in stop_word_set and alpha_only.match(word)
        ]

    def syllable_count(clean_text_list):
        """Return a heuristic syllable total over all words in the list."""
        vowels = "aeiouy"
        count = 0
        for word in clean_text_list:
            word = word.lower()
            if word[0] in vowels:
                count += 1
            # Count each vowel that starts a new vowel group.
            for previous, current in zip(word, word[1:]):
                if current in vowels and previous not in vowels:
                    count += 1
            if word.endswith("e"):
                count -= 1
            # NOTE(review): this guard tests the running total, not the
            # current word, so it only has an effect while the total is zero.
            # Left unchanged because the pickled models were presumably
            # trained on features computed the same way - confirm against the
            # dev notebook before changing it.
            if count == 0:
                count += 1
        return count

    def flesch_kincaid_ease(row):
        """Return the reading-ease score for one row, rounded to 2 places."""
        return round(206.835 - 1.015*(row['word_count']) - 84.6*(row['avg_syll_per_word']),2)

    def get_mean_conc(text):
        """Return the mean ``Conc.M`` value of the words in ``text``."""
        total = 0
        for word in text:
            try:
                total += conc_dict[word]['Conc.M']
            except lookup_errors:
                total += mean_conc
        # max(..., 1) keeps an empty token list from dividing by zero.
        return round(total / max(len(text), 1), 4)

    def get_mean_aoa(text):
        """Return the mean ``aoa`` value of the words in ``text``."""
        total = 0
        for word in text:
            try:
                total += age_dict[word]['aoa']
            except lookup_errors:
                total += mean_aoa
        return round(total / max(len(text), 1), 4)

    def get_mean_perc(text):
        """Return the mean percent-known value of the words in ``text``.

        ``age_dict`` is consulted first, then ``conc_dict``, then the default.
        """
        total = 0
        for word in text:
            try:
                total += age_dict[word]['perc_known']
            except lookup_errors:
                try:
                    total += conc_dict[word]['Percent_known']
                except lookup_errors:
                    total += mean_perc_known
        return round(total / max(len(text), 1), 4)

    def get_mean_freq(text):
        """Return the median ``freq`` value of the words (0 if there are none)."""
        freqs = []
        for word in text:
            try:
                freqs.append(age_dict[word]['freq'])
            except lookup_errors:
                freqs.append(med_freq)
        return stats.median(freqs) if freqs else 0

    def count_non_basic(text):
        """Return the summed ``non_basic`` flags; unknown words count as 1."""
        count = 0
        for word in text:
            try:
                count += age_dict[word]['non_basic']
            except lookup_errors:
                try:
                    count += conc_dict[word]['non_basic']
                except lookup_errors:
                    count += 1
        return count

    def lem_combine(text_list):
        """Return the lemmatized tokens joined by single spaces."""
        return ' '.join(lemmatizer.lemmatize(word) for word in text_list)

    def document_vector(text):
        """Return the mean over all vector components of the known words."""
        # ``in w2v.wv`` and ``w2v.wv[...]`` work on gensim 3.x and 4.x alike,
        # unlike ``w2v.wv.vocab`` and ``w2v[...]``.
        known = [word for word in text.split() if word in w2v.wv]
        if not known:
            # Assumes 'he' is in the model's vocabulary - TODO confirm.
            known.append('he')
        # No axis is given, so this collapses the vectors to a single scalar.
        return np.mean(w2v.wv[known])

    # Work on a copy so the caller's frame does not gain the helper columns.
    df = df.copy()

    df['clean_text'] = df['original_text'].apply(tokenize_and_remove_stops)

    df['syllables'] = df['clean_text'].apply(syllable_count)
    df['word_count'] = df['clean_text'].apply(len)
    # Rows without any usable token give 0/0 = NaN, which is mapped to 0.
    df['avg_syll_per_word'] = (df['syllables'] / df['word_count']).fillna(0)

    df['fc_ease'] = df.apply(flesch_kincaid_ease, axis=1)
    fc_mean = df['fc_ease'].dropna().mean()
    df['fc_ease'] = df['fc_ease'].fillna(fc_mean)

    df['mean_conc'] = df['clean_text'].apply(get_mean_conc)
    df['mean_aoa'] = df['clean_text'].apply(get_mean_aoa)
    df['mean_perc_known'] = df['clean_text'].apply(get_mean_perc)
    df['mean_freq'] = df['clean_text'].apply(get_mean_freq)
    df['non_basic_words'] = df['clean_text'].apply(count_non_basic)
    df = df.fillna(0)

    df['lem_text'] = df['clean_text'].apply(lem_combine)

    # The first predict_proba column is used as a feature for the KNN model.
    X_vec = vectorizer.transform(df['lem_text'])
    df['logreg_prob'] = log_vec.predict_proba(X_vec)[:, 0]

    df['w2v'] = df['lem_text'].apply(document_vector)

    X_test = df[['word_count', 'avg_syll_per_word', 'fc_ease', 'mean_conc', 'mean_aoa', 'mean_perc_known', 'mean_freq', 'non_basic_words', 'logreg_prob', 'w2v']]

    # NOTE(review): the scaler is fitted on the data being predicted, not on
    # the training data, so the scaling depends on the batch that is passed
    # in. No scaler pickle is available here; consider persisting the
    # training-time scaler in the dev notebook and loading it instead.
    X_test_scaled = StandardScaler().fit_transform(X_test)

    df['label'] = knn_clf.predict(X_test_scaled)
    df['id'] = df.index

    return df[['id', 'label']]

In [5]:
# Run the prediction pipeline on the loaded frame and write the returned
# frame to predictions.csv without the pandas index column.
df_test = predict(df)
df_test.to_csv('predictions.csv', index = False)

