In [17]:
from sklearn.model_selection import ParameterGrid
from sklearn.svm import SVC

from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import numpy as np
import eli5
import nltk
nltk.download('punkt')
from IPython.display import display
import numpy as np
from sklearn import linear_model
import pandas as pd


import features
import config
import data_loader
import os

seed_value = 42  # random seed of 42 for all experiments
os.environ['PYTHONHASHSEED'] = str(seed_value)
np.random.seed(seed_value)

[nltk_data] Downloading package punkt to /home/max/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def train_tfidf(train):
    """Fit a unigram, word-level TF-IDF vectorizer on the training items.

    Parameters
    ----------
    train : iterable
        Training items; each one is converted to a single string with
        ``features.get_words_str`` before vectorization.

    Returns
    -------
    vectorizer : TfidfVectorizer
        The fitted vectorizer (needed later to transform unseen documents).
    X_train : sparse matrix
        TF-IDF document-term matrix for ``train``, one row per item.
    """
    vectorizer = TfidfVectorizer(ngram_range=(1, 1), analyzer='word', encoding='utf-8')
    train_sentences = [features.get_words_str(item) for item in train]
    X_train = vectorizer.fit_transform(train_sentences)
    return vectorizer, X_train


def make_predictions(piper_files, annotated_files, vectorizer, model,
                     rootdir='../../dataset/minNarrative_txtfiles/'):
    """Predict a score for every text file under ``rootdir`` not listed in ``piper_files``.

    Parameters
    ----------
    piper_files : array-like or set of str
        Base file names to skip. May be a flat or nested sequence / numpy
        array (e.g. the ``(n, 1)`` array produced by ``df[['FILENAME']].values``).
    annotated_files : any
        Currently unused; kept so existing callers keep working.
        NOTE(review): files used for training are therefore only skipped when
        they also appear in ``piper_files`` -- confirm this is intended.
    vectorizer : fitted vectorizer
        Must provide ``transform`` (e.g. the vectorizer from ``train_tfidf``).
    model : fitted estimator
        Must provide ``predict`` and accept a dense array.
    rootdir : str, optional
        Directory that is walked recursively for input files.

    Returns
    -------
    preds : numpy array
        Predictions clipped to the interval [0, 1].
    filenames : list of str
        Path of every predicted file, aligned with ``preds``.
    X_test : sparse matrix
        TF-IDF matrix of the predicted files.
    """
    # A set gives constant-time lookups; testing membership on the raw array
    # would rescan every entry for each file found on disk.
    if isinstance(piper_files, (set, frozenset)):
        excluded = piper_files
    else:
        excluded = set(np.asarray(piper_files, dtype=object).ravel().tolist())

    test_sentences = []
    filenames = []
    for subdir, _dirs, files in os.walk(rootdir):
        for fname in files:
            if fname in excluded:  # skip files listed in piper_files
                continue
            filepath = os.path.join(subdir, fname)
            # Separate name for the handle so the loop variable is not shadowed.
            with open(filepath) as handle:
                text = handle.read().strip()
            filenames.append(filepath)
            test_sentences.append(' '.join(features.filter_punct(word_tokenize(text))))
    X_test = vectorizer.transform(test_sentences)

    print("Making predictions for {} documents".format(len(filenames)))
    preds = model.predict(X_test.toarray())
    # A regressor can overshoot the target range, so clamp to [0, 1].
    preds[preds > 1.0] = 1.0
    preds[preds < 0.0] = 0
    return preds, filenames, X_test


def SVM_model(X_train, Y):
    """Fit a linear-kernel SVC (C=1, probability estimates enabled).

    The parameter grid holds a single combination; the function returns the
    classifier fitted with the first combination the grid yields.

    Parameters
    ----------
    X_train : array-like or sparse matrix
        Training feature matrix.
    Y : array
        Training labels; must expose ``.shape`` (used for the log message).

    Returns
    -------
    The fitted ``SVC`` instance.
    """
    classifier = SVC(probability=True)
    search_space = ParameterGrid([{'C': [1], 'kernel': ['linear']}])

    for settings in search_space:
        print("Running for parameters:", settings)
        # Apply this grid point's hyperparameters before fitting.
        classifier.set_params(**settings)

        n_docs = Y.shape[0]
        print("Training SVM TFIDF model with {} documents".format(n_docs))
        return classifier.fit(X_train, Y)

    
def Ridge_model(X_train, Y):
    """Fit a Ridge regressor with scikit-learn's default hyperparameters.

    The parameter grid is a single empty combination, so exactly one model is
    fitted and returned.

    Parameters
    ----------
    X_train : array-like or sparse matrix
        Training feature matrix.
    Y : sequence or array
        Regression targets; a plain list is accepted.

    Returns
    -------
    The fitted ``linear_model.Ridge`` instance.
    """
    tuned_parameters = [{}]
    algo = linear_model.Ridge()

    param_object = ParameterGrid(tuned_parameters)
    for param_dict in param_object:
        print("Running for parameters:", param_dict)
        algo.set_params(**param_dict)  # set the desired hyperparameters

        # len() instead of .shape so list targets work; message names the right model.
        print("Training Ridge TFIDF model with {} documents".format(len(Y)))
        clf = algo.fit(X_train, Y)

        return clf

def TheilSenRegressor_model(X_train, Y):
    """Fit a Theil-Sen regressor using all available cores.

    The parameter grid is a single empty combination, so the estimator keeps
    its constructor settings and exactly one model is fitted and returned.

    Parameters
    ----------
    X_train : array-like
        Training feature matrix.
    Y : sequence or array
        Regression targets.

    Returns
    -------
    The fitted ``linear_model.TheilSenRegressor`` instance.
    """
    # NOTE(review): random_state is fixed at 0 here, independent of the
    # notebook-level seed_value.
    regressor = linear_model.TheilSenRegressor(random_state=0, n_jobs=-1)
    search_space = ParameterGrid([{}])

    for settings in search_space:
        print("Running for parameters:", settings)
        # Apply this grid point's hyperparameters before fitting.
        regressor.set_params(**settings)

        n_docs = len(Y)
        print("Training TheilSenRegressor TFIDF model with {} documents".format(n_docs))
        return regressor.fit(X_train, Y)

    
def average_probabilities(preds_with_probs, filenames):
    """Print the mean prediction per genre.

    The genre is the part of each file's base name before the first
    underscore; ``FOX`` and ``CNN`` are merged into ``OPINION``. One line is
    printed per genre, in order of first appearance, with the average rounded
    to three decimals.

    Parameters
    ----------
    preds_with_probs : sequence of numbers
        One prediction per file, indexable by position.
    filenames : sequence of str
        File paths aligned with ``preds_with_probs``.

    Returns
    -------
    None
    """
    genre_probabilities = dict()
    for i, path in enumerate(filenames):
        # basename honours the platform's separator, which matters because the
        # paths are built with os.path.join.
        genre = os.path.basename(path).split('_')[0]
        if genre in ('FOX', 'CNN'):
            genre = 'OPINION'
        genre_probabilities.setdefault(genre, []).append(preds_with_probs[i])

    for genre, values in genre_probabilities.items():
        avg = round(sum(values) / len(values), 3)
        print('Genre {}, {} documents, average predicted probability: {}'.format(genre, len(values), avg))


In [3]:
# Load the annotated filenames (X) and labels (Y).
# NOTE(review): the original note said 334 files, but the recorded run of this
# cell trained on 325 documents -- confirm which number is current.
X, Y = data_loader.load_annotated_data(threshold=2.5)

# Reader annotation scores that serve as regression targets.
# Change to MinNarrative_ReaderData_Final to make predictions for data of Piper and Bagga (2022)
annotated_df = pd.read_csv('../../dataset/Universal_Annotation_Results_Selection.csv')

# FILENAME column of the Piper and Bagga reader data; make_predictions skips
# every file whose name appears here. Double-bracket selection yields an
# (n, 1) array rather than a flat one.
piper_df = pd.read_csv('../../dataset/MinNarrative_ReaderData_Final.csv')
piper_files = piper_df[['FILENAME']].values
# Lookup table: file name -> average overall reader score.
map_fname_score = dict(annotated_df[['FILENAME', 'avg_overall']].values)

# Rescale Reader Annotation Scores from 1 to 5, to 0 to 1.
# This replaces the labels returned by load_annotated_data above.
Y = [((1 - 0) * (map_fname_score[fname] - 1) / (5 - 1)) for fname in X]


# Creates Tfidf vectorizer using the annotated files
vectorizer, X_train = train_tfidf(X)

# Alternative (disabled): train an SVM classifier on the annotated files
# svm = SVM_model(X_train, Y)
# Active model: Theil-Sen regressor, fitted on the dense TF-IDF matrix.
model = TheilSenRegressor_model(X_train.toarray(), Y)


# Makes predictions on the rest of the data set (17k+ documents) using the fitted regressor and Tfidf Vectorizer
# Earlier SVM-based call, kept for reference (uses an older signature):
# preds_with_probs, filenames, X_test = make_predictions(X, vectorizer, svm)


# Despite the name, preds_with_probs holds regression outputs clipped to [0, 1].
preds_with_probs, filenames, X_test = make_predictions(piper_files, X, vectorizer, model)

Loading annotated data from: ../../dataset/Universal_Annotation_Results_Selection.csv
Running for parameters: {}
Training TheilSenRegressor TFIDF model with 325 documents
Making predictions for 17372 documents


In [4]:
print(len(filenames))

17372


In [5]:
import pickle

In [6]:
# Persist the predictions so later analyses can reuse them without re-running
# the model; the file is written to the current working directory.
with open('PredsUniversalsTFIDF_TSR_17372docs.pickle', 'wb') as f:
    pickle.dump(preds_with_probs, f)


# Average predicted probability per genre

In [6]:
average_probabilities(preds_with_probs, filenames)

Genre SHORT, 488 documents, average predicted probability: 0.559
Genre OPINION, 1611 documents, average predicted probability: 0.338
Genre NOVEL19C, 1025 documents, average predicted probability: 0.502
Genre FLASH, 877 documents, average predicted probability: 0.535
Genre LEGAL, 1084 documents, average predicted probability: 0.06
Genre MIXED, 1000 documents, average predicted probability: 0.397
Genre BIO, 973 documents, average predicted probability: 0.426
Genre ACADEMIC-LITSTUDY, 528 documents, average predicted probability: 0.257
Genre ACADEMIC-SCIENCE, 967 documents, average predicted probability: 0.133
Genre APHORISM, 467 documents, average predicted probability: 0.271
Genre NOVEL-CONT, 951 documents, average predicted probability: 0.562
Genre FABLE, 263 documents, average predicted probability: 0.51
Genre ROC, 977 documents, average predicted probability: 0.502
Genre HIST, 1050 documents, average predicted probability: 0.372
Genre SCOTUS, 965 documents, average predicted probabili

# Feature Analysis

In [7]:
# Get corresponding feature names of Tfidf Vectorizer
feature_names = list(vectorizer.get_feature_names_out())

In [10]:
# Get the index of the passages with the highest and lowest predicted degree of narrativity.
# On ties, argmax / argmin pick the first occurrence.
highest_degree = int(np.argmax(preds_with_probs))
lowest_degree = int(np.argmin(preds_with_probs))
print(filenames[highest_degree])
print(filenames[lowest_degree])

../../dataset/minNarrative_txtfiles/reddit-stories/REDDIT_5S_bhbyx-143-story.txt
../../dataset/minNarrative_txtfiles/legal-contracts/LEGAL_5S_1_ExactSciencesCorp_20180822_8-K_EX-10-1_11331629_EX-10-1_Promotion-Agreement.txt


In [11]:
%%javascript
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [12]:
def show_weights(fname):
    """Display an eli5 explanation of the model's prediction for one file.

    The text is tokenized and punctuation-filtered the same way as in
    ``make_predictions`` so the explanation refers to the input the model
    actually scored. Relies on the notebook-level ``model``, ``vectorizer``
    and ``feature_names``.

    Parameters
    ----------
    fname : str
        Path of the text file to explain.
    """
    # Read first and close the file before the (potentially slow) rendering.
    with open(fname) as handle:
        raw_text = handle.read().strip()
    text = ' '.join(features.filter_punct(word_tokenize(raw_text)))
    display(eli5.explain_prediction(model, text, vec=vectorizer, feature_names=feature_names))

print(preds_with_probs[lowest_degree])
print(filenames[lowest_degree])
show_weights(filenames[lowest_degree])

0.0
../../dataset/minNarrative_txtfiles/legal-contracts/LEGAL_5S_1_ExactSciencesCorp_20180822_8-K_EX-10-1_11331629_EX-10-1_Promotion-Agreement.txt


Contribution?,Feature
0.304,<BIAS>
-0.311,Highlighted in text (sum)


In [14]:
print(preds_with_probs[highest_degree])
print(filenames[highest_degree])
show_weights(filenames[highest_degree])

0.934672261483211
../../dataset/minNarrative_txtfiles/reddit-stories/REDDIT_5S_bhbyx-143-story.txt


Contribution?,Feature
0.631,Highlighted in text (sum)
0.304,<BIAS>


In [15]:
eli5.show_weights(model, feature_names=feature_names, top=(51,50))

Weight?,Feature
+0.466,out
+0.405,me
+0.378,was
+0.318,door
+0.304,<BIAS>
+0.295,different
+0.294,woman
+0.288,my
+0.283,plane
+0.268,while
