In [5]:
import gensim
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import time
import pandas as pd
import argparse
import pickle
from nltk.corpus import WordNetCorpusReader
from collections import Counter
import seaborn as sns
import os
import spacy

In [2]:
def get_one_hot(categories, category_labels):
    """Build a 0/1 membership vector per word, ordered like ``category_labels``.

    Args:
        categories: mapping from word to the collection of categories that
            word belongs to. The bare string ``"n/a"`` is treated as the
            single-element collection ``["n/a"]``.
        category_labels: ordered labels; position ``i`` of each output vector
            answers "is ``category_labels[i]`` among this word's categories?".

    Returns:
        dict mapping every word in ``categories`` to a list of ints (1 or 0)
        with the same length as ``category_labels``.
    """
    encoded = {}
    for word in categories:
        memberships = categories[word]
        # Wrap the sentinel so the membership test below matches it as a whole
        # label rather than as a substring.
        if memberships == "n/a":
            memberships = ["n/a"]
        encoded[word] = [int(label in memberships) for label in category_labels]
    return encoded

In [3]:
# Load the precomputed per-word feature mapping consumed by separate_pos_sentiment.
# The context manager closes the file handle as soon as unpickling finishes.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by a trusted pipeline.
with open("../data/processed/word_features.p", "rb") as features_file:
    features = pickle.load(features_file)

In [7]:
def separate_pos_sentiment(features, nlp_model=None):
    """Split per-word features into sentiment scores, one-hot POS and one-hot entity type.

    Args:
        features: mapping from word (str) to a sequence whose first element is
            compared against the part-of-speech labels and whose last four
            elements are taken as the sentiment scores.
        nlp_model: callable that turns a word into a sequence of tokens exposing
            ``ent_type_`` (e.g. a loaded spaCy pipeline). When omitted, the
            module-level ``nlp`` object is used, which must already be defined.

    Returns:
        Tuple ``(sentiments, pos, entities)`` of dicts keyed by word:
            * ``sentiments[word]`` -- the last four entries of the feature sequence.
            * ``pos[word]`` -- 0/1 list aligned with ``pos_list`` below.
            * ``entities[word]`` -- 0/1 list aligned with ``entity_list`` below,
              marking the entity type of the word's first token; all zeros when
              the token has no entity type or the word yields no tokens.
    """
    # NOTE(review): 'SPAC' looks like a truncation of spaCy's 'SPACE' tag; kept
    # unchanged so column order/labels stay in sync with the saved label list.
    pos_list = ['ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X', 'SPAC']
    entity_list = ['PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT', 'WORK_OF_ART', 'LAW', 'LANGUAGE', 'DATE', 'TIME', 'PERCENT', 'MONEY', 'ORDINAL', 'CARDINAL']

    if nlp_model is None:
        # Fall back to the notebook-level pipeline, as the original code did.
        nlp_model = nlp

    sentiments = {}
    pos = {}
    entities = {}

    for f in tqdm(features.keys()):
        values = features[f]
        sentiments[f] = values[-4:]

        # One-hot part-of-speech: the tag is stored as the first feature.
        pos_tag = values[0]
        pos[f] = [1 if pos_tag == p else 0 for p in pos_list]

        # One-hot entity type. Previously the tagger result was overwritten by
        # a comparison of the POS tag against entity labels, which can never
        # match, so every entity vector came out all zeros.
        doc = nlp_model(f)
        # Guard against words that produce no tokens (e.g. an empty string).
        ent_type = doc[0].ent_type_ if len(doc) > 0 else ""
        entities[f] = [1 if ent_type == e else 0 for e in entity_list]

    return sentiments, pos, entities

In [9]:
# Load the `en_core_web_sm` spaCy pipeline. separate_pos_sentiment looks this
# object up through the module-level name `nlp`, so it must be assigned before
# the call below.
nlp = spacy.load("en_core_web_sm")
# Split the loaded word features into sentiment scores, POS vectors and entity vectors.
sentiments, pos, entities = separate_pos_sentiment(features)

100%|██████████| 15000/15000 [02:52<00:00, 86.72it/s] 


In [10]:
# Column labels saved alongside the encodings. Order matters: index i of each
# per-word vector corresponds to label i of the matching list.
# NOTE(review): 'SPAC' may be a truncation of spaCy's 'SPACE' tag -- confirm
# against the tags stored in the feature file before renaming.
pos_list = [
    'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM',
    'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X', 'SPAC',
]
entity_list = [
    'PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT',
    'WORK_OF_ART', 'LAW', 'LANGUAGE', 'DATE', 'TIME', 'PERCENT', 'MONEY',
    'ORDINAL', 'CARDINAL',
]
sentiment_list = ['neg', 'neu', 'pos', 'compound']

In [11]:
# Persist each encoding and its label list. Opening every file in a `with`
# block guarantees the handle is flushed and closed, even if pickling fails.
_artifacts = [
    ("../data/raw/one-hot-sentiments.p", sentiments),
    ("../data/raw/one-hot-pos.p", pos),
    ("../data/raw/one-hot-entities.p", entities),
    ("../data/raw/one-hot-sentiments_labels.p", sentiment_list),
    ("../data/raw/one-hot-pos_labels.p", pos_list),
    ("../data/raw/one-hot-entities_labels.p", entity_list),
]
for _path, _obj in _artifacts:
    with open(_path, "wb") as _out_file:
        pickle.dump(_obj, _out_file)

In [12]:
# Inspect the result: as the last expression, the whole word -> sentiment-vector
# dict is rendered as this cell's output.
sentiments

{'the': [0.0, 1.0, 0.0, 0.0],
 'of': [0.0, 1.0, 0.0, 0.0],
 'to': [0.0, 1.0, 0.0, 0.0],
 'and': [0.0, 1.0, 0.0, 0.0],
 'in': [0.0, 1.0, 0.0, 0.0],
 'a': [0.0, 0.0, 0.0, 0.0],
 'for': [0.0, 1.0, 0.0, 0.0],
 'that': [0.0, 1.0, 0.0, 0.0],
 'on': [0.0, 1.0, 0.0, 0.0],
 'was': [0.0, 1.0, 0.0, 0.0],
 'said': [0.0, 1.0, 0.0, 0.0],
 'with': [0.0, 1.0, 0.0, 0.0],
 'he': [0.0, 1.0, 0.0, 0.0],
 'as': [0.0, 1.0, 0.0, 0.0],
 'it': [0.0, 1.0, 0.0, 0.0],
 'by': [0.0, 1.0, 0.0, 0.0],
 'at': [0.0, 1.0, 0.0, 0.0],
 'from': [0.0, 1.0, 0.0, 0.0],
 'his': [0.0, 1.0, 0.0, 0.0],
 'an': [0.0, 1.0, 0.0, 0.0],
 'be': [0.0, 1.0, 0.0, 0.0],
 'are': [0.0, 1.0, 0.0, 0.0],
 'have': [0.0, 1.0, 0.0, 0.0],
 'but': [0.0, 1.0, 0.0, 0.0],
 'were': [0.0, 1.0, 0.0, 0.0],
 'not': [0.0, 1.0, 0.0, 0.0],
 'this': [0.0, 1.0, 0.0, 0.0],
 'who': [0.0, 1.0, 0.0, 0.0],
 'they': [0.0, 1.0, 0.0, 0.0],
 'had': [0.0, 1.0, 0.0, 0.0],
 'i': [0.0, 0.0, 0.0, 0.0],
 'which': [0.0, 1.0, 0.0, 0.0],
 'will': [0.0, 1.0, 0.0, 0.0],
 'their': [0.0