# Generates training and test datasets (ChungusSets) 

In [54]:
""" Computes the full feature vectors based on the time_model, word2vec model, and subreddits
"""
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from word2vec import EpochSaver
import pymysql
from collections import defaultdict
import json
from torch.utils.data import Dataset
import torch
import numpy as np
from featurization import featurizer
import pickle

ModuleNotFoundError: No module named 'featurization'

In [None]:
# load word2vec model
model = KeyedVectors.load('../models/full.model')

# Sanity check: probe the vocabulary with a common word.
word = 'bob'
print(word in model.wv.vocab)

# Only dereference the vocab entry when the word is actually present;
# `.get()` returns None for out-of-vocabulary words, and `None.index`
# would raise AttributeError.
vocab_entry = model.wv.vocab.get(word)
if vocab_entry is not None:
    print(vocab_entry.index)

In [2]:
# Class vocabulary: the 50 US states plus the District of Columbia.
# The position of a name in this list is used as its integer class label,
# so the order must never change (it follows the two-letter postal codes:
# AK, AL, AR, AZ, CA, ...).
states = [
    "Alaska", "Alabama", "Arkansas", "Arizona",
    "California", "Colorado", "Connecticut",
    "District of Columbia", "Delaware",
    "Florida",
    "Georgia",
    "Hawaii",
    "Iowa", "Idaho", "Illinois", "Indiana",
    "Kansas", "Kentucky",
    "Louisiana",
    "Massachusetts", "Maryland", "Maine", "Michigan", "Minnesota",
    "Missouri", "Mississippi", "Montana",
    "North Carolina", "North Dakota", "Nebraska", "New Hampshire",
    "New Jersey", "New Mexico", "Nevada", "New York",
    "Ohio", "Oklahoma", "Oregon",
    "Pennsylvania",
    "Rhode Island",
    "South Carolina", "South Dakota",
    "Tennessee", "Texas",
    "Utah",
    "Virginia", "Vermont",
    "Washington", "Wisconsin", "West Virginia", "Wyoming",
]


In [2]:
# Build the raw per-entry data with the project's featurizer.
# NOTE(review): judging by the captured output below, this loads
# ../models/full.model and reports progress every 1000 entries - confirm
# against featurizer.load_data.
data = featurizer.load_data()

INFO - 09:37:45: loading Word2VecKeyedVectors object from ../models/full.model
INFO - 09:37:46: loading wv recursively from ../models/full.model.wv.* with mmap=None
INFO - 09:37:46: loading vectors from ../models/full.model.wv.vectors.npy with mmap=None
INFO - 09:37:46: setting ignored attribute vectors_norm to None
INFO - 09:37:46: loading vocabulary recursively from ../models/full.model.vocabulary.* with mmap=None
INFO - 09:37:46: loading trainables recursively from ../models/full.model.trainables.* with mmap=None
INFO - 09:37:46: loading syn1neg from ../models/full.model.trainables.syn1neg.npy with mmap=None
INFO - 09:37:46: setting ignored attribute cum_table to None
INFO - 09:37:46: loaded ../models/full.model


Processed 1000 entries
Processed 2000 entries
Processed 3000 entries
Processed 4000 entries
Processed 5000 entries
Processed 6000 entries
Processed 7000 entries
Processed 8000 entries
Processed 9000 entries
Processed 10000 entries
Processed 11000 entries
Processed 12000 entries
Processed 13000 entries
Processed 14000 entries
Processed 15000 entries
Processed 16000 entries
Processed 17000 entries
Processed 18000 entries
Processed 19000 entries
Processed 20000 entries
Processed 21000 entries
Processed 22000 entries
Processed 23000 entries
Processed 24000 entries
Processed 25000 entries
Processed 26000 entries
Processed 27000 entries
Processed 28000 entries
Processed 29000 entries
Processed 30000 entries
Processed 31000 entries
Processed 32000 entries
Processed 33000 entries
Processed 34000 entries
Processed 35000 entries
Processed 36000 entries
Processed 37000 entries
Processed 38000 entries
Processed 39000 entries
Processed 40000 entries
Processed 41000 entries
Processed 42000 entries
P

In [3]:
# Reload the featurized data from a local pickle dump.
# NOTE: unpickling can execute arbitrary code - only load dumps that were
# produced locally by this project.
# The context manager closes the file handle deterministically instead of
# leaving it to the garbage collector.
with open("data_dump.p", "rb") as dump_file:
    data = pickle.load(dump_file)

In [65]:
# Number of leading document tokens kept per entry; entries with fewer
# tokens are dropped entirely.
N_WORDS = 10000


def data_to_dataset(data):
    """Convert the featurized entries in *data* into a ``featurizer.ChungusSet``.

    An entry is kept only if it has a ``'time_v'`` key, its ``'location'`` is
    one of the names in the module-level ``states`` list, and its
    ``'document'`` holds at least ``N_WORDS`` items.  For each kept entry:

    * the label is the position of ``'location'`` in ``states``;
    * the document is truncated to its first ``N_WORDS`` items (int64);
    * ``'subreddit_v'`` and ``'time_v'`` are converted to float32 tensors.

    Args:
        data: mapping whose values are dicts with the keys described above.
            The mapping's keys are not used.
            (Presumably ``'document'`` is a flat sequence of word indices -
            TODO confirm against the featurizer.)

    Returns:
        ``featurizer.ChungusSet(words, subs, times, labels)`` where each
        argument is the per-entry tensors stacked along a new first dimension
        (``labels`` is a 1-D int64 tensor).

    Raises:
        KeyError: if an entry that has ``'time_v'`` lacks one of the other keys.
        RuntimeError: from ``torch.stack`` if no entry passes the filter.
    """
    # Build the name -> label lookup once so each entry costs one dict lookup
    # instead of two linear scans of `states` (membership test + .index()).
    state_to_label = {name: idx for idx, name in enumerate(states)}

    # lists that will eventually be turned into tensors and into the dataset
    labels = []
    words = []
    subs = []
    times = []

    for entry in data.values():
        # Skip entries without a time vector ...
        if 'time_v' not in entry:
            continue
        # ... entries located outside the known states ("foreigners") ...
        label = state_to_label.get(entry['location'])
        if label is None:
            continue
        # ... and entries whose document is too short.
        document = entry['document']
        if len(document) < N_WORDS:
            continue

        labels.append(label)
        # dtype=torch.long already yields an int64 tensor; no extra cast needed.
        words.append(torch.tensor(document[:N_WORDS], dtype=torch.long))
        subs.append(torch.tensor(entry['subreddit_v'], dtype=torch.float))
        times.append(torch.tensor(entry['time_v'], dtype=torch.float))

        # Progress report every 10000 kept entries.
        if len(labels) % 10000 == 0:
            print("Processed %d entries" % len(labels))

    labels = torch.tensor(labels, dtype=torch.long)
    words = torch.stack(words, dim=0)
    subs = torch.stack(subs, dim=0)
    times = torch.stack(times, dim=0)

    return featurizer.ChungusSet(words, subs, times, labels)

In [66]:
# Build the full dataset from the loaded entries; data_to_dataset prints a
# progress line every 10000 kept entries.
dataset = data_to_dataset(data)

Processed 10000 entries
Processed 20000 entries
Processed 30000 entries
Processed 40000 entries
Processed 50000 entries


In [67]:
# 80/20 train/test split of the full dataset.
# Seed the global torch RNG first so the random split is reproducible.
torch.manual_seed(0)

N_TRAIN = int(4/5 * len(dataset))
N_TEST = len(dataset) - N_TRAIN  # remainder, so the two sizes always sum to len(dataset)

datasets = torch.utils.data.random_split(dataset, [N_TRAIN, N_TEST])
train_data, test_data = datasets

In [69]:
# save datasets
# Context managers make sure each file is flushed and closed as soon as the
# dump finishes, rather than whenever the anonymous handle is collected.
# NOTE(review): protocol=4 supports very large objects (> 4 GiB) -
# presumably why it was chosen; confirm before changing.
with open("../data/train_set.p", "wb") as train_file:
    pickle.dump(train_data, train_file, protocol=4)
with open("../data/test_set.p", "wb") as test_file:
    pickle.dump(test_data, test_file, protocol=4)

In [71]:
# Sanity check: dtype of the first component of the first sample
# (presumably the word-index tensor - TODO confirm against ChungusSet.__getitem__).
print(dataset[0][0].dtype)

torch.int64
