# Generates training and test datasets (ChungusSets) 

In [1]:
""" Computes the full feature vectors based on the time_model, word2vec model, and subreddits
"""
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from word2vec import EpochSaver
import pymysql
from collections import defaultdict
import json
from torch.utils.data import Dataset
import torch
import numpy as np
import featurizer
import pickle



I have 64 cores


In [2]:
# Restore the trained embedding model from disk, then sanity-check it by printing the
# nearest neighbours of a common word.
model = KeyedVectors.load('embeddings/full.model')
word = 'car'
neighbours = model.wv.most_similar(word)
print(neighbours)

INFO - 06:29:28: loading KeyedVectors object from embeddings/full.model
INFO - 06:29:28: loading wv recursively from embeddings/full.model.wv.* with mmap=None
INFO - 06:29:28: loading vectors from embeddings/full.model.wv.vectors.npy with mmap=None
INFO - 06:29:28: loading syn1neg from embeddings/full.model.syn1neg.npy with mmap=None
INFO - 06:29:28: setting ignored attribute cum_table to None
INFO - 06:29:29: Word2Vec lifecycle event {'fname': 'embeddings/full.model', 'datetime': '2021-04-26T06:29:29.277355', 'gensim': '4.0.1', 'python': '3.6.9 (default, Jan 26 2021, 15:33:00) \n[GCC 8.4.0]', 'platform': 'Linux-5.4.0-1038-aws-x86_64-with-Ubuntu-18.04-bionic', 'event': 'loaded'}


[('cars', 0.7810503840446472), ('vehicle', 0.7047724723815918), ('truck', 0.649340033531189), ('driving', 0.6060368418693542), ('bike', 0.5890387296676636), ('motorcycle', 0.5614197850227356), ('suv', 0.5587404370307922), ('parked', 0.5542230606079102), ('drove', 0.550153911113739), ('driveway', 0.5349878668785095)]


In [2]:
# Build the raw feature mapping with the project-local featurizer module. Per the recorded
# output, this call reloads the embedding model and reports progress every 1000 entries.
# NOTE(review): the next cell assigns `data` again from data_dump.p, so on a full re-run one
# of the two loads is presumably redundant — confirm which one is intended. The execution
# count of this cell (2) also duplicates the previous cell's, i.e. it was not run in sequence.
data = featurizer.load_data()

INFO - 06:31:29: loading KeyedVectors object from embeddings/full.model
INFO - 06:31:29: loading wv recursively from embeddings/full.model.wv.* with mmap=None
INFO - 06:31:29: loading vectors from embeddings/full.model.wv.vectors.npy with mmap=None
INFO - 06:31:29: loading syn1neg from embeddings/full.model.syn1neg.npy with mmap=None
INFO - 06:31:29: setting ignored attribute cum_table to None
INFO - 06:31:30: Word2Vec lifecycle event {'fname': 'embeddings/full.model', 'datetime': '2021-04-26T06:31:30.121060', 'gensim': '4.0.1', 'python': '3.6.9 (default, Jan 26 2021, 15:33:00) \n[GCC 8.4.0]', 'platform': 'Linux-5.4.0-1038-aws-x86_64-with-Ubuntu-18.04-bionic', 'event': 'loaded'}




Processed 1000 entries
Processed 2000 entries
Processed 3000 entries
Processed 4000 entries
Processed 5000 entries
Processed 6000 entries
Processed 7000 entries
Processed 8000 entries
Processed 9000 entries


In [3]:
# Reload the cached feature mapping from disk.
# presumably data_dump.p holds a pickled result of featurizer.load_data() — confirm where
# the dump is written.
# SECURITY: pickle.load can execute arbitrary code; only load dump files you created yourself.
# The context manager closes the file handle, which the bare open(...) call never did.
with open("data_dump.p", "rb") as dump_file:
    data = pickle.load(dump_file)

In [29]:
N_WORDS = 5000  # default document length: shorter documents are dropped, longer ones truncated


def data_to_dataset(data, n_words=N_WORDS):
    """Convert the raw feature mapping into a ``featurizer.ChungusSet``.

    Entries whose ``'document'`` holds fewer than ``n_words`` items are skipped. Every kept
    document is truncated to its first ``n_words`` items so that all rows have the same
    length and can be stacked into a single tensor.

    Args:
        data: mapping whose values are dicts with the keys ``'document'``,
            ``'subreddit_v'``, ``'karma'``, ``'is_mod'`` and ``'is_bot'``. The keys of
            ``data`` itself are not used.
            (assumes ``'document'`` is a sequence of integer token ids and all
            ``'subreddit_v'`` vectors share one length — TODO confirm against featurizer)
        n_words: required minimum / truncated document length. Defaults to ``N_WORDS``.

    Returns:
        ``featurizer.ChungusSet(words, subs, karmas, mods, labels)`` where ``words`` is a
        long tensor with one row of ``n_words`` items per kept entry, ``subs`` is the
        stacked float tensor of ``'subreddit_v'`` vectors, and ``karmas``, ``mods`` and
        ``labels`` are 1-D long tensors built from ``'karma'``, ``'is_mod'`` and ``'is_bot'``.

    Raises:
        ValueError: if no entry has a document of at least ``n_words`` items (previously
            this surfaced as an opaque error from ``torch.stack`` on an empty list).
    """
    # Per-entry values, accumulated in iteration order and converted to tensors at the end.
    labels = []
    karmas = []
    mods = []
    words = []
    subs = []

    for entry in data.values():
        document = entry['document']
        # these are all too short
        if len(document) < n_words:
            continue

        labels.append(entry['is_bot'])
        karmas.append(entry['karma'])
        mods.append(entry['is_mod'])

        # dtype=torch.long already yields a LongTensor, so no further cast is needed.
        words.append(torch.tensor(document[:n_words], dtype=torch.long))
        subs.append(torch.tensor(entry['subreddit_v'], dtype=torch.float))

        # Only kept entries are counted; skipped (too short) ones do not advance the progress.
        if len(labels) % 10000 == 0:
            print("Processed %d entries" % len(labels))

    if not words:
        raise ValueError(
            "no entry has a document of at least %d words; cannot build a dataset" % n_words
        )

    return featurizer.ChungusSet(
        torch.stack(words, dim=0),
        torch.stack(subs, dim=0),
        torch.tensor(karmas, dtype=torch.long),
        torch.tensor(mods, dtype=torch.long),
        torch.tensor(labels, dtype=torch.long),
    )

In [30]:
# Turn the loaded feature mapping into a dataset; data_to_dataset skips every entry whose
# document is shorter than N_WORDS, so len(dataset) can be smaller than len(data).
dataset = data_to_dataset(data)

In [32]:
# Sanity check: show the first example. Judging by the ChungusSet constructor call, the tuple
# is presumably (words, subreddit vector, karma, is_mod, is_bot label) — confirm in featurizer.
print(dataset[0])

(tensor([   23,     5,  4523,  ..., 53217,    90,   673]), tensor([0.1280, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]), tensor(13189), tensor(1), tensor(1))


In [33]:
# Train/test split: four fifths of the examples for training, the remainder for testing.
# The global torch RNG is seeded first so the random assignment is repeatable.
torch.manual_seed(0)

N_TRAIN = int(4/5 * len(dataset))
N_TEST = len(dataset) - N_TRAIN

datasets = torch.utils.data.random_split(dataset, [N_TRAIN, N_TEST])
train_data, test_data = datasets

In [37]:
# save datasets
# Each file is opened in a context manager so it is flushed and closed even if pickling
# raises; the bare open(...) calls used before never closed their handles.
# Pickle protocol 4 is requested explicitly (it adds support for very large objects).
with open("../model/train_set.p", "wb") as train_file:
    pickle.dump(train_data, train_file, protocol=4)
with open("../model/test_set.p", "wb") as test_file:
    pickle.dump(test_data, test_file, protocol=4)