In [1]:
import torch
import torch.nn as nn 
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as torch_data

In [2]:
import numpy as np
import json
import re

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))
stop_words.add('')

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/anugrahchemparathy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/anugrahchemparathy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/anugrahchemparathy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/anugrahchemparathy/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [5]:
from spellchecker import SpellChecker
spell = SpellChecker()
spell.word_frequency.add('obama')
spell.word_frequency.add('blm')
spell.word_frequency.add('killing')

In [6]:
import sys
  
# setting path
sys.path.append('../parentdirectory')

from ... import process_data

ImportError: attempted relative import with no known parent package

In [None]:
"""
Cleans the dataset and returns the 

@param file_lines: list of lines in the input file where each line contains all the information for a given comment (content + title + author title + etc.)

@returns [labels, comment_list, title_list, max_len, max_title_len]
    labels: file
    comment_list: list of all comments in the file
    title_list: list of all titles in the file
    max_comment_len: length of the longest comment in the dataset
    max_title_len: length of the longest title in the dataset
"""
def clean(file_lines):
    max_len = 0
    max_title_len = 0  
    comment_list = []
    title_list = []
    label = []
    for line in file_lines:
        comment = json.loads(line)
        
        t = comment['text']
        t = ' '.join([x for x in t.split() if x[0] != '@'])
        t = ' '.join(re.findall("[a-zA-Z,.]+",t))
        t = t.replace(',', ' ')
        t = t.replace('.', ' ')
        text = word_tokenize(t)
        text = [x for x in text if x.lower() not in stop_words]
        max_len = max(max_len, len(text))
        comment_list.append(text)
        
        title = comment['title']
        title = title.replace(',', '')
        title = title.replace('.', '')
        title = re.findall("[a-zA-Z,.]+",title)
        title_list.append(title)
        max_title_len = max(max_title_len, len(title))
        
        label.append(comment['label'])
    
    labels = np.array(label)
    return labels, comment_list, title_list, max_len, max_title_len


"""
Returns word2vec embeddings for an input word string

@param word : a string
@param embed : the embedding keyed vectors (in our case word2vec)
@returns : the (300,0) embedding for word
"""
def get_embed(word, embed):
    x = np.zeros((300,)) # default value should be 0
    corrected = spell.correction(word) # closest correction
    if word in embed: # base word
        x = embed[word]
    elif word.upper() in embed: # capitalized (edge case for acronyms like BLM) (for some reason blm doesn't exist but BLM does?)
        x = embed[word.upper()]
    elif word.lower() in embed: # opposite of capitalization
        x = embed[word.lower()]
    elif corrected in embed: # last case, check if closest correction exists (might be bad, some corrections are kinda ass)
        x = embed[corrected]
    
    return x

"""
Converts the lists for comments, titles into ndarrays

@params : straightforward
@returns: [comment_array,title_array] list of ndarrays for comments and titles
"""
def to_array(embed, comments, titles, max_comment_len, max_title_len):
    comment_array = np.zeros((len(comments), max_comment_len, 300))
    title_array = np.zeros((len(titles), max_title_len, 300))
    for ix1, sent in enumerate(comments):
        for ix2, word in enumerate(sent):
            comment_array[ix1,ix2] = get_embed(word,embed)
    for ix1, title in enumerate(titles):
        for ix2, word in enumerate(title):
            title_array[ix1,ix2] = get_embed(word,embed)
    
    return comment_array, title_array

In [None]:
import torch
import torch.nn as nn 
import torch.nn.functional as F
import torch.optim as optim

from tqdm import tqdm
import gensim
import gensim.downloader as api


from datasets import *
from models import *
from process_data import *



path = api.load("word2vec-google-news-300", return_path=True)
#print(path) #/root/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz

embed = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)

train_lines = open("./Data/fox-news-comments.json", "r").readlines() #original 2015 data
test_lines = open("./Data/modern_comments.json", "r").readlines() #modern data

train_labels, train_comments, train_titles, train_max_len, train_max_title_len = clean(train_lines)
test_labels, test_comments, test_titles, test_max_len, test_max_title_len = clean(test_lines)

train_comment_array, train_title_array = to_array(embed, train_comments, train_titles, train_max_len, train_max_title_len)
test_comment_array, test_title_array = to_array(embed, test_comments, test_titles, test_max_len, test_max_title_len)

train_comment_array,train_title_array,train_labels = custom_shuffle(train_comment_array,train_title_array,train_labels)
test_comment_array, test_title_array, test_labels = custom_shuffle(test_comment_array, test_title_array, test_labels)

train_comment_array = np.float32(train_comment_array)
train_title_array = np.float32(train_title_array)

test_comment_array = np.float32(test_comment_array)
test_title_array = np.float32(test_title_array)