In [22]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import pandas as pd
from features import FeatureEmbeddings

import tqdm

# Data

In [2]:
df = pd.read_csv("data/snopes_phase2_clean_2018_7_3.csv")
subset = df.sample(2000)
embeddings = FeatureEmbeddings()
embeddings.create(subset,article_col='original_article_text_phase2',url_col='article_origin_url_phase1',header_col='article_title_phase2')

Building embeddings for headlines...
Building bigram model features for URL strings...


unable to cache TLDs in file /usr/local/lib/python3.5/dist-packages/tldextract/.tld_set: [Errno 13] Permission denied: '/usr/local/lib/python3.5/dist-packages/tldextract/.tld_set'


Calculating edit distance for each URL string...
Inferring article embeddings via doc2vec...


In [7]:
target = []
for c in subset['fact_rating_phase1']:
    if c == 'legend':
        target.append(0)
    elif c == 'mostly false':
        target.append(1)
    elif c == 'miscaptioned':
        target.append(2)
    elif c == 'outdated':
        target.append(3)
    elif c == 'false':
        target.append(4)
    elif c == 'mixture':
        target.append(5)
    elif c == 'mostly true':
        target.append(6)
    elif c == 'scam':
        target.append(7)
    elif c == 'correct attribution':
        target.append(8)
    elif c == 'misattributed':
        target.append(9)
    elif c == 'true':
        target.append(10)
    elif c == 'unproven':
        target.append(11)

In [16]:
embeddings.features['target'] = target
train_df = embeddings.features.iloc[:1500, :]
test_df = embeddings.features.iloc[1500:, :]

In [18]:
class Network(nn.Module):
    def __init__(self, in_dim, out_dim=2):
        super().__init__()
        
        self.hidden1 = nn.Linear(in_dim, 32)
        self.hidden2 = nn.Linear(32, 16)
        self.output = nn.Linear(16, out_dim)
        
        self.activation = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, x):
        # Pass the input tensor through each of our operations
        x = self.hidden1(x)
        x = self.activation(x)
        x = self.hidden2(x)
        x = self.activation(x)
        x = self.output(x)
        x = self.softmax(x)
        
        return x

In [19]:
class FakeNewsDataset(Dataset):
    def __init__(self, df):
        self.data = df.drop(columns=['target'])
        self.targets = df['target'].astype(int)
    
    def __getitem__(self, i):
        x = torch.tensor(self.data.iloc[i])
        y = self.targets.iloc[i]
        return x, y
    
    def __len__(self):
        return len(self.data)

In [26]:
train_dataset = FakeNewsDataset(train_df)
val_dataset = FakeNewsDataset(test_df)

num_epochs = 150
batch_size = 30

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)

model = Network(7, 2)

objective = torch.nn.CrossEntropyLoss() # loss function
optimizer = optim.Adam(model.parameters(), lr=1e-4)

TypeError: 'module' object is not callable

In [33]:
# Run your training / validation loops

train_losses_avgs = []
validate_losses_avgs = []

#train_loop = tqdm(total=len(train_loader) * num_epochs, position=0) # the little progress bar thing
#validate_loop = tqdm(total=len(val_loader) * num_epochs, position=0)

for epoch in range(num_epochs):
    
    train_losses = []
    
    for x, y_truth in train_loader:

        optimizer.zero_grad() # forget about the gradient you computed last time

        y_hat = model(x)
        loss = objective(y_hat, y_truth)

        train_losses.append(loss)
        
        #train_loop.set_description('Training loss: {:.4f}'.format(loss.item()))
        #train_loop.update(1)
        
        loss.backward() # computes the gradient and stores it in the variable

        optimizer.step()
    
    train_losses_avgs.append(sum(train_losses) / len(train_losses))
    
    validate_losses = []
    
    for x, y_truth in val_loader:
        x, y_truth = x.cuda(async=True), y_truth.cuda(async=True)

        y_hat = model(x)
        loss = objective(y_hat, y_truth)

        validate_losses.append(loss)
        
        #validate_loop.set_description('Validation loss: {:.4f}'.format(loss.item()))
        #validate_loop.update(1)
    
    validate_losses_avgs.append(sum(validate_losses) / len(validate_losses))
    
    
train_loop.close()
validate_loop.close()

RuntimeError: Expected object of scalar type Float but got scalar type Double for argument #2 'mat1' in call to _th_addmm

Notes today:

- make sure to normalize the features that aren't in article/header embedding space (so they don't overpower the other ones)
- retrain the doc2vec model with all the data you can find