In [1]:
from __future__ import unicode_literals, print_function, division
import torch
import torch.nn as nn

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt

import sys
sys.path.insert(0, '../preprocessing/') #need this in order to get to the other file in other directory

#can comment out the ones you aren't using to save a little bit of time
from covidPreprocess import getCoronaVocabulary, getCoronaText, getCoronaVocabulary2
from liarPreprocess import getLiarVocabulary, getLiarText
from fnnPreprocess import getFNNVocabulary, getFNNText
# from fnnCovidCombinedPreprocess import getFNNCoronaVocabulary, getFNNCoronaText

In [2]:
class SimpleNeuralNet(nn.Module):
    """
    Feed-forward network with a single hidden layer and a single output neuron.

    The output is passed through a sigmoid, so the network is only suitable for
    binary classification (it cannot represent more than 2 classes).

    Layer layout written based off of the tutorial at
    https://github.com/yunjey/pytorch-tutorial/blob/master/tutorials/01-basics/feedforward_neural_network/main.py#L37-L49
    """

    def __init__(self, input_size, hidden_size):
        """
        input_size:  number of features in each input row
        hidden_size: number of units in the hidden layer
        """
        super().__init__()
        self.hidden1 = nn.Linear(input_size, hidden_size)
        # NOTE: this ReLU module is registered but forward() does not call it;
        # the hidden activation actually applied is tanh.
        self.relu = nn.ReLU()
        self.oupt = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return sigmoid(oupt(tanh(hidden1(x)))), one value per input row."""
        hidden_activation = self.hidden1(x).tanh()
        logit = self.oupt(hidden_activation)
        return logit.sigmoid()

In [3]:
def _binary_accuracy(model, loader):
    '''
    Percentage (0-100) of samples in `loader` that the model classifies correctly.

    A sample is counted as correct when its label and the model output fall on the
    same side of the 0.5 threshold (>= 0.5 is the positive side). Works for any
    batch size; gradients are not tracked.
    '''
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in loader:
            # flatten both so a (batch, 1) output never broadcasts against (batch,) labels
            outputs = model(inputs).reshape(-1)
            labels = labels.reshape(-1)
            both_positive = (labels >= 0.5) & (outputs >= 0.5)
            both_negative = (labels < 0.5) & (outputs < 0.5)
            correct += int((both_positive | both_negative).sum().item())
            total += labels.numel()
    return 100 * correct / total


def trainAndTestSimpleModelAndGetProbs(dataset: str, num_epochs = 5, learning_rate = 0.001, print_epoch_mod = 5):
    '''
    Used this article for help in writing the tensor parts of code so it works with the model
    https://medium.com/analytics-vidhya/part-1-sentiment-analysis-in-pytorch-82b35edb40b8

    Trains a SimpleNeuralNet (200 hidden units, Adam, BCELoss, batches of 16) on the
    chosen dataset and reports both test and training accuracy.

    Parameters:
        dataset:         one of 'corona', 'liar', 'fnn', 'combined'
        num_epochs:      number of full passes over the training data
        learning_rate:   learning rate handed to the Adam optimizer
        print_epoch_mod: print the loss every `print_epoch_mod` steps; 0 or None
                         turns the progress output off

    Returns (in this order):
        test accuracy (percent), train accuracy (percent), trained model,
        and the vectorizer for the training data.
        The model is returned in eval mode.

    Raises:
        ValueError if `dataset` is not one of the supported names.
    '''
    torch.manual_seed(1)
    # The get*Text functions give us the text array (not document term matrix) and Y;
    # the get*Vocabulary(True) functions give the training matrix, labels and vectorizer.
    if dataset == 'corona':
        X,Y = getCoronaText()
        X_train,Y_train, vectorizer_train = getCoronaVocabulary(True)
    elif dataset == 'liar':
        X,Y = getLiarText()
        X_train,Y_train, vectorizer_train = getLiarVocabulary(True)
    elif dataset == 'fnn':
        X,Y = getFNNText()
        X_train,Y_train, vectorizer_train = getFNNVocabulary(True)
    elif dataset == 'combined':
        # Imported here because the notebook-level import of this module is
        # commented out; importing lazily keeps the other datasets unaffected.
        from fnnCovidCombinedPreprocess import getFNNCoronaVocabulary, getFNNCoronaText
        X,Y = getFNNCoronaText()
        X_train,Y_train, vectorizer_train = getFNNCoronaVocabulary(True)
    else:
        raise ValueError(
            "Unknown dataset {!r}; expected one of 'corona', 'liar', 'fnn', 'combined'".format(dataset))

    #transform our testing dataset to match the vocabulary for the training dataset
    #transform will return the document-term matrix for X based on training dataset
    x_test = vectorizer_train.transform(X)

    # number of columns of the training matrix = input width of the network
    vocabsize = X_train.shape[1]

    #transform our training and test data into tensors for the classifier to learn off of
    X_tensor = torch.from_numpy(X_train.todense()).float()
    Y_tensor = torch.from_numpy(np.array(Y_train)).float()

    X_test_tensor = torch.from_numpy(x_test.todense()).float()
    Y_test_tensor = torch.from_numpy(np.array(Y))

    device = torch.device('cpu')
    #use TensorDataset to be able to use our DataLoader
    train_data = torch.utils.data.TensorDataset(X_tensor, Y_tensor)
    train_loader = torch.utils.data.DataLoader(train_data,batch_size=16, shuffle=False)
    train_loader_batch_size_1 = torch.utils.data.DataLoader(train_data,batch_size=1, shuffle=False)

    test_data = torch.utils.data.TensorDataset(X_test_tensor, Y_test_tensor)
    test_loader = torch.utils.data.DataLoader(test_data,batch_size=1, shuffle=False)

    #initialize our model
    model = SimpleNeuralNet(vocabsize, 200).to(device)
    loss_fn = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)

    model.train()
    total_step = len(train_loader)
    for epoch in range(num_epochs):
        for i, (x_batch, labels) in enumerate(train_loader):

            # Forward pass: the model outputs shape (batch, 1), so the labels
            # are reshaped to match before computing the loss
            y_pred = model(x_batch)
            loss = loss_fn(y_pred, labels.reshape(-1, 1))

            # Backward pass, using the optimizer to update the parameters
            optimizer.zero_grad()
            loss.backward()    #compute gradients
            optimizer.step()   #apply the parameter update

            # An epoch corresponds to one pass through all of the samples.
            # Each training step corresponds to a parameter update using
            # a gradient computed on a minibatch of (up to) 16 samples.
            # 5 works for the corona dataset; use something like 50 or 100
            # for the other datasets so the output doesn't get spammed.
            if print_epoch_mod and (i + 1) % print_epoch_mod == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                      .format(epoch + 1, num_epochs, i + 1, total_step, loss.item()))

    # Evaluate in eval mode; gradients are not needed (the model has already been learned)
    model.eval()

    test_accuracy = _binary_accuracy(model, test_loader)
    print('Test accuracy of the network: {} %'.format(test_accuracy))

    train_accuracy = _binary_accuracy(model, train_loader_batch_size_1)
    print('Train accuracy of the network: {} %'.format(train_accuracy))

    return test_accuracy, train_accuracy, model, vectorizer_train

In [4]:
# Train on the 'corona' dataset for 50 epochs. The function returns the test accuracy
# first and the train accuracy second; `model` and `vectorizer_train` are reused by
# the prediction cells further down.
test_accuracy, train_accuracy, model, vectorizer_train = trainAndTestSimpleModelAndGetProbs('corona', num_epochs=50)

Epoch [1/50], Step [5/19], Loss: 0.3543
Epoch [1/50], Step [10/19], Loss: 0.5111
Epoch [1/50], Step [15/19], Loss: 0.2169
Epoch [2/50], Step [5/19], Loss: 0.0395
Epoch [2/50], Step [10/19], Loss: 0.1832
Epoch [2/50], Step [15/19], Loss: 0.0935
Epoch [3/50], Step [5/19], Loss: 0.0094
Epoch [3/50], Step [10/19], Loss: 0.0684
Epoch [3/50], Step [15/19], Loss: 0.0311
Epoch [4/50], Step [5/19], Loss: 0.0038
Epoch [4/50], Step [10/19], Loss: 0.0362
Epoch [4/50], Step [15/19], Loss: 0.0157
Epoch [5/50], Step [5/19], Loss: 0.0021
Epoch [5/50], Step [10/19], Loss: 0.0217
Epoch [5/50], Step [15/19], Loss: 0.0092
Epoch [6/50], Step [5/19], Loss: 0.0013
Epoch [6/50], Step [10/19], Loss: 0.0140
Epoch [6/50], Step [15/19], Loss: 0.0059
Epoch [7/50], Step [5/19], Loss: 0.0009
Epoch [7/50], Step [10/19], Loss: 0.0097
Epoch [7/50], Step [15/19], Loss: 0.0041
Epoch [8/50], Step [5/19], Loss: 0.0007
Epoch [8/50], Step [10/19], Loss: 0.0071
Epoch [8/50], Step [15/19], Loss: 0.0030
Epoch [9/50], Step [5/19

In [5]:
import preprocessingFunctions as pf

In [6]:
def predict_model(model, vec, raw_text):
    '''
    Return output classification probability given a one neuron output model,
    a vectorizer, and the raw text to classify.

    Parameters:
        model:    trained torch module with a single output neuron
        vec:      vectorizer handed to pf.getTermMatrixTestData together with the text
        raw_text: the raw text to classify

    Returns:
        the model output tensor for the text; it carries no gradient history.

    The model is evaluated in eval mode without gradient tracking, and its
    previous train/eval mode is restored before returning.
    '''
    # presumably a sparse document-term matrix for raw_text under vec's vocabulary
    # (it must at least offer .todense()) -- TODO confirm against preprocessingFunctions
    text = pf.getTermMatrixTestData(raw_text, vec).todense()

    X_test_tensor = torch.from_numpy(text).float()

    was_training = model.training
    model.eval()
    try:
        # inference only: skip building the autograd graph
        with torch.no_grad():
            output_prob = model(X_test_tensor)
    finally:
        model.train(was_training)

    return output_prob

In [9]:
# Load the data used for the probability ranking below. X_test_text is indexed
# item by item and each item is passed to predict_model as raw text; Y_test holds
# the matching labels. vectorizer_test is not used by the cells shown here
# (predictions go through vectorizer_train instead).
X_test_text, Y_test, vectorizer_test = getCoronaVocabulary2()

In [10]:
# Map each text to (actual label, predicted probability).
# Texts are the dict keys, so a repeated text keeps only its last entry.
prob_dict = {}
for idx in range(len(X_test_text)):
    raw_text = X_test_text[idx]
    prediction = predict_model(model, vectorizer_train, raw_text)
    prob_dict[raw_text] = (Y_test[idx], prediction[0].item())

In [11]:
# Rank the (text, (label, probability)) pairs by ascending predicted probability.
prob_dict_rank_prob = sorted(prob_dict.items(), key=(lambda x: x[1][1]))
# Each pair mixes a string with a (label, probability) tuple, i.e. a ragged nested
# sequence. Recent NumPy versions refuse to build an array from that unless
# dtype=object is given explicitly; with it, every row is [text, (label, probability)].
prob_dict_rank_prob = np.array(prob_dict_rank_prob, dtype=object)

In [23]:
# Gather every predicted probability and show the smallest one.
l = [entry[1][1] for entry in prob_dict_rank_prob]
print(min(l))

2.9289465297566153e-10


### 10 Lowest Output Probabilities

In [17]:
# The ranking is ascending, so the first ten rows are the lowest probabilities.
for text, (label, prob) in prob_dict_rank_prob[:10]:
    print(text)
    print('ACTUAL LABEL:', label)
    print('OUTPUT PROBABILITY:', prob)
    # two blank lines between entries
    print('')
    print('')

The breakdown of the global liberal world order and its foundations. What is happening now is a global breakdown of the world order. It does not matter at all whether the nature of the coronavirus is artificial or not, nor is it even of principal importance whether, if it is artificial, it was deliberately released by the “world government” or not. The epidemic has begun - it is a fact. Now the main thing is to trace how the "world government" has reacted to it.To clarify, the "world government" is the totality of global political and economic elites and the intellectuals and media (mediacrats) that serve them. Such a "world government" necessarily exists, because on a global scale there are strictly-defined, fundamental norms that determine the basic parameters of politics, economics and ideology.In the economy, the only recognized norm is capitalism, the market economy (which is disputed only by North Korea - not, and this is very important, by China, which presents its own version o

### 10 Highest Output Probabilities

In [18]:
# The ranking is ascending, so the last ten rows are the highest probabilities.
for text, (label, prob) in prob_dict_rank_prob[-10:]:
    print(text)
    print('ACTUAL LABEL:', label)
    print('OUTPUT PROBABILITY:', prob)
    # two blank lines between entries
    print('')
    print('')

We still don't fully understand how the new coronavirus spreads, but we're learning more every day.The new coronavirus has upended all of our usual calculus about seemingly ordinary activities. Is running past someone on the street safe? How about shopping in a grocery store with a 6-foot (2 meters) distance? And what about packages and takeout? And which of these activities poses the biggest risk?Unfortunately, there's a lot we still don't know about the way the virus that causes COVID-19 spreads."At this point, I don't think anyone can take a group of people with COVID, say how each person has become infected, and then say that xx% got infected with droplets and yy% got infected via touching surfaces," Dr. Jeffrey N. Martin, a professor in the Department of Epidemiology and Biostatistics at the University of California, San Francisco, told Live Science in an email. "I don't think this kind of study has ever been done for any infection. In most individual persons, we do not know how t

### 10 Lowest Output Probabilities Among Misclassified Examples (actual label 1, predicted below 0.5)

In [20]:
counter = 0

# Walk the ranking from the lowest probability upward and print the first ten
# texts whose actual label is 1 although the probability is below 0.5.
for text, (label, prob) in prob_dict_rank_prob:
    if not ((label == 1) and (prob < 0.5)):
        continue

    counter += 1
    print(text)
    print('ACTUAL LABEL:', label)
    print('OUTPUT PROBABILITY:', prob)
    print('')
    print('')

    if counter >= 10:
        break

Viral online posts claiming 5G is causing coronavirus are absolutely wrong. Conspiracy theorists are taking them seriously, however, and some are turning violent. Here's why their arguments are nonsense.The false, superstitious belief that 5G cellular networks are somehow causing a global health crisis has found a new conspiracy theory: the idea that the global coronavirus pandemic is caused by 5G. It is not.Since I originally wrote this, 5G conspiracy theories have turned violent. Anti-5G conspiracy theories have fueled 5G tower arson attacks. According to The Guardian, they're driven largely by viral Facebook posts, often from groups mixing in anti-Semitic slurs and conspiracy theories about 9/11. The New York Times suggests a Russian-backed propaganda campaign is in part to blame.
A petition on Change.org claiming that "60 megahertz waves" would "suck the oxygen out of our lungs" (it won't) got more than 114000 signatures before it was deleted. In the US, the conspiracy theories wer

### 10 Highest Output Probabilities Among Misclassified Examples (actual label 0, predicted at or above 0.5)

In [21]:
counter = 0

# Walk the ranking from the highest probability downward and print the first ten
# texts whose actual label is 0 although the probability is at least 0.5.
for text, (label, prob) in prob_dict_rank_prob[::-1]:
    if not ((label == 0) and (prob >= 0.5)):
        continue

    counter += 1
    print(text)
    print('ACTUAL LABEL:', label)
    print('OUTPUT PROBABILITY:', prob)
    print('')
    print('')

    if counter >= 10:
        break

The coronavirus is a very common kind of virus. It is causing an infection in the sinuses, nose or upper throat. It has first identified in the 1960s, but doctors do not know where they come from. This virus gets the name from the crown – like shape. In some cases, coronavirus can infect both animals and humans. Most types of coronaviruses spread the same way as other cold – causing viruses, through infected people sneezing and coughing; by touching the face or hands of the infected person; by touching things, such as doorknobs that infected people have touched.It is noticed that almost everyone gets a coronavirus infection at least once in their life, most likely as a child. It is noticed that the coronavirus is most common in fall and winter, but every single person can get it at any time. The most common symptoms of coronavirus include sore throat, coughing, runny nose and sometimes a fever. There are some cases when people do not know if they suffer from coronavirus, because the sy

### 10 Lowest Output Probabilities at or Above 0.5 (closest to the decision threshold from above)

In [22]:
counter = 0

# The ranking is ascending, so the first ten rows with a probability of at least
# 0.5 are the ones sitting closest to the threshold from above.
for text, (label, prob) in prob_dict_rank_prob:
    if not (prob >= 0.5):
        continue

    counter += 1
    print(text)
    print('ACTUAL LABEL:', label)
    print('OUTPUT PROBABILITY:', prob)
    print('')
    print('')

    if counter >= 10:
        break

Unlike SARS and swine flu, the novel coronavirus is both highly contagious and especially deadly, CNN Chief Medical Correspondent Dr. Sanjay Gupta said.
“SARS was also a coronavirus, and it was a new virus at the time,” Gupta said. “In the end, we know that SARS ended up infecting 8000 people around the world and causing around 800 deaths. So very high fatality rate, but it didn’t turn out to be very contagious.”The swine flu, or H1N1, “was very contagious and infected some 60 million people in the United States alone within a year,” Gupta said. “But it was far less lethal than the flu even — like 1/3 as lethal as the flu.”What makes the novel coronavirus different is that “this is both very contagious … and it appears to be far more lethal than the flu as well,” Gupta said. “So both those things, in combination I think, are why we’re taking this so seriously.”What’s so different about coronavirus that we have to shut down businesses? Why practice social distancing now, when we didn’t 