In [1]:
import pandas as pd
import numpy as np

%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import Image

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from string import punctuation

from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
from sklearn.feature_extraction import text
from tqdm import tqdm, tqdm_notebook

import json

In [2]:
# Labelled tweets; the "Unnamed: 0" column (presumably an index written out by
# an earlier to_csv -- confirm) is discarded right after loading.
df = (
    pd.read_csv('../data/opioid_tweets_label.csv')
    .drop(columns=["Unnamed: 0"])
)
# Plain-text file read without a header row.
bad_tweets = pd.read_csv('../bad_tweets.txt', header=None)

In [3]:
# Show the loaded frame as the cell output (pandas elides the middle rows).
df

Unnamed: 0,id,tweet_id,content,created_at,fav_count,url_present,user_name,followers_count,friends_count,user_description,label
0,1,1.182370e+18,"Boston Police, Public Health Officials To Trea...",10/10/19,662.0,True,SaraCarterDC,836793.0,4656.0,"@FoxNews Contributor, award winning National S...",0
1,2,1.182120e+18,Three #Chinese nationals were charged last wee...,10/10/19,302.0,True,EpochTimes,134594.0,102.0,"An independent, award-winning voice in print &...",0
2,3,1.182360e+18,Three #Chinese nationals were charged with imp...,10/10/19,164.0,True,EpochTimes,134594.0,102.0,"An independent, award-winning voice in print &...",0
3,4,1.182720e+18,Boston is using a chemical warfare device to h...,10/11/19,0.0,True,BUSPH,27642.0,2202.0,The official Twitter of Boston University Scho...,0
4,5,1.182720e+18,This makes no sense given what President Trump...,10/11/19,0.0,True,FlagHiApp,1893.0,4547.0,"FlagHi™ calculates how temperature, elevation ...",0
...,...,...,...,...,...,...,...,...,...,...,...
42933,588727,1.187910e+18,Thanks....haven't got Motrin PM..trying Naprox...,10/26/19,2.0,True,CarolMc29382003,3729.0,4996.0,#Trump2020#MAGA#KAGA#NRA.No DM NO Dating.. don...,0
42934,588728,1.187910e+18,@thistallawkgirl One year my husband dressed u...,10/26/19,6.0,True,beatalley,1738.0,4035.0,Beat Alley-Denver's Music Webzine - Vintage M...,0
42935,588729,1.187910e+18,I fractured my growth plate when I was 12 and ...,10/26/19,1.0,True,depressedloc,349.0,341.0,Scientist and minor Prophet #FreeSanchez #Free...,0
42936,588730,1.187910e+18,@_Daks_ Vicodin messed me the fuck up. Like fo...,10/26/19,0.0,False,Road_Block,910.0,365.0,"Gamer, podcaster, JMM on DungeonDrunks! RT Si...",0


In [4]:
# Stopword list = NLTK English stopwords
#               + the same words with their apostrophes removed ("don't" -> "dont")
#               + the opioid drug names listed below.
stops = stopwords.words('english')
no_quotes = [word.replace("'", "") for word in stops if "'" in word]
my_stop_words = [
    "codeine",
    "hydrocodone",
    "morphine",
    "oxycodone",
    "hydromorphone",
    "fentanyl",
    "oxycontin",
    "vicodin",
    "percocet",
]
stops += no_quotes
stops += my_stop_words
def clean_string(string):
    """Normalise a raw tweet into lowercase, letters-only, space-separated text.

    Steps, in order:
      1. strip HTML entities such as ``&amp;``
      2. strip ``@mentions``
      3. strip URLs (http/https/ftp) up to the next whitespace
      4. lowercase
      5. squeeze any character repeated 3+ times down to two ("soooo" -> "soo")
      6. replace every non-letter with a space (hashtag words are kept, only
         the ``#`` itself disappears here)
      7. drop words that are one or two characters long
      8. collapse runs of whitespace into a single space

    Parameters
    ----------
    string : str
        Raw tweet text.

    Returns
    -------
    str
        Cleaned text. A single leading or trailing space can remain because
        only runs of two or more whitespace characters are collapsed.
    """
    # remove HTML entities
    temp = re.sub(r'\&\w*;', '', string)
    # remove @user
    temp = re.sub(r'@(\w+)', '', temp)
    # remove links. Match up to the next whitespace: the previous character
    # class stopped at '-', '_', '?', '=', '%', ... and left the tail of the
    # URL behind as bogus words.
    temp = re.sub(r'(?:http|https|ftp)://\S+', '', temp)
    # lowercase
    temp = temp.lower()
    # squeeze repeated characters down to at most two in a row
    temp = re.sub(r'(.)\1{1,}', r'\1\1', temp)
    # remove non-letters
    temp = re.sub("[^a-zA-Z]", " ", temp)
    # remove words shorter than three characters
    temp = re.sub(r'\b\w{1,2}\b', '', temp)
    # remove multiple spaces
    temp = re.sub(r'\s\s+', ' ', temp)
    return temp

def str_preprocess(string):
    """Strip punctuation and stopwords from ``string`` and Porter-stem the rest.

    Parameters
    ----------
    string : str
        Text to process (typically the output of ``clean_string``).

    Returns
    -------
    str
        The surviving tokens, lowercased and stemmed, joined by single spaces.
    """
    stemmer = PorterStemmer()
    # removing punctuation
    removed_punc = ''.join(char for char in string if char not in punctuation)
    # Lowercase BEFORE the stopword lookup so capitalised stopwords ("The")
    # are filtered too; previously only the stemmed token was lowercased.
    # split() with no argument also skips the empty tokens that split(' ')
    # produced for leading, trailing or doubled spaces.
    tokens = (word.lower() for word in removed_punc.split())
    # removing stopwords, stemming what is left
    cleaned = [stemmer.stem(word) for word in tokens if word not in stops]
    return ' '.join(cleaned)

In [5]:
# Turn the tweet text into a TF-IDF matrix: clean and stem every tweet, then
# vectorise with a vocabulary capped at 20,000 terms (max_features).
docs = df.content.astype(str)
cleaned_frame = docs.apply(clean_string).apply(str_preprocess)
# NOTE(review): the tokens are already stemmed here, so the unstemmed drug
# names in my_stop_words may no longer match anything -- str_preprocess has
# already removed them via `stops`. Confirm this argument is still needed.
td_idf_vec = TfidfVectorizer(stop_words=my_stop_words, max_features = 20000)
X = td_idf_vec.fit_transform(cleaned_frame)
# NOTE(review): TfidfVectorizer L2-normalises rows by default, so this extra
# normalize() is presumably a no-op -- confirm before removing.
X_norm = normalize(X)
# Densify the matrix; with one column per vocabulary term this array is large.
X_arr = X_norm.toarray()

In [31]:
# Everything that is left after this drop (the engagement counts and the
# label) is carried over next to the TF-IDF features.
df_x = df.drop(
    columns=[
        "id",
        "tweet_id",
        "created_at",
        "user_name",
        "user_description",
        "url_present",
        "content",
    ]
)
# Column-wise concat: TF-IDF columns (named 0..N-1) first, df_x columns last;
# any row containing a missing value is discarded.
df_final = pd.concat(
    [pd.DataFrame(X_arr), df_x],
    axis="columns",
).dropna(axis="index")

In [32]:
# Row-wise L2 normalisation of the FEATURES only. The label must not take part:
# normalising it together with the features rescales every positive label to
# 1/||row|| and the dataset class later reads that last column as the target.
# NOTE(review): the raw count columns are orders of magnitude larger than the
# TF-IDF weights, so after row normalisation they dominate each row -- consider
# scaling those columns separately.
_features = df_final.drop(columns="label")
df_final_norm = pd.DataFrame(normalize(_features.values), columns=_features.columns)
# Re-attach the untouched label as the last column (positional assignment, the
# two frames have the same row order but different indexes).
df_final_norm["label"] = df_final["label"].values

In [34]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test = train_test_split(
    df_final_norm,
    test_size=0.20,
    random_state=42,
)

In [35]:
# Show the training split as the cell output (pandas elides rows and columns).
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19994,19995,19996,19997,19998,19999,fav_count,followers_count,friends_count,label
952,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.269259,0.963068,0.0
39918,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.004724,0.410949,0.911646,0.0
35082,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.408869,0.912593,0.0
19271,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.579257,0.815140,0.0
12272,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.020686,0.715151,0.698664,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.005480,0.509637,0.860355,0.0
11284,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.002219,0.774548,0.632510,0.0
38158,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.001342,0.231469,0.972841,0.0
860,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000437,0.442661,0.896689,0.0


In [39]:
import torch
import torchvision
from torch import nn
from torch import optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from torchvision.utils import save_image
from torchvision.datasets import MNIST
import os

class OpioidTwitterData(Dataset):
    """Torch dataset over a DataFrame whose last column is the target.

    The frame is converted once, at construction, into a single float32
    tensor stored in ``self.data``; items are sliced out of that tensor.
    """

    def __init__(self, data):
        # DataFrame -> float ndarray -> float32 tensor
        as_float = data.values.astype('float')
        self.data = torch.FloatTensor(as_float)

    def __len__(self):
        # Number of rows
        return self.data.size(0)

    def __getitem__(self, index):
        # Returns (all columns but the last, last column) for the given row
        row = self.data[index]
        return row[:-1], row[-1]
    
# Mini-batch size for training. It was referenced below without ever being
# defined, so this cell raised NameError on a fresh kernel.
# NOTE(review): 128 is a placeholder default -- adjust to the value originally used.
batch_size = 128

train_dataset = OpioidTwitterData(X_train)
valid_dataset = OpioidTwitterData(X_test)
# Only the training split gets a (shuffled) loader here.
dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [45]:
class VAE(nn.Module):
    """Fully connected variational auto-encoder.

    Encoder: input -> hidden (ReLU) -> (mu, logvar), two parallel linear heads.
    Decoder: latent -> hidden (ReLU) -> input-sized output squashed by a sigmoid.

    Parameters
    ----------
    input_dim : int, default 20003
        Width of one input row.
    hidden_dim : int, default 400
        Width of the single hidden layer in both encoder and decoder.
    latent_dim : int, default 20
        Size of the latent code.

    The defaults reproduce the previously hard-coded layer sizes, so ``VAE()``
    builds the same network as before.
    """

    def __init__(self, input_dim=20003, hidden_dim=400, latent_dim=20):
        super(VAE, self).__init__()

        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc21 = nn.Linear(hidden_dim, latent_dim)  # mean head
        self.fc22 = nn.Linear(hidden_dim, latent_dim)  # log-variance head
        self.fc3 = nn.Linear(latent_dim, hidden_dim)
        self.fc4 = nn.Linear(hidden_dim, input_dim)

    def encode(self, x):
        """Return ``(mu, logvar)`` of the approximate posterior for batch ``x``."""
        h1 = F.relu(self.fc1(x))
        return self.fc21(h1), self.fc22(h1)

    def reparametrize(self, mu, logvar):
        """Sample ``z = mu + eps * std`` with ``eps ~ N(0, I)``.

        The noise is created with ``randn_like`` so it always lives on the same
        device and has the same dtype as ``std``. Choosing the device from
        ``torch.cuda.is_available()`` broke when CUDA was present but the
        model had been kept on the CPU.
        """
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return eps * std + mu

    def decode(self, z):
        """Map latent codes ``z`` back to input space, values in (0, 1)."""
        h3 = F.relu(self.fc3(z))
        # torch.sigmoid replaces the deprecated F.sigmoid
        return torch.sigmoid(self.fc4(h3))

    def forward(self, x):
        """Return ``(reconstruction, mu, logvar)`` for batch ``x``."""
        mu, logvar = self.encode(x)
        z = self.reparametrize(mu, logvar)
        return self.decode(z), mu, logvar


model = VAE()
# Move the network to the GPU when one is present.
if torch.cuda.is_available():
    model.cuda()

# Summed (not averaged) squared error over the whole batch.
# reduction='sum' is the supported spelling of the deprecated size_average=False.
reconstruction_function = nn.MSELoss(reduction='sum')


def loss_function(recon_x, x, mu, logvar):
    """VAE objective: reconstruction error plus KL divergence.

    recon_x: reconstruction produced by the decoder
    x: the original input batch
    mu: latent mean
    logvar: latent log variance

    Returns the scalar ``reconstruction_function(recon_x, x) + KLD``.
    """
    # Reconstruction term, delegated to the module-level criterion.
    recon_loss = reconstruction_function(recon_x, x)
    # KLD = -0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
    kld_terms = -(mu.pow(2) + logvar.exp()) + 1 + logvar
    kld = kld_terms.sum() * -0.5
    return recon_loss + kld

num_epochs = 5
optimizer = optim.Adam(model.parameters(), lr=1e-3)
# Batches are sent to wherever the model's parameters live, so the loop works
# whether or not the model was moved to the GPU.
device = next(model.parameters()).device

for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for batch_idx, data in enumerate(dataloader):
        # The target returned by the dataset is not used by the VAE.
        batch_x, _ = data
        batch_x = batch_x.view(batch_x.size(0), -1).to(device)
        optimizer.zero_grad()
        recon_batch, mu, logvar = model(batch_x)
        loss = loss_function(recon_batch, batch_x, mu, logvar)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
        if batch_idx % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch,
                batch_idx * len(batch_x),
                len(dataloader.dataset), 100. * batch_idx / len(dataloader),
                loss.item() / len(batch_x)))

    print('====> Epoch: {} Average loss: {:.4f}'.format(
        epoch, train_loss / len(dataloader.dataset)))
    # The per-epoch image dump (to_img / save_image) was removed: to_img is not
    # defined in this notebook, so the loop crashed with NameError at the end of
    # epoch 0, and the reconstructions are feature rows rather than images.

# Persist the trained weights.
torch.save(model.state_dict(), './vae.pth')



KeyboardInterrupt: 