In [11]:
import tweepy
import configparser
import pandas as pd
import numpy as np
import re
import pickle
import nltk
import string
import collections
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder

In [3]:
train_label_file = open('project-data/train.label.txt', 'r')
train_labels = train_label_file.readlines()
train_labels = [label.strip('\n') for label in train_labels]
    
dev_label_file = open('project-data/dev.label.txt', 'r')
dev_labels = dev_label_file.readlines()
dev_labels = [label.strip('\n') for label in dev_labels]

In [4]:
# open train text file
f = open(f'./tweet_text.pckl','rb')
train_data = pickle.load(f)
f.close()


# open dev text file
f = open(f'./dev_tweet_text.pckl','rb')
dev_data = pickle.load(f)
f.close()

In [5]:
def clean_text(text):
    text = re.sub(r'@[A-Za-z0-9]+','',text) #remove @mention
    text = re.sub(r'#','',text) # remove the hashtag symbol
    text = re.sub(r'https?:\/\/\S+', '',text) #remove hyperlink
    text = re.sub(r'\n','',text) # remove \n 
    text = re.sub(r'\r','',text) # remove \r
    text = re.sub(r'\\W+', ' ', text) #remove special characters\n",
    return text

for i in range(len(train_data)):
    for j in range(len(train_data[i])):
        train_data[i][j] = clean_text(train_data[i][j]).lower()
        
for i in range(len(dev_data)):
    for j in range(len(dev_data[i])):
        dev_data[i][j] = clean_text(dev_data[i][j]).lower()

In [6]:
# merge source tweeet and reply tweet together for train data
train_merge_events=[]
for event in train_data:
    merge = ''
    for tweet in event:
        merge = merge + tweet
    train_merge_events.append(merge)
    
    
# merge source tweeet and reply tweet together for dev data
dev_merge_events=[]
for event in dev_data:
    merge = ''
    for tweet in event:
        merge = merge + tweet
    dev_merge_events.append(merge)

In [66]:
train_merge_events[0]

'5. can regularly rinsing your nose with saline help prevent infection with the new coronavirus? 4. can eating garlic help prevent infection with the new coronavirus? covid19malaysia 6. do vaccines against pneumonia protect you against the new coronavirus? 7. can spraying alcohol or chlorine all over your body kill the new coronavirus? chamber 8. how effective are thermal scanners in detecting people infected with the new coronavirus? 9. can an ultraviolet disinfection lamp kill the new coronavirus? 10. are hand dryers effective in killing the new coronavirus? 11. the new coronavirus cannot be transmitted through mosquito bites. 12. taking a hot bath does not prevent the new coronavirus disease 13. cold weather and snow cannot kill the new coronavirus. 14. covid-19 virus can be transmitted in areas with hot and humid climates 15. drinking alcohol does not protect you against covid-19 and can be dangerous 16. being able to hold your breath for 10 seconds or more without coughing or feel

In [15]:
train_df = pd.DataFrame({'text':train_merge_events, 'label':train_labels})
train_df['label'] = LabelEncoder().fit_transform(train_df['label'])
nan_value = float("NaN")
train_df.replace("", nan_value, inplace=True)
train_df.dropna(axis=0 ,inplace=True)
train_df

Unnamed: 0,text,label
0,5. can regularly rinsing your nose with saline...,0
1,french police chief killed himself after charl...,1
2,coronavirus disease (covid-19) advice for the ...,0
3,ottawa police confirm that there were multiple...,0
4,if the primary focus of a government isn't to ...,0
...,...,...
1890,desperate ted cruz claims planned parenthood s...,1
1891,"""thoughts and prayers are not enough."" pres. o...",1
1892,police have surrounded this building where the...,0
1893,"joseph smith, who translated it by the gift...",0


In [29]:
dev_df = pd.DataFrame({'text':dev_merge_events, 'label':dev_labels})
dev_df['label'] = LabelEncoder().fit_transform(dev_df['label'])
nan_value = float("NaN")
dev_df.replace("", nan_value, inplace=True)
dev_df.dropna(axis=0 ,inplace=True)
dev_df

Unnamed: 0,text,label
0,covid-19 fact:are hand dryers effective in kil...,0
1,when can we expect the result of my husband's...,0
2,how does covid-19 spread? people can catch cov...,0
3,"every news outlet using headlines like,""are an...",0
4,researcher on his encounter with a goliath bi...,0
...,...,...
627,"or cure for covid-19. however, there are sever...",0
628,"after speculation that he’s been arrested, ban...",1
629,*your questions answered*❓*reply with the numb...,0
630,"►anonymous operation kkk ►ku klux klan, we nev...",1


In [65]:
train_df['text'][0]

'5. can regularly rinsing your nose with saline help prevent infection with the new coronavirus? 4. can eating garlic help prevent infection with the new coronavirus? covid19malaysia 6. do vaccines against pneumonia protect you against the new coronavirus? 7. can spraying alcohol or chlorine all over your body kill the new coronavirus? chamber 8. how effective are thermal scanners in detecting people infected with the new coronavirus? 9. can an ultraviolet disinfection lamp kill the new coronavirus? 10. are hand dryers effective in killing the new coronavirus? 11. the new coronavirus cannot be transmitted through mosquito bites. 12. taking a hot bath does not prevent the new coronavirus disease 13. cold weather and snow cannot kill the new coronavirus. 14. covid-19 virus can be transmitted in areas with hot and humid climates 15. drinking alcohol does not protect you against covid-19 and can be dangerous 16. being able to hold your breath for 10 seconds or more without coughing or feel

In [81]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [83]:
y_train = [1 if x== 'nonrumour' else 0 for x in train_labels]
y_dev = [1 if x== 'nonrumour' else 0 for x in dev_labels]

In [96]:
tokenizer = Tokenizer(oov_token="<UNK>")
tokenizer.fit_on_texts(train_df['text'].to_numpy())
x_train = tokenizer.texts_to_sequences(train_df['text'].to_numpy())
x_dev = tokenizer.texts_to_sequences(dev_df['text'].to_numpy())
y_train = train_df['label'].to_numpy()
y_dev = dev_df['label'].to_numpy()

In [98]:
x_train_bow = tokenizer.texts_to_matrix(train_df['text'].to_numpy(), mode="count")

In [101]:
vocab_size = x_train_bow.shape[1]

In [102]:
vocab_size

26116

In [46]:
max_train = 0
for each in x_train:
    l = len(each)
    if l > max_train:
        max_train = l

max_dev = 0
for each in x_dev:
    l = len(each)
    if l > max_dev:
        max_dev = l

if max_train > max_dev:
    maxlen = max_train
else:
    maxlen = max_dev

In [83]:
xseq_train = pad_sequences(x_train, padding='post', maxlen=512)
xseq_dev = pad_sequences(x_dev, padding='post', maxlen=512)

In [84]:
from keras.layers import LSTM
from keras.models import Sequential
from keras import layers

In [86]:
vocab_size = x_train.shape[1]
embedding_dim = 10
vocab_size

512

In [103]:
model3 = Sequential(name="lstm")
model3.add(layers.Embedding(input_dim=vocab_size, 
                           output_dim=embedding_dim, 
                           input_length=512))
model3.add(LSTM(10))
model3.add(layers.Dense(1, activation='sigmoid'))
model3.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model3.summary()

Model: "lstm"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 512, 10)           261160    
_________________________________________________________________
lstm_5 (LSTM)                (None, 10)                840       
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 11        
Total params: 262,011
Trainable params: 262,011
Non-trainable params: 0
_________________________________________________________________


In [104]:
model3.fit(xseq_train, y_train, epochs=20, verbose=True, validation_data=(xseq_dev, y_dev), batch_size=10)

loss, accuracy = model3.evaluate(xseq_dev, y_dev, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1807 samples, validate on 595 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Testing Accuracy:  0.7731


In [None]:
## extract embedding to see the similiarity between tweets???

In [106]:
f = open(f'./test_tweet_text.pckl','rb')
test_data = pickle.load(f)
f.close()

In [107]:
for i in range(len(test_data)):
    for j in range(len(test_data[i])):
        test_data[i][j] = clean_text(test_data[i][j]).lower()

In [108]:
test_merge_events=[]
for event in test_data:
    merge = ''
    for tweet in event:
        merge = merge + tweet
    test_merge_events.append(merge)

In [109]:
test_df = pd.DataFrame({'text':test_merge_events})

In [110]:
nan_value = float("NaN")

test_df.replace("", nan_value, inplace=True)

test_df.dropna(axis=0 ,inplace=True)

In [111]:
test_df

Unnamed: 0,text
0,"how does covid-19 spread? thanks, wcco! you ..."
1,"_warrior i hate to keep saying it, but capital..."
2,q. how are covid-19 and influenza viruses diff...
3,una de les q&amp;a on coronaviruses de la pàgi...
4,@_truthpolitics we should absolutely blame the...
...,...
553,ex-marlboro man dies from smoking-related dise...
554,holy shit. doritos flavored mountain dew.all i...
555,banksy account joins cartoonists support for c...
556,_europe q: how are the members of an int...


In [151]:
model3.evaluate(xseq_dev, y_dev, verbose=False)

[0.47003184426732425, 0.7731092572212219]

In [153]:
len(y_dev)

595

In [114]:
x_test = tokenizer.texts_to_sequences(test_df['text'].to_numpy())

In [115]:
xseq_test = pad_sequences(x_test, padding='post', maxlen=512)

In [159]:
prediction = model3.predict(xseq_test)

In [160]:
predict= prediction.argmax(axis=-1)

In [161]:
predict

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [162]:
df = pd.DataFrame({"Id": range(len(predict)),"Predicted": predict}) 

In [164]:
df.to_csv('lstm_predict.csv',index=False)