In [None]:
import tweepy
import configparser
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import pickle
import nltk
import string
import collections
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction import DictVectorizer

In [None]:
# Read the event and label files for the train and dev splits.
def _read_events(path):
    """Return one list of integer tweet IDs per line of the comma-separated file at `path`."""
    # the context manager closes the file even if a line fails to parse
    with open(path, 'r') as handle:
        return [list(map(int, line.strip('\n').split(','))) for line in handle]


def _read_labels(path):
    """Return every line of the file at `path` with its newline characters stripped."""
    with open(path, 'r') as handle:
        return [line.strip('\n') for line in handle]


train_events = _read_events('project-data/train.data.txt')
train_labels = _read_labels('project-data/train.label.txt')

dev_events = _read_events('project-data/dev.data.txt')
dev_labels = _read_labels('project-data/dev.label.txt')

In [None]:
# Load the credentials used to access the Twitter API from config.ini
config = configparser.ConfigParser()
config.read('config.ini')

# every credential lives in the [twitter] section
twitter_section = config['twitter']

consumer_key = twitter_section['consumer_key']
consumer_secret = twitter_section['consumer_secret']
access_token = twitter_section['access_token']
access_token_secret = twitter_section['access_token_secret']

In [None]:
# Authenticate: build the tweepy client from the credentials read from the config
credentials = {
    'consumer_key': consumer_key,
    'consumer_secret': consumer_secret,
    'access_token': access_token,
    'access_token_secret': access_token_secret,
}
client = tweepy.Client(**credentials, wait_on_rate_limit=True)

In [None]:
# get_tweets only returns up to 100 results per call, so the IDs are looked up in batches
def lookup_tweets(tweet_IDs, client, batch_size=100):
    """Fetch the tweets for `tweet_IDs`, querying `client` one batch at a time.

    Args:
        tweet_IDs: sequence of tweet IDs; it must support `len` and slicing.
        client: object exposing `get_tweets(ids, user_auth=True)` whose result
            has a `.data` attribute (e.g. a `tweepy.Client`).
        batch_size: maximum number of IDs sent per request (default 100).

    Returns:
        A list with the items of every response's `.data`, in request order.
        Batches whose `.data` is empty or None contribute nothing.

    Raises:
        ValueError: if `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    full_tweets = []
    # stepping by batch_size never produces an empty trailing batch
    for start in range(0, len(tweet_IDs), batch_size):
        batch = tweet_IDs[start:start + batch_size]
        tweets = client.get_tweets(batch, user_auth=True).data
        if tweets:
            full_tweets.extend(tweets)
    return full_tweets
    

In [None]:
# Fetch the tweet texts of every train event (one list of texts per event)
train_events_text = [
    [tweet.text for tweet in lookup_tweets(event, client)]
    for event in train_events
]

In [None]:
# Save the fetched train tweet texts to a pickle file.
# The context manager closes the file even if pickle.dump raises.
with open('./tweet_text.pckl', 'wb') as f:
    pickle.dump(train_events_text, f)

In [None]:
# Fetch the tweet texts of every dev event (one list of texts per event)
dev_events_text = [
    [tweet.text for tweet in lookup_tweets(event, client)]
    for event in dev_events
]

In [None]:
# Save the fetched dev tweet texts to a pickle file.
# The context manager closes the file even if pickle.dump raises.
with open('./dev_tweet_text.pckl', 'wb') as f:
    pickle.dump(dev_events_text, f)

In [None]:
# Load the tweet texts that were pickled earlier.
# NOTE: pickle.load can execute arbitrary code - only load files produced by
# this notebook, never pickle files from an untrusted source.

# open train text file
with open('./tweet_text.pckl', 'rb') as f:
    train_data = pickle.load(f)

# open dev text file
with open('./dev_tweet_text.pckl', 'rb') as f:
    dev_data = pickle.load(f)

In [None]:
def clean_text(text):
    """Strip Twitter-specific noise from the text of a tweet.

    Removes @mentions, '#' characters and http(s) links, and replaces every
    newline with a space so the words around it are not glued together.

    Args:
        text: the raw tweet text.

    Returns:
        The cleaned text.
    """
    # the underscore is included so a handle such as @john_doe is removed
    # entirely instead of leaving "_doe" behind
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    text = re.sub(r'#', '', text)  # keep the hashtag word, drop only the symbol
    text = re.sub(r'https?:\/\/\S+', '', text)  # remove hyperlinks
    text = re.sub(r'\n', ' ', text)  # newline -> space keeps adjacent words apart
    return text

# Clean every tweet in place, keeping the nested event -> tweets structure
for dataset in (train_data, dev_data):
    for event in dataset:
        for position, tweet in enumerate(event):
            event[position] = clean_text(tweet)

In [None]:
# Merge the source tweet and the reply tweets of every event into one string.
# The tweets are joined with a space so the last word of one tweet and the
# first word of the next one remain separate tokens.
train_merge_events = [' '.join(event) for event in train_data]

# same merge for the dev data
dev_merge_events = [' '.join(event) for event in dev_data]

In [None]:
def tokenize_tweet(tweet):
    """Split `tweet` into lower-cased tokens.

    English stop words, single punctuation characters and tokens that do not
    contain any ASCII letter are dropped.
    """
    tokenizer = nltk.tokenize.TweetTokenizer()
    # everything in this collection is discarded
    excluded = set(nltk.corpus.stopwords.words('english'))
    excluded.update(string.punctuation)

    kept = []
    for raw in tokenizer.tokenize(tweet):
        lowered = raw.lower()
        if lowered in excluded:
            continue
        # skip tokens without a letter (numbers, emoticons, ...)
        if re.search('[a-zA-Z]', lowered) is None:
            continue
        kept.append(lowered)
    return kept

In [None]:
def tokenize_tweetv2(tweet):
    """Tokenize `tweet`, drop English stop words and single punctuation
    characters, and return the Porter stem of every remaining token."""
    raw_tokens = nltk.tokenize.TweetTokenizer().tokenize(tweet)
    # stop words and punctuation are compared against the lower-cased token
    excluded = set(nltk.corpus.stopwords.words('english')) | set(string.punctuation)
    porter = nltk.stem.porter.PorterStemmer()
    return [porter.stem(raw) for raw in raw_tokens if raw.lower() not in excluded]

In [None]:
# Build bag-of-words features
def bow(data, labels):
    """Count the token occurrences of every text in `data`.

    Returns a pair `(x, y)`: `x` holds one token -> count mapping per text
    (tokens come from `tokenize_tweet`) and `y` holds `labels[i]` for the
    text at index `i`.
    """
    x, y = [], []
    for index, text in enumerate(data):
        counts = collections.defaultdict(int)
        for token in tokenize_tweet(text):
            counts[token] += 1
        x.append(counts)
        y.append(labels[index])
    return x, y
    

In [None]:
# Bag-of-words counts and labels for the merged train and dev events
x_train, y_train = bow(data=train_merge_events, labels=train_labels)
x_dev, y_dev = bow(data=dev_merge_events, labels=dev_labels)

In [None]:
# Learn the feature mapping from the train counts only, then apply it to both splits.
# NOTE: x_train / x_dev are overwritten here, so re-running this cell without
# re-running the bow cell feeds already-transformed data back into the vectorizer.
vectorizer = DictVectorizer().fit(x_train)
x_train = vectorizer.transform(x_train)
x_dev = vectorizer.transform(x_dev)

In [None]:
from keras.layers import LSTM
from keras.models import Sequential
from keras import layers

In [None]:
# Embedding -> LSTM(10) -> single sigmoid unit, trained with binary cross-entropy.
# NOTE(review): vocab_size, embedding_dim and maxlen are not defined in the
# cells visible here - confirm they are set before this cell runs.
model3 = Sequential(
    [
        layers.Embedding(input_dim=vocab_size,
                         output_dim=embedding_dim,
                         input_length=maxlen),
        LSTM(10),
        layers.Dense(1, activation='sigmoid'),
    ],
    name="lstm",
)
model3.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model3.summary()

In [None]:
# Train on the train sequences, validating on the dev split after every epoch.
# NOTE(review): xseq_train, xseq_dev, xseq_test and y_test are not defined in
# the cells visible here - confirm they exist before this cell runs.
model3.fit(
    xseq_train,
    y_train,
    validation_data=(xseq_dev, y_dev),
    epochs=20,
    batch_size=10,
    verbose=True,
)

# evaluate returns the loss followed by the compiled metrics (accuracy)
test_metrics = model3.evaluate(xseq_test, y_test, verbose=False)
loss, accuracy = test_metrics
print("Testing Accuracy:  {:.4f}".format(accuracy))

In [None]:
## TODO: extract the embeddings to inspect the similarity between tweets?