In [1]:
import tweepy
import configparser
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import pickle
import nltk
import string
import collections
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction import DictVectorizer

In [6]:
# read data 
train_data_file = open('project-data/train.data.txt', 'r')
train_lines = train_data_file.readlines()
train_events =[]
# Strips the newline character
for line in train_lines:
    train_events.append(list(map(int,line.strip('\n').split(','))))
    
train_label_file = open('project-data/train.label.txt', 'r')
train_labels = train_label_file.readlines()
train_labels = [label.strip('\n') for label in train_labels]


dev_data_file = open('project-data/dev.data.txt', 'r')
dev_lines = dev_data_file.readlines()
dev_events =[]
# Strips the newline character
for line in dev_lines:
    dev_events.append(list(map(int,line.strip('\n').split(','))))
    
dev_label_file = open('project-data/dev.label.txt', 'r')
dev_labels = dev_label_file.readlines()
dev_labels = [label.strip('\n') for label in dev_labels]

In [9]:
# config to access tweeter API
config = configparser.ConfigParser()
config.read('config.ini')

consumer_key = config['twitter']['consumer_key']
consumer_secret = config['twitter']['consumer_secret']

access_token = config['twitter']['access_token']
access_token_secret = config['twitter']['access_token_secret']

In [10]:
# authentication
client = tweepy.Client(consumer_key=consumer_key, consumer_secret=consumer_secret,
                                   access_token=access_token, access_token_secret=access_token_secret,wait_on_rate_limit=True)

In [11]:
# get_tweets only return 100 results, handle the case when there is more than 100
def lookup_tweets(tweet_IDs, client):
    full_tweets = []
    tweet_count = len(tweet_IDs)
    for i in range(int((tweet_count / 100) + 1)):
        # Catch the last group if it is less than 100 tweets
        end_loc = min((i + 1) * 100, tweet_count)
        if tweet_IDs[i * 100:end_loc]:
            tweets = client.get_tweets(tweet_IDs[i * 100:end_loc],user_auth=True).data
            if tweets:
                full_tweets.extend(tweets)
    return full_tweets
    

In [12]:
# get the text of all events
train_events_text=[]
for event in train_events:
    results = lookup_tweets(event, client)
    train_event_text=[tweet.text for tweet in results]
    train_events_text.append(train_event_text)

KeyboardInterrupt: 

In [None]:
# save data to pickle file
f = open(f'./tweet_text.pckl','wb')
pickle.dump(train_events_text,f)
f.close()

In [None]:
# get the text of all events
dev_events_text=[]
for event in dev_events:
    results = lookup_tweets(event, client)
    dev_event_text=[tweet.text for tweet in results]
    dev_events_text.append(dev_event_text)

In [171]:
# save data to pickle file
f = open(f'./dev_tweet_text.pckl','wb')
pickle.dump(dev_events_text,f)
f.close()

In [13]:
# open train text file
f = open(f'./tweet_text.pckl','rb')
train_data = pickle.load(f)
f.close()


# open dev text file
f = open(f'./dev_tweet_text.pckl','rb')
dev_data = pickle.load(f)
f.close()

In [17]:
def clean_text(text):
    text = re.sub(r'@[A-Za-z0-9]+','',text) #remove @mention
    text = re.sub(r'#','',text) # remove the hashtag symbol
    text = re.sub(r'https?:\/\/\S+', '',text) #remove hyperlink
    text = re.sub(r'\n','',text) # remove \n 
    text = re.sub(r'\W+', '', text) #remove special characters
    return text

for i in range(len(train_data)):
    for j in range(len(train_data[i])):
        train_data[i][j] = clean_text(train_data[i][j])
        
for i in range(len(dev_data)):
    for j in range(len(dev_data[i])):
        dev_data[i][j] = clean_text(dev_data[i][j])

In [19]:
# merge source tweeet and reply tweet together for train data
train_merge_events=[]
for event in train_data:
    merge = ''
    for tweet in event:
        merge = merge + tweet
    train_merge_events.append(merge)
    
    
# merge source tweeet and reply tweet together for dev data
dev_merge_events=[]
for event in dev_data:
    merge = ''
    for tweet in event:
        merge = merge + tweet
    dev_merge_events.append(merge)

In [26]:
def tokenize_tweet(tweet):
    """Get all of the tokens in a set of tweets"""
    twt = nltk.tokenize.TweetTokenizer()
    # combine stop words and punctuation
    stopwords = nltk.corpus.stopwords.words('english')
    stop = stopwords + list(string.punctuation)
    # filter out stop words and punctuation and send to lower case
    tokens = [token.lower() for token in twt.tokenize(tweet)
              if token.lower() not in stop]
    tokens = [word for word in tokens if re.search('[a-zA-Z]',word) is not None] # filter out word not contain alphabet
    return(tokens)

In [27]:
def tokenize_tweetv2(tweet):
    """Get all of the tokens in a set of tweets"""
    twt = nltk.tokenize.TweetTokenizer()
    # combine stop words and punctuation
    stopwords = nltk.corpus.stopwords.words('english')
    stop = stopwords + list(string.punctuation)
    # create the stemmer
    stemmer = nltk.stem.porter.PorterStemmer()
    # filter out stop words and punctuation and send to lower case
    tokens = [ stemmer.stem(token) for token in twt.tokenize(tweet)
              if token.lower() not in stop]
    return(tokens)

### Normal bag of word

In [28]:
# Create bag of word 
def bow(data,labels):
    x = []
    y = []
    for i in range(len(data)):
        tokens = tokenize_tweet(data[i])
        
        vocab = collections.defaultdict(int)
        for word in tokens:
            vocab[word] += 1 
        x.append(vocab)
        y.append(labels[i])
    return x,y
    

In [29]:
x_train,y_train = bow(train_merge_events,train_labels)
x_dev,y_dev = bow(dev_merge_events,dev_labels)

In [30]:
vectorizer = DictVectorizer()
x_train = vectorizer.fit_transform(x_train)
x_dev = vectorizer.transform(x_dev)

In [32]:
# k fold to find the optimize hyperparameter
alphas = [0.001,0.005,0.01,0.1,0.3,0.5,1]
max_nb = 0
for alpha in alphas:
    nb = MultinomialNB(alpha=alpha)
    nb_predict = nb.fit(x_train, y_train).predict(x_dev)    
    nb_accuracy = accuracy_score(y_dev,nb_predict)
    print('With alpha = {alpha} the accuracy of Naive Bayes is {acc:.5f}'.format(alpha=alpha, acc = nb_accuracy))
    if nb_accuracy > max_nb:
        max_nb = nb_accuracy
        max_alpha = alpha
print("The best setting for Naive Bayes is alpha = {alpha} with accuracy = {acc:.5f}".format(alpha=max_alpha,acc=max_nb))


With alpha = 0.001 the accuracy of Naive Bayes is 0.90032
With alpha = 0.005 the accuracy of Naive Bayes is 0.89715
With alpha = 0.01 the accuracy of Naive Bayes is 0.88924
With alpha = 0.1 the accuracy of Naive Bayes is 0.88608
With alpha = 0.3 the accuracy of Naive Bayes is 0.87816
With alpha = 0.5 the accuracy of Naive Bayes is 0.88133
With alpha = 1 the accuracy of Naive Bayes is 0.88608
The best setting for Naive Bayes is alpha = 0.001 with accuracy = 0.90032


In [34]:
solvers = ['newton-cg', 'lbfgs', 'liblinear','sag','saga']
c_values = [ 100,10,1.0, 0.1, 0.01,0.001]
max_lr = 0
for solver in solvers:
    print('Using this solver ',solver )
    for c_value in c_values:
        lr = LogisticRegression(C=c_value, penalty='l2', solver=solver,max_iter=1000)
        lr_predict = lr.fit(x_train, y_train).predict(x_dev)    
        lr_accuracy = accuracy_score(y_dev,lr_predict)
        print('With C = {c} and solver  = {sol} the acciracy of Logistic Regression is {acc}'.format(c=c_value,sol=solver,acc= lr_accuracy))
        if lr_accuracy > max_lr:
            max_lr = lr_accuracy
            max_c_value = c_value
            max_solver = solver
print("The best setting for Logistic Regression is c = {c} and solver = {sol} with accuracy = {acc:.5f}".format(c=max_c_value,sol=max_solver,acc=max_lr))

Using this solver  newton-cg
With C = 100 and solver  = newton-cg the acciracy of Logistic Regression is 0.9113924050632911
With C = 10 and solver  = newton-cg the acciracy of Logistic Regression is 0.9145569620253164
With C = 1.0 and solver  = newton-cg the acciracy of Logistic Regression is 0.9129746835443038
With C = 0.1 and solver  = newton-cg the acciracy of Logistic Regression is 0.8876582278481012
With C = 0.01 and solver  = newton-cg the acciracy of Logistic Regression is 0.8433544303797469
With C = 0.001 and solver  = newton-cg the acciracy of Logistic Regression is 0.7958860759493671
Using this solver  lbfgs
With C = 100 and solver  = lbfgs the acciracy of Logistic Regression is 0.9113924050632911
With C = 10 and solver  = lbfgs the acciracy of Logistic Regression is 0.9145569620253164
With C = 1.0 and solver  = lbfgs the acciracy of Logistic Regression is 0.9129746835443038
With C = 0.1 and solver  = lbfgs the acciracy of Logistic Regression is 0.8876582278481012
With C = 0.



With C = 100 and solver  = sag the acciracy of Logistic Regression is 0.9272151898734177




With C = 10 and solver  = sag the acciracy of Logistic Regression is 0.9272151898734177




With C = 1.0 and solver  = sag the acciracy of Logistic Regression is 0.9272151898734177




With C = 0.1 and solver  = sag the acciracy of Logistic Regression is 0.9224683544303798
With C = 0.01 and solver  = sag the acciracy of Logistic Regression is 0.9145569620253164
With C = 0.001 and solver  = sag the acciracy of Logistic Regression is 0.8781645569620253
Using this solver  saga




With C = 100 and solver  = saga the acciracy of Logistic Regression is 0.9272151898734177




With C = 10 and solver  = saga the acciracy of Logistic Regression is 0.9272151898734177




With C = 1.0 and solver  = saga the acciracy of Logistic Regression is 0.9272151898734177




With C = 0.1 and solver  = saga the acciracy of Logistic Regression is 0.9240506329113924
With C = 0.01 and solver  = saga the acciracy of Logistic Regression is 0.9145569620253164
With C = 0.001 and solver  = saga the acciracy of Logistic Regression is 0.8718354430379747
The best setting for Logistic Regression is c = 100 and solver = sag with accuracy = 0.92722


In [57]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

clfs = [KNeighborsClassifier(),DecisionTreeClassifier(),RandomForestClassifier(),
        MultinomialNB(),LinearSVC(),LogisticRegression()]


In [58]:
from sklearn import model_selection
from sklearn.metrics import accuracy_score, classification_report

def do_multiple_10foldcrossvalidation(clfs,data,classifications):
    for clf in clfs:
        predictions = model_selection.cross_val_predict(clf, data,classifications, cv=10)
        print (clf)
        print ("accuracy")
        print (accuracy_score(classifications,predictions))
        print (classification_report(classifications,predictions))
        
do_multiple_10foldcrossvalidation(clfs,x_train,y_train)


KNeighborsClassifier()
accuracy
0.7915567282321899
              precision    recall  f1-score   support

   nonrumour       0.79      1.00      0.88      1475
      rumour       0.86      0.07      0.13       420

    accuracy                           0.79      1895
   macro avg       0.82      0.53      0.51      1895
weighted avg       0.81      0.79      0.72      1895

DecisionTreeClassifier()
accuracy
0.833245382585752
              precision    recall  f1-score   support

   nonrumour       0.88      0.91      0.89      1475
      rumour       0.64      0.56      0.60       420

    accuracy                           0.83      1895
   macro avg       0.76      0.73      0.75      1895
weighted avg       0.83      0.83      0.83      1895

RandomForestClassifier()
accuracy
0.8401055408970977
              precision    recall  f1-score   support

   nonrumour       0.83      0.99      0.91      1475
      rumour       0.93      0.30      0.45       420

    accuracy              

### Using td-idf

In [55]:
# need to write manually for better tokenize
td = TfidfVectorizer(stop_words='english')
x_train = td.fit_transform(train_merge_events)
x_dev = td.transform(dev_merge_events)

#from sklearn.feature_extraction.text import CountVectorizer
#vectorizer = CountVectorizer(stop_words='english')
#x_train  = vectorizer.fit_transform(train_merge_events)
#x_dev = vectorizer.transform(dev_merge_events)

In [56]:
# k fold to find the optimize hyperparameter
alphas = [0.001,0.005,0.01,0.1,0.3,0.5,1]
max_nb = 0
for alpha in alphas:
    nb = MultinomialNB(alpha=alpha)
    nb_predict = nb.fit(x_train, y_train).predict(x_dev)    
    nb_accuracy = accuracy_score(y_dev,nb_predict)
    print('With alpha = {alpha} the accuracy of Naive Bayes is {acc:.5f}'.format(alpha=alpha, acc = nb_accuracy))
    if nb_accuracy > max_nb:
        max_nb = nb_accuracy
        max_alpha = alpha
print("The best setting for Naive Bayes is alpha = {alpha} with accuracy = {acc:.5f}".format(alpha=max_alpha,acc=max_nb))

With alpha = 0.001 the accuracy of Naive Bayes is 0.91456
With alpha = 0.005 the accuracy of Naive Bayes is 0.91772
With alpha = 0.01 the accuracy of Naive Bayes is 0.90981
With alpha = 0.1 the accuracy of Naive Bayes is 0.91772
With alpha = 0.3 the accuracy of Naive Bayes is 0.88608
With alpha = 0.5 the accuracy of Naive Bayes is 0.85285
With alpha = 1 the accuracy of Naive Bayes is 0.79905
The best setting for Naive Bayes is alpha = 0.005 with accuracy = 0.91772
