In [1]:
import tweepy
import configparser
import pandas as pd
import numpy as np
import re
import pickle
import nltk
import string
import collections
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction import DictVectorizer
import json
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression


In [2]:
#load label for train 
train_label_file = open('project-data/train.label.txt', 'r')
train_labels = train_label_file.readlines()
train_labels = [label.strip('\n') for label in train_labels]

#load label for dev 
dev_label_file = open('project-data/dev.label.txt', 'r')
dev_labels = dev_label_file.readlines()
dev_labels = [label.strip('\n') for label in dev_labels]


In [14]:
# open train text file
f = open(f'./tweet_text.pckl','rb')
train_data = pickle.load(f)
f.close()


# open dev text file
f = open(f'./dev_tweet_text.pckl','rb')
dev_data = pickle.load(f)
f.close()

# open test text file
f = open(f'./test_tweet_text.pckl','rb')
test_data = pickle.load(f)
f.close()

In [16]:
## cleaning the tweets
def clean_text(text):
    text = re.sub(r'@[A-Za-z0-9]+','',text) #remove @mention
    text = re.sub(r'#','',text) # remove the hashtag symbol
    text = re.sub(r'https?:\/\/\S+', '',text) #remove hyperlink
    text = re.sub(r'\n','',text) # remove \n 
    text = re.sub(r'\r','',text) # remove \r
    text = re.sub(r'[0-9]+','',text) #remove all the number
    text = re.sub(r'\W+', ' ', text) #remove special characters
    text = text.strip().lower()
    if len(text) != 0:
        return text
    else:
        return None
    
for i in range(len(train_data)):
    for j in range(len(train_data[i])):
        train_data[i][j] = clean_text(train_data[i][j])
    train_data[i] = [x for x in train_data[i] if x is not None]
        
for i in range(len(dev_data)):
    for j in range(len(dev_data[i])):
        dev_data[i][j] = clean_text(dev_data[i][j])
    dev_data[i] = [x for x in dev_data[i] if x is not None]
    
for i in range(len(test_data)):
    for j in range(len(test_data[i])):
        test_data[i][j] = clean_text(test_data[i][j])
    test_data[i] = [x for x in test_data[i] if x is not None]

In [7]:
# merge source tweeet and reply tweet together for train data
train_merge_events=[]
for event in train_data:
    merge = ''
    for tweet in event:
        merge = merge + tweet
    train_merge_events.append(merge)
    
    
# merge source tweeet and reply tweet together for dev data
dev_merge_events=[]
for event in dev_data:
    merge = ''
    for tweet in event:
        merge = merge + tweet
    dev_merge_events.append(merge)

# merge source tweeet and reply tweet together for test data    
test_merge_events=[]
for event in test_data:
    merge = ''
    for tweet in event:
        merge = merge + tweet
    test_merge_events.append(merge)

## CounVectorizer

### Training

In [33]:
# need to write manually for better tokenize
cv = CountVectorizer(stop_words='english')
x_train = cv.fit_transform(train_merge_events)
y_train = train_labels

x_dev = cv.transform(dev_merge_events)
y_dev = dev_labels

In [9]:
clfs = [KNeighborsClassifier(),DecisionTreeClassifier(),RandomForestClassifier(),
        MultinomialNB(),LinearSVC(),LogisticRegression()]


In [10]:
from sklearn import model_selection
from sklearn.metrics import accuracy_score, classification_report

def do_multiple_10foldcrossvalidation(clfs,data,classifications):
    for clf in clfs:
        predictions = model_selection.cross_val_predict(clf, data,classifications, cv=10)
        print (clf)
        print ("accuracy")
        print (accuracy_score(classifications,predictions))
        print (classification_report(classifications,predictions))
        
do_multiple_10foldcrossvalidation(clfs,x_train,y_train)


KNeighborsClassifier()
accuracy
0.8142480211081794
              precision    recall  f1-score   support

   nonrumour       0.81      0.99      0.89      1475
      rumour       0.82      0.21      0.33       420

    accuracy                           0.81      1895
   macro avg       0.82      0.60      0.61      1895
weighted avg       0.82      0.81      0.77      1895

DecisionTreeClassifier()
accuracy
0.8221635883905013
              precision    recall  f1-score   support

   nonrumour       0.87      0.91      0.89      1475
      rumour       0.62      0.51      0.56       420

    accuracy                           0.82      1895
   macro avg       0.74      0.71      0.72      1895
weighted avg       0.81      0.82      0.82      1895

RandomForestClassifier()
accuracy
0.8369393139841689
              precision    recall  f1-score   support

   nonrumour       0.83      0.99      0.90      1475
      rumour       0.88      0.31      0.46       420

    accuracy             



LinearSVC()
accuracy
0.8707124010554089
              precision    recall  f1-score   support

   nonrumour       0.89      0.95      0.92      1475
      rumour       0.76      0.61      0.68       420

    accuracy                           0.87      1895
   macro avg       0.83      0.78      0.80      1895
weighted avg       0.86      0.87      0.87      1895

LogisticRegression()
accuracy
0.891292875989446
              precision    recall  f1-score   support

   nonrumour       0.90      0.97      0.93      1475
      rumour       0.85      0.61      0.71       420

    accuracy                           0.89      1895
   macro avg       0.88      0.79      0.82      1895
weighted avg       0.89      0.89      0.88      1895



In [22]:
# k fold to find the optimize hyperparameter

### Evaluation

In [34]:
# Logistic Regression
lr = LogisticRegression()
lr_predict = lr.fit(x_train, y_train).predict(x_dev)    

df = pd.DataFrame({"Predicted": lr_predict}) 
df.to_csv('dev.cv.lr.txt', header=None, index=None)
!python eval.py --predictions dev.cv.lr.txt --groundtruth dev.label.txt

Performance on the rumour class:
Precision = 0.8584905660377359
Recall    = 0.6546762589928058
F1        = 0.7428571428571429


In [35]:
# MultinomialNB()
nb = MultinomialNB(alpha= 0.005)
nb_predict = nb.fit(x_train, y_train).predict(x_dev)    

df = pd.DataFrame({"Predicted": nb_predict}) 
df.to_csv('dev.cv.nb.txt', header=None, index=None)
!python eval.py --predictions dev.cv.nb.txt --groundtruth dev.label.txt

Performance on the rumour class:
Precision = 0.7588652482269503
Recall    = 0.7697841726618705
F1        = 0.7642857142857142


In [36]:
# LinearSVC
SVC = LinearSVC(max_iter=10000)
SVC_predict = SVC.fit(x_train, y_train).predict(x_dev)    

df = pd.DataFrame({"Predicted": SVC_predict}) 
df.to_csv('dev.cv.svc.txt', header=None, index=None)
!python eval.py --predictions dev.cv.svc.txt --groundtruth dev.label.txt

Performance on the rumour class:
Precision = 0.7931034482758621
Recall    = 0.6618705035971223
F1        = 0.7215686274509805


### Testing

In [37]:
x_test = cv.transform(test_merge_events)

In [38]:
nb = MultinomialNB(alpha= 0.005)
nb_predict = nb.fit(x_train, y_train).predict(x_test)    

predict=[]
for result in nb_predict:
    if result == "nonrumour":
        predict.append(0)
    else:
        predict.append(1)
df = pd.DataFrame({"Id": range(len(predict)),"Predicted": predict}) 
df.to_csv('cv_nb.csv',index=False)

# Precision =  0.8 on Kaggle

In [43]:
SVC = LinearSVC(max_iter=10000)
SVC_predict = SVC.fit(x_train, y_train).predict(x_test)    


predict=[]
for result in SVC_predict:
    if result == "nonrumour":
        predict.append(0)
    else:
        predict.append(1)
df = pd.DataFrame({"Id": range(len(predict)),"Predicted": predict}) 
df.to_csv('cv_scv.csv',index=False)

# o.75 on kaggle

In [44]:
lr = LogisticRegression()
lr_predict = lr.fit(x_train, y_train).predict(x_test)    

predict=[]
for result in lr_predict:
    if result == "nonrumour":
        predict.append(0)
    else:
        predict.append(1)
df = pd.DataFrame({"Id": range(len(predict)),"Predicted": predict}) 
df.to_csv('cv_lr.csv',index=False)

# Precision =  0.76744 on Kaggle

## Using td-idf

### Training

In [45]:
# need to write manually for better tokenize
td = TfidfVectorizer(stop_words='english')
x_train = td.fit_transform(train_merge_events)
y_train = train_labels

x_dev = td.transform(dev_merge_events)
y_dev = dev_labels

In [28]:
def do_multiple_10foldcrossvalidation(clfs,data,classifications):
    for clf in clfs:
        predictions = model_selection.cross_val_predict(clf, data,classifications, cv=10)
        print (clf)
        print ("accuracy")
        print (accuracy_score(classifications,predictions))
        print (classification_report(classifications,predictions))
        
do_multiple_10foldcrossvalidation(clfs,x_train,y_train)

KNeighborsClassifier()
accuracy
0.7868073878627968
              precision    recall  f1-score   support

   nonrumour       0.79      1.00      0.88      1475
      rumour       0.79      0.05      0.10       420

    accuracy                           0.79      1895
   macro avg       0.79      0.52      0.49      1895
weighted avg       0.79      0.79      0.71      1895

DecisionTreeClassifier()
accuracy
0.8269129287598944
              precision    recall  f1-score   support

   nonrumour       0.87      0.91      0.89      1475
      rumour       0.63      0.54      0.58       420

    accuracy                           0.83      1895
   macro avg       0.75      0.73      0.74      1895
weighted avg       0.82      0.83      0.82      1895

RandomForestClassifier()
accuracy
0.8253298153034301
              precision    recall  f1-score   support

   nonrumour       0.82      1.00      0.90      1475
      rumour       0.94      0.23      0.36       420

    accuracy             

In [26]:
# k fold to find the optimize hyperparameter

### Evaluate

In [46]:
# Logistic Regression
lr = LogisticRegression()
lr_predict = lr.fit(x_train, y_train).predict(x_dev)    

df = pd.DataFrame({"Predicted": lr_predict}) 
df.to_csv('dev.tf.lr.txt', header=None, index=None)
!python eval.py --predictions dev.tf.lr.txt --groundtruth dev.label.txt

Performance on the rumour class:
Precision = 0.975609756097561
Recall    = 0.28776978417266186
F1        = 0.4444444444444444


In [47]:
# MultinomialNB()
nb = MultinomialNB(alpha= 0.005)
nb_predict = nb.fit(x_train, y_train).predict(x_dev)    

df = pd.DataFrame({"Predicted": nb_predict}) 
df.to_csv('dev.tf.nb.txt', header=None, index=None)
!python eval.py --predictions dev.tf.nb.txt --groundtruth dev.label.txt

Performance on the rumour class:
Precision = 0.8148148148148148
Recall    = 0.7913669064748201
F1        = 0.8029197080291969


In [48]:
# LinearSVC
SVC = LinearSVC(max_iter=10000)
SVC_predict = SVC.fit(x_train, y_train).predict(x_dev)    

df = pd.DataFrame({"Predicted": SVC_predict}) 
df.to_csv('dev.tf.svc.txt', header=None, index=None)
!python eval.py --predictions dev.tf.svc.txt --groundtruth dev.label.txt

Performance on the rumour class:
Precision = 0.8990825688073395
Recall    = 0.7050359712230215
F1        = 0.7903225806451614


### Testing

In [49]:
x_test = td.transform(test_merge_events)

In [50]:
nb = MultinomialNB(alpha= 0.005)
nb_predict = nb.fit(x_train, y_train).predict(x_test)    

predict=[]
for result in nb_predict:
    if result == "nonrumour":
        predict.append(0)
    else:
        predict.append(1)
df = pd.DataFrame({"Id": range(len(predict)),"Predicted": predict}) 
df.to_csv('tf_nb.csv',index=False)

# Precision =  0.82 on Kaggle

In [51]:
SVC = LinearSVC(max_iter=10000)
SVC_predict = SVC.fit(x_train, y_train).predict(x_test)    


predict=[]
for result in SVC_predict:
    if result == "nonrumour":
        predict.append(0)
    else:
        predict.append(1)
df = pd.DataFrame({"Id": range(len(predict)),"Predicted": predict}) 
df.to_csv('tf_scv.csv',index=False)

# Precision =  0.85714 on Kaggle

In [52]:
lr = LogisticRegression()
lr_predict = lr.fit(x_train, y_train).predict(x_test)    

predict=[]
for result in lr_predict:
    if result == "nonrumour":
        predict.append(0)
    else:
        predict.append(1)
df = pd.DataFrame({"Id": range(len(predict)),"Predicted": predict}) 
df.to_csv('tf_lr.csv',index=False)

# Precision =  0.57142 on Kaggle