In [2]:
import json
import pandas as pd
from IPython.display import display
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import seaborn as sns

In [3]:
train_data = json.load(open('total_train_data.json'))
train_labels = json.load(open('total_train_data_labels.json'))

dev_data = json.load(open('total_dev_data.json'))
dev_labels = json.load(open('total_dev_data_labels.json'))

In [4]:
# pip install emoji

In [5]:
import re
import emoji

def process_data(data, data_labels):
    data_dict = {}
    
    tweet_ids = []
    tweet_texts = []
    tweet_retweets = []
    
    for i in range(len(data)):
        tweet_text = clean_tweet(data.get(str(i)).get('source tweet text'))
        
        retweets = data.get(str(i)).get('retweets')
        if retweets:
            retweets = list(map(lambda x:clean_tweet(x), retweets))
        
        tweet_ids.append(data.get(str(i)).get('source tweet id'))
        tweet_texts.append(tweet_text)
        tweet_retweets.append(retweets)
        
    data_dict['tweet_ids'] = tweet_ids
    data_dict['tweet_texts'] = tweet_texts
    data_dict['tweet_retweets'] = tweet_retweets
    
    if not data_labels:
        data_dict['tweet_labels'] = 2
    else:
        data_dict['tweet_labels'] = data_labels
    
    df = pd.DataFrame(data = data_dict)
    return df

def clean_tweet(text):
    # Remove hashtag while keeping hashtag text
#     text = re.sub(r'#', '', text)
    # Remove HTML special entities (e.g. &amp;)
    text = re.sub(r'\&\w*;', '', text)
    # Remove tickers
    text = re.sub(r'\$\w*', '', text)
    # Remove hyperlinks
    text = re.sub(r'https?:\/\/.*\/\w*', '', text)
    # Remove URL, RT, mention(@)
    text = re.sub(r'http(\S)+', '', text)
    text = re.sub(r'http ...', '', text)
    text = re.sub(r'(RT|rt)[ ]*@[ ]*[\S]+', '', text)
    text = re.sub(r'RT[ ]?@', '', text)
    text = re.sub(r'@[\S]+', '', text)

    text = re.sub(r'&amp;?', 'and', text)
    text = re.sub(r'&lt;', '<', text)
    text = re.sub(r'&gt;', '>', text)

    # Remove emoji
    text = emoji.demojize(text)

    # Remove redundent whitespace (including new line characters)
    text = re.sub(r'\s\s+', '', text)
    text = re.sub(r'[ ]{2, }', '', text)

    return text

In [6]:
df_train = process_data(train_data,train_labels)
df_dev = process_data(dev_data,dev_labels)

In [7]:
df_train.head()

Unnamed: 0,tweet_ids,tweet_texts,tweet_retweets,tweet_labels
0,1250219300389974016,5. Can regularly rinsing your nose with saline...,[4. Can eating garlic help prevent infection w...,0
1,554886875303780352,French police chief killed himself after #Char...,"[ How very sad., The trauma he must have faced...",1
2,1237901309011021825,Coronavirus disease (COVID-19) advice for the ...,[Infection control for suspected or confirmed ...,0
3,524958128392376320,Ottawa police confirm that there were multiple...,"[ Killers go berserk when cornered.Henceforth,...",0
4,1239295488677085185,if the primary focus of a government isn't to ...,,0


In [8]:
def combine_text(data):
    train_lst = []
    for i in range(int(data.shape[0])):

        tweets = data['tweet_texts'][i]
        retweets = data['tweet_retweets'][i]
        if type(retweets) == list:

            for tweet in retweets:
                tweets += tweet

        train_lst.append(tweets)
    return train_lst

In [9]:
train_lst = combine_text(df_train)

In [10]:
def transform_label(data):
    label_lst = []
    labels = data['tweet_labels']
    for label in labels:
        if label == 1:
            label_lst.append('rumour')
        else:
            label_lst.append('nonrumour')
    return label_lst


In [54]:
# label_lst = transform_label(df_train)
# dev_labels = transform_label(df_dev)

In [55]:
dev_labels  = df_dev['tweet_labels']
dev_labels

0      0
1      0
2      0
3      0
4      0
      ..
530    0
531    1
532    0
533    1
534    0
Name: tweet_labels, Length: 535, dtype: int64

# SMOTE

In [12]:
#pip install imbalanced-learn

In [13]:
#pip install SMOTE

In [73]:
from imblearn.over_sampling import SMOTE

In [15]:
tfidf_vectorizer = TfidfVectorizer(min_df = 2,max_df = 0.5,ngram_range = (1,2))
tfidf = tfidf_vectorizer.fit_transform(train_lst)


In [16]:
# tfidf
smote = SMOTE(random_state = 402)
X_smote_tfidf, Y_smote_tfidf = smote.fit_resample(tfidf,label_lst)


# SVM

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split,cross_validate

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn import datasets, svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier


In [88]:
##########using SMOTE 
count_vectorizer = CountVectorizer(stop_words="english", decode_error="ignore")
count_vector = count_vectorizer.fit_transform(train_lst)
smote = SMOTE(random_state = 402)
X_smote, Y_smote = smote.fit_resample(count_vector,label_lst)


In [93]:
SVM_clf=Pipeline([
                ("tfidf",TfidfTransformer(use_idf=True)),
                ("clf", svm.SVC(kernel='linear', gamma=0.7, C=1.0))])



SVMparameters={
    "clf__gamma":[0.6,0.7,0.5,0.4,0.2],
    "clf__C":[0.5,0.7,1.0,2.0,3.0]
}
SVM_clf_text = GridSearchCV(SVM_clf,SVMparameters, cv = 5)
SVM_clf_text = SVM_clf_text.fit(X_smote, Y_smote)

In [89]:
SVM_clf_text.best_score_

0.9899598393574296

In [95]:

dev_count_vector = count_vectorizer.transform(dev_lst)

In [96]:
predict_results = svm_clf.predict(dev_count_vector)
from sklearn.metrics import f1_score,accuracy_score
dev_f1 = f1_score(predict_results,dev_labels,average='micro')
dev_f1

0.8523364485981308

In [97]:
test_count_vector = count_vectorizer.transform(test_lst)
test_predict_results = svm_clf.predict(test_count_vector)

In [None]:
def store_result(predict_results):
    final_result = []
    for result in predict_results:
        if result == 'rumour':
            final_result.append(1)
        else:
            final_result.append(0)

    index = [i for i in range(len(final_result))]

    d = {}
    d['Id'] = index
    d['Predicted']=final_result

    result_df = pd.DataFrame(d)
    result_df

    result_df.to_csv(r'/Users/lingyiqing/学习资料/IT/2022 s1/NLP/group assignment/project-data/test_result.csv',index=False)

In [98]:
store_result(test_predict_results)

In [None]:
#SVM with countervect
# SVM_clf=Pipeline([("vect", CountVectorizer(stop_words="english", decode_error="ignore")), 
#                 ("tfidf",TfidfTransformer(use_idf=True)),
#                 ("clf", svm.SVC(kernel='linear', gamma=0.7, C=1.0))])



# SVMparameters={
#     "clf__gamma":[0.6,0.7,0.5,0.4,0.2],
#     "clf__C":[0.5,0.7,1.0,2.0,3.0]
# }
# SVM_clf_text = GridSearchCV(SVM_clf,SVMparameters, cv = 5)
# SVM_clf_text = SVM_clf_text.fit(X_smote_tfidf, Y_smote_tfidf)

In [76]:
#SVM with countervect
# SVM_clf=Pipeline([("vect", CountVectorizer(stop_words="english", decode_error="ignore")), 
#                 ("tfidf",TfidfTransformer(use_idf=True)),
#                 ("clf", svm.SVC(kernel='linear', gamma=0.7, C=1.0))])


SVM_clf=Pipeline([("clf", svm.SVC(kernel='linear', gamma=0.7, C=1.0))])

SVMparameters={
    "clf__gamma":[0.6,0.7,0.5,0.4,0.2],
    "clf__C":[0.5,0.7,1.0,2.0,3.0]
}
SVM_clf_text = GridSearchCV(SVM_clf,SVMparameters, cv = 5)
SVM_clf_text = SVM_clf_text.fit(X_smote_tfidf, Y_smote_tfidf)

In [31]:
SVM_clf_text.best_score_

0.9899598393574296

In [87]:
dev_lst = combine_text(df_dev)

In [84]:
dev_tfidf = tfidf_vectorizer.transform(dev_lst)
predict_dev_tfidf = SVM_clf_text.predict(dev_tfidf)

predict_result_tfidf = []
for i in predict_dev_tfidf:
    if i =='rumour':
        predict_result_tfidf.append(1)
    else:
        predict_result_tfidf.append(0)
        
dev_f1_tfidf = f1_score(predict_result_tfidf,dev_labels,average='weighted')
dev_f1_tfidf

0.9512467977264509

In [85]:
from sklearn.metrics import precision_recall_fscore_support
p, r, f, _ = precision_recall_fscore_support(dev_labels, predict_result_tfidf, pos_label=1, average="binary")

In [86]:
f

0.8720379146919433

# using tfidf+SMOTE+SVM on test set data


In [90]:
test_tfidf = tfidf_vectorizer.transform(test_lst)
predict_test_tfidf = SVM_clf_text.predict(test_tfidf)

In [87]:
store_result(predict_test_tfidf)

In [68]:
# count_vect = CountVectorizer()
# tfidf_transformer = TfidfTransformer()
# X_test_counts = count_vect.transform(dev_lst)
# X_test_tfidf = tfidf_transformer.transform(X_test_counts)

# baseline

In [48]:
dev_labels

0      0
1      0
2      0
3      0
4      0
      ..
530    0
531    1
532    0
533    1
534    0
Name: tweet_labels, Length: 535, dtype: int64

In [70]:
# baseline 
from sklearn.metrics import f1_score,accuracy_score
predict_results = [1] * len(dev_lst)
dev_f1 = f1_score(predict_results,dev_labels,average='micro')
dev_f1

0.21495327102803738

In [25]:

# load dataset
test_data = json.load(open('total_test_data.json'))
df_test = process_data(test_data,None)


In [102]:
test_data['213']

{'source tweet id': '668849913678209024',
 'source tweet text': 'That Superman poster is legit:\n\nhttps://t.co/fgnhH9kcJY https://t.co/dmikEp0igi',
 'retweets': ['@snopes *ahem* So is this one. https://t.co/8cPWDOQgGs',
  '@darksaber2k @snopes drops mic.... Walks out.',
  '@snopes @CalBear949 https://t.co/Y87YHn3MPS has the old superman radio series. At least 1 is anti white supremacist: play nice with others',
  '@snopes @mattkummer Well, Superman IS an illegal alien.',
  '@snopes @comex mnyhhyyb']}

In [74]:
df_test[10:20]

Unnamed: 0,tweet_ids,tweet_texts,tweet_retweets,tweet_labels
10,555072815154475008,69 people die after drinking beer believed to ...,"[Crossing Mozambique off my travel list, check...",2
11,629503919098429440,Chick-Fil-A to open on Sundays,"[ NOT COOL MAN, ]",2
12,1229732608889802753,Q: What can I do to protect myself from #COVID...,"[The question which cannot be answered is w… ,...",2
13,489836441120145408,First pics from the site in Ukraine where #MH1...,"[ photo courtesy?, Some may disagree with Maha...",2
14,1240570885662289920,Should I wear a mask to protect myself from th...,[Can masks protect against the #COVID19infecti...,2
15,1234884616479051777,Can the virus that causes COVID-19 be transmit...,"[How does COVID-19 spread? , Can CoVID-19 be c...",2
16,1244004581010550785,"Unter Überschrift ""Can CoVID-19 be caught from...","[Ja, die Menschen halten sich eben ganz oft ni...",2
17,1248902780556693506,Wondering what is COVID-19 Hotspot? Watch this...,"[physical lock down will be released, as much ...",2
18,1249529725019738113,What is COVID-19 status on 12.04.2020 ????,[Happy to share that one Covid positive case o...,2
19,1248769432748466177,Are antibiotics effective in preventing and tr...,[Can spraying alcohol or chlorine all over you...,2


In [26]:
test_lst = combine_text(df_test)
test_labels = transform_label(df_test)

In [59]:
predict_results = SVM_clf_text.predict(test_lst)
predict_results

array(['nonrumour', 'nonrumour', 'nonrumour', 'nonrumour', 'nonrumour',
       'nonrumour', 'rumour', 'nonrumour', 'nonrumour', 'nonrumour',
       'rumour', 'rumour', 'nonrumour', 'rumour', 'nonrumour',
       'nonrumour', 'nonrumour', 'nonrumour', 'nonrumour', 'nonrumour',
       'nonrumour', 'nonrumour', 'nonrumour', 'rumour', 'nonrumour',
       'nonrumour', 'nonrumour', 'nonrumour', 'nonrumour', 'rumour',
       'nonrumour', 'rumour', 'rumour', 'nonrumour', 'nonrumour',
       'nonrumour', 'nonrumour', 'nonrumour', 'nonrumour', 'rumour',
       'nonrumour', 'rumour', 'nonrumour', 'nonrumour', 'nonrumour',
       'nonrumour', 'nonrumour', 'nonrumour', 'nonrumour', 'nonrumour',
       'nonrumour', 'nonrumour', 'rumour', 'nonrumour', 'nonrumour',
       'nonrumour', 'rumour', 'nonrumour', 'nonrumour', 'nonrumour',
       'nonrumour', 'rumour', 'nonrumour', 'nonrumour', 'nonrumour',
       'nonrumour', 'rumour', 'nonrumour', 'nonrumour', 'nonrumour',
       'nonrumour', 'nonrumour', '

In [58]:
from random import sample
import collections
l = len(dev_lst)
index_lst = [i for i in range(l)]
n = int(l*0.2)
rumour_index_lst = sample(index_lst,  n)
print(n)
predict_results = []
for i in range(l):
    if i in rumour_index_lst:
        predict_results.append(1)
    else:
        predict_results.append(0)
        


107


In [64]:
predict_results = [1]*len(dev_lst)

In [71]:
from sklearn.metrics import precision_recall_fscore_support
p, r, f, _ = precision_recall_fscore_support(dev_labels, predict_results, pos_label=1, average="binary")

In [72]:
f

0.35384615384615387

In [62]:
################################
# naive bayes
NB_clf=Pipeline([("vect", CountVectorizer(stop_words="english", decode_error="ignore")), 
                ("tfidf",TfidfTransformer()),
                ("clf", MultinomialNB())])
NBparameters={
     'tfidf__use_idf':(True,False),
    'clf__alpha':[0,0.001,0.01,0.1,0.2],
    'clf__fit_prior':[True,False]
}
NB_clf = GridSearchCV(NB_clf,NBparameters, cv = 5, n_jobs=-1)
NB_clf=NB_clf.fit(train_lst, label_lst)
NB_clf.best_score_

0.9097976570820021

In [63]:
predict_results = NB_clf.predict(dev_lst)
dev_f1 = f1_score(predict_results,dev_labels,average='micro')
dev_f1

0.9177570093457944

In [64]:
predict_results = NB_clf.predict(test_lst)
predict_results

array(['nonrumour', 'nonrumour', 'nonrumour', 'nonrumour', 'nonrumour',
       'nonrumour', 'rumour', 'rumour', 'nonrumour', 'nonrumour',
       'rumour', 'rumour', 'nonrumour', 'rumour', 'nonrumour',
       'nonrumour', 'nonrumour', 'nonrumour', 'nonrumour', 'nonrumour',
       'nonrumour', 'nonrumour', 'nonrumour', 'rumour', 'nonrumour',
       'nonrumour', 'nonrumour', 'rumour', 'nonrumour', 'rumour',
       'nonrumour', 'rumour', 'nonrumour', 'nonrumour', 'nonrumour',
       'nonrumour', 'nonrumour', 'nonrumour', 'nonrumour', 'nonrumour',
       'nonrumour', 'rumour', 'nonrumour', 'nonrumour', 'nonrumour',
       'nonrumour', 'nonrumour', 'nonrumour', 'nonrumour', 'nonrumour',
       'nonrumour', 'nonrumour', 'rumour', 'nonrumour', 'nonrumour',
       'rumour', 'rumour', 'nonrumour', 'nonrumour', 'nonrumour',
       'nonrumour', 'nonrumour', 'nonrumour', 'nonrumour', 'nonrumour',
       'nonrumour', 'rumour', 'nonrumour', 'nonrumour', 'nonrumour',
       'nonrumour', 'nonrumour', '

In [24]:
#################################
df1 = pd.read_csv('/Users/lingyiqing/Downloads/project data/1.csv')
df2 = pd.read_csv('/Users/lingyiqing/Downloads/project data/2.csv')
df3 = pd.read_csv('/Users/lingyiqing/Downloads/project data/3.csv')
df4 = pd.read_csv('/Users/lingyiqing/Downloads/project data/4.csv')
df5 = pd.read_csv('/Users/lingyiqing/Downloads/project data/5.csv')
df6 = pd.read_csv('/Users/lingyiqing/Downloads/project data/6.csv')

In [41]:
dataset = [df1, df2, df3, df4, df5, df6]
dataset[0]

Unnamed: 0,Id,Predicted
0,0,0
1,1,1
2,2,0
3,3,0
4,4,0
...,...,...
553,553,0
554,554,0
555,555,1
556,556,0


In [3]:
from collections import defaultdict
d = defaultdict(list)



In [4]:
def compare_dataset(dataset1, dataset2, dic):
    predict1 = dataset1['Predicted']
    predict2 = dataset2['Predicted']
    
    for i in range(len(predict1)):
        if predict1[i] != predict2[i]:
            dic[i].append(predict2[i])
    return dic 
