In [None]:
# Mount Google Drive at /content/drive (Colab-only); the CSVs, Doc2Vec models and
# co-training script used by later cells are read from paths under this mount.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import re
import random
import pandas as pd
from sklearn.model_selection import train_test_split

import nltk 
# Download the Punkt tokenizer data.
nltk.download('punkt')
# NOTE(review): word_tokenize is imported but not called in the visible cells.
from nltk.tokenize import word_tokenize
# Download the stop-word corpus and keep the English list; stop_stem filters tokens against it.
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english')
# English Snowball stemmer instance shared by stop_stem.
from nltk.stem.snowball import SnowballStemmer
englishStemmer = SnowballStemmer("english")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the two transcript datasets from Drive. Both expose "transcripts" and "labels"
# columns (see the previews below); they are concatenated into one corpus further down.
hyptun = pd.read_csv("/content/drive/MyDrive/THESIS_May15/hyptun.csv")
cotrn = pd.read_csv("/content/drive/MyDrive/THESIS_May15/cotrn.csv")

In [None]:
# Preview the first rows of the hyptun frame.
hyptun.head()

Unnamed: 0.1,Unnamed: 0,transcripts,labels
0,272,and to be honest with you I felt like that el...,1
1,510,Flat Earth clues part 3 the mapmakers this is ...,1
2,162,all right so 911 fear-mongering if they can ma...,1
3,446,according to a report in WorldNetDaily some of...,1
4,176,You So everybody When I never met in person I ...,1


In [None]:
# Preview the first rows of the cotrn frame.
cotrn.head()

Unnamed: 0.1,Unnamed: 0,transcripts,labels
0,1,get my sex hey guys we are going up to the ol...,1
1,2,in March of 2016 Conor McGregor was set to mak...,0
2,3,[ ♪♪ ] [ ♪♪ ] >> Bob: It's like something\nfro...,1
3,5,hi I'm coach chicken lat a business-focused ma...,0
4,6,okay so I just started the rinsing process and...,0


In [None]:
# Stack both frames row-wise into a single corpus; ignore_index=True renumbers the rows 0..n-1.
all_youtube = pd.concat([hyptun, cotrn], axis = 0, ignore_index = True)
# Sanity check: (rows, columns) of the combined frame.
all_youtube.shape

(578, 3)

In [None]:
# Drop the leftover "Unnamed: 0" column carried over from the CSV files.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this
# cell idempotent: re-running it no longer raises KeyError once the column is gone.
all_youtube = all_youtube.drop(columns = "Unnamed: 0", errors = "ignore")

In [None]:
def clean_text(given_text):
  """Normalise a raw transcript into lowercase, space-separated word tokens.

  Steps, in order: strip http/https URLs, strip www-style URLs, strip
  ampersand-prefixed fragments (e.g. HTML entities), replace punctuation and
  digits with spaces, collapse whitespace runs, lowercase and trim.

  Args:
    given_text: the raw transcript as a str.

  Returns:
    The cleaned, lowercased string (may be empty).
  """
  message = re.sub(r'(http[s]?\S+)', ' ', given_text) # replace http(s) URLs with a space; they may be too frequent in one class
  # The previous pattern (w{2}\.) started matching at the second "w" of "www.",
  # leaving a stray "w" token behind for every www-style URL.
  message = re.sub(r'\bwww\.\S+', ' ', message) # replace www-style URLs with a space
  message = re.sub(r'&[a-zA-Z]*', ' ', message) # remove an ampersand and any letters directly following it
  message = re.sub(r'[^\w\d\s]', ' ', message) # punctuation removed (this removes * too)
  message = re.sub(r'\d+(\.\d+)?', ' ', message) # replace numbers with a space
  message = re.sub(r'\s+', ' ', message) # collapse runs of whitespace into a single space
  message = re.sub(r'^\s+|\s+?$', '', message.lower()) # lowercase, then trim leading/trailing whitespace

  return message

In [None]:
# Clean every transcript with clean_text; the column is overwritten with the cleaned text.
all_youtube["transcripts"] = all_youtube["transcripts"].apply(clean_text)

In [None]:
# Preview the cleaned transcripts.
all_youtube.head()

Unnamed: 0,transcripts,labels
0,and to be honest with you i felt like that ele...,1
1,flat earth clues part the mapmakers this is pa...,1
2,all right so fear mongering if they can make e...,1
3,according to a report in worldnetdaily some of...,1
4,you so everybody when i never met in person i ...,1


In [None]:
def stop_stem(given_text):
  """Remove English stop words from a cleaned transcript and stem the remaining tokens.

  Args:
    given_text: whitespace-separated text (expected to be lowercased already,
      since the stop-word comparison is an exact match).

  Returns:
    The stemmed, stop-word-free tokens joined by single spaces.
  """
  # Build the lookup set once per call; the original rebuilt it for every token.
  stop_set = set(stop_words)

  return ' '.join(englishStemmer.stem(term) for term in given_text.split() if term not in stop_set)

In [None]:
# Remove stop words and stem every transcript; the column is overwritten again.
all_youtube["transcripts"] = all_youtube["transcripts"].apply(stop_stem) 

We first split the dataset into train and test sets. The labels of 4/9 of the training set are then masked, so that portion serves as the unlabelled pool; the remainder forms the labelled training set. 
In both the benchmark and co-training setups, the labelled set is stratified into 5 folds according to its label distribution. 

In [None]:
# Hold out 20% of the corpus as the final test set.
train_val_trns, test_trns, train_val_lbl, test_lbl = train_test_split(all_youtube["transcripts"], all_youtube["labels"], test_size = 0.2, random_state = 94)
# Mask the labels of 4/9 of the *training* portion to form the unlabelled pool.
# This must split train_val_* rather than all_youtube: re-splitting the full corpus
# put test documents into both the labelled and unlabelled sets (test leakage).
# Set sizes change with this fix, so re-run the cells below to refresh their outputs.
labelled_val_trns, unlabelled_trns, labelled_val_lbl, _ = train_test_split(train_val_trns, train_val_lbl, test_size = 4/9, random_state = 94)

In [None]:
# Class balance of the labelled set.
labelled_val_lbl.value_counts()

0    183
1    138
Name: labels, dtype: int64

The labelled set has a negative-to-positive ratio of about 1.33 (183:138), so we can keep the 3:2 ratio of negative to positive predictions.  

In [None]:
def _peel_stratified_fold(trns, lbl, fraction):
  """Split off `fraction` of (trns, lbl) as one label-stratified fold.

  Returns train_test_split's result: [rest_trns, fold_trns, rest_lbl, fold_lbl].
  """
  return train_test_split(trns, lbl, test_size = fraction, stratify = lbl, random_state = 94)

# Build 5 roughly equal stratified folds by repeatedly peeling one fold off the
# remainder: 1/5 of the full set, then 1/4, ~1/3 and 1/2 of what is left.
rem_trns, val_fold1_trns, rem_lbl, val_fold1_lbl = _peel_stratified_fold(labelled_val_trns, labelled_val_lbl, 0.2)
rem_trns_2, val_fold2_trns, rem_lbl_2, val_fold2_lbl = _peel_stratified_fold(rem_trns, rem_lbl, 0.25)
rem_trns_3, val_fold3_trns, rem_lbl_3, val_fold3_lbl = _peel_stratified_fold(rem_trns_2, rem_lbl_2, 0.33)
# The last split halves the remainder: the "train" half is fold 5, the "test" half fold 4.
val_fold5_trns, val_fold4_trns, val_fold5_lbl, val_fold4_lbl = _peel_stratified_fold(rem_trns_3, rem_lbl_3, 0.5)

In [None]:
# Report the size of each validation fold, in fold order.
for fold_trns in (val_fold1_trns, val_fold2_trns, val_fold3_trns, val_fold4_trns, val_fold5_trns):
  print(fold_trns.shape)

(65,)
(64,)
(64,)
(64,)
(64,)


In [None]:
# Collect the folds in parallel lists (index i of each list belongs to fold i+1);
# these are the inputs passed to ctsb.cotraining_val in the grid search.
val_folds_trns_in_list = [val_fold1_trns, val_fold2_trns, val_fold3_trns, val_fold4_trns, val_fold5_trns]
val_folds_lbl_in_list = [val_fold1_lbl, val_fold2_lbl, val_fold3_lbl, val_fold4_lbl, val_fold5_lbl]

From here on, we run the co-training script to check that it works as expected. 

In [None]:
import os
# Switch the working directory to the project folder so the local co-training module
# can be imported and the Doc2Vec model files can be loaded by relative path.
os.chdir("/content/drive/MyDrive/THESIS_May15")

In [None]:
import co_training_script_smaller_2_outcomes as ctsb
from gensim.models.doc2vec import Doc2Vec

Running with the actual hyperparams - Hyperparameter tuning

In [None]:
# Load the saved Doc2Vec models from the working directory.
# NOTE(review): the file names suggest embedding sizes 8/24/64/256 — confirm.
model_8 = Doc2Vec.load("d2v_model_embdsize_8.model")
model_24 = Doc2Vec.load("d2v_model_embdsize_24.model")
model_64 = Doc2Vec.load("d2v_model_embdsize_64.model")
model_256 = Doc2Vec.load("d2v_model_embdsize_256.model")

# Candidate values for the grid search below; list positions are what the
# best_*_index results refer to.
models = [model_8, model_24, model_64, model_256]
feats_tfidf = [700, 1000, 2000, 4000]
# p_cands and n_cands are zipped pairwise in the grid search, i.e. (2, 3) and (4, 6).
p_cands = [2, 4]
n_cands = [3, 6]
# NOTE(review): presumably classifier regularisation strengths (C) — confirm in the co-training script.
CC_cands = [0.1, 1, 10]

In [None]:
# Exhaustive search over (Doc2Vec model, tf-idf feature count, (p, n) pair, C value).
# For each combination ctsb.cotraining_val returns one score per classifier; the best
# score and the grid position that produced it are tracked separately for each.
best_tfidf, best_doc2vec = -1, -1
best_tfidf_index = [-1] * 4
best_doc2vec_index = [-1] * 4

# p and n candidates are paired positionally, not crossed.
pn_pairs = list(zip(p_cands, n_cands))

for model_idx, d2v_model in enumerate(models):
  for feat_idx, n_feats in enumerate(feats_tfidf):
    for pn_idx, (p_val, n_val) in enumerate(pn_pairs):
      for c_idx, cc_val in enumerate(CC_cands):
        grid_pos = (model_idx, feat_idx, pn_idx, c_idx)

        # NOTE(review): the meaning of the trailing 0.43 is defined in the co-training script — confirm.
        tfidf_f1, doc2vec_f1 = ctsb.cotraining_val(d2v_model, val_folds_trns_in_list, val_folds_lbl_in_list, unlabelled_trns, n_feats, p_val, n_val, cc_val, 0.43)

        # Strict ">" keeps the earliest combination on ties.
        if tfidf_f1 > best_tfidf:
          best_tfidf, best_tfidf_index = tfidf_f1, list(grid_pos)

        if doc2vec_f1 > best_doc2vec:
          best_doc2vec, best_doc2vec_index = doc2vec_f1, list(grid_pos)

In [None]:
# Best validation score found for the tf-idf classifier.
print(best_tfidf)

In [None]:
# Grid position [model, feats_tfidf, (p, n) pair, CC] that produced best_tfidf.
print(best_tfidf_index)

In [None]:
# Best validation score found for the doc2vec classifier.
print(best_doc2vec)

In [None]:
# Grid position [model, feats_tfidf, (p, n) pair, CC] that produced best_doc2vec.
print(best_doc2vec_index)

Testing the performance with best hyperparameters for tfidf and doc2vec classifiers

In [None]:
# Final test-set evaluation for the tf-idf classifier.
# NOTE(review): the hyperparameters (model_256, 700, 4, 6, 1) are hard-coded rather than
# read from best_tfidf_index — confirm they match the grid-search result. The trailing 1
# presumably selects the tf-idf classifier (the doc2vec cell passes 0) — confirm in the script.

ctsb.cotraining_test(model_256, labelled_val_trns, labelled_val_lbl, test_trns, test_lbl, unlabelled_trns, 700, 4, 6, 1, 1)

0.8048780487804877

In [None]:
# Final test-set evaluation for the doc2vec classifier.
# NOTE(review): the hyperparameters (model_256, 2000, 4, 6, 1) are hard-coded rather than
# read from best_doc2vec_index — confirm they match the grid-search result. The trailing 0
# presumably selects the doc2vec classifier (the tf-idf cell passes 1) — confirm in the script.

ctsb.cotraining_test(model_256, labelled_val_trns, labelled_val_lbl, test_trns, test_lbl, unlabelled_trns, 2000, 4, 6, 1, 0)

0.5617977528089888