In [None]:
# Mount Google Drive inside the Colab VM; every data path below lives under
# /content/drive. force_remount=True re-mounts even if a mount already exists.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import nltk
import re
import copy
# Fetch the NLTK resources used by this notebook ('punkt' backs word_tokenize
# in the GloVe section). Each call is a no-op when the package is up to date.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Loading data files

In [None]:
import os
import json
# Load every per-source JSON file into src_data, keyed by file name
# (e.g. "some-source.json" -> list of article dicts).
src_data = dict()
path = '/content/drive/My Drive/nela-gt-2019-json/nela-eng-2019'
for root, dirs, files in os.walk(path):
  # Sorting dirs in place makes os.walk descend in a deterministic order, so
  # the iteration order of src_data is reproducible across runs.
  dirs.sort()
  for f in sorted(files):
      # Later cells strip the 5-character ".json" suffix to recover the source
      # name, so anything else (e.g. editor backups) must not be loaded.
      if not f.endswith('.json'):
          continue
      print("+ Reading", f)
      # JSON is UTF-8; do not depend on the platform default encoding.
      with open(os.path.join(root, f), encoding='utf-8') as fin:
          src_data[f] = json.load(fin)


In [None]:
import copy
# Independent deep copy of the raw data; the Count Vectorizer section iterates
# over new_dict instead of src_data.
# NOTE(review): neither dict is mutated anywhere in this notebook, so this copy
# doubles the memory footprint of the dataset - confirm it is really needed.
new_dict= copy.deepcopy(src_data)

## Taking specific fields of the dataset and discarding the rest

In [None]:
# Keep only the fields of interest for every article, grouped by file name.
imp_keys = ['source', 'author', 'content', 'title']
news_data = {
    files: [{key: entry[key] for key in imp_keys} for entry in src_data[files]]
    for files in src_data
}


# Loading labels file

In [None]:
path = "/content/drive/My Drive/nela-gt-2019-json/labels.csv"
 
# Build {source name -> class id} from the second CSV column:
#   ""  -> 3 (no label), "1" -> 2, "2" -> 1, anything else -> 0.
# The final else branch was missing, so sources whose label is "0" never got an
# entry and later lookups (labels[temp]) raised KeyError for them.
labels = dict()
with open(path) as fin:
  fin.readline()  # skip the header row
  for line in fin:
    l = line.strip().split(",")
    source = l[0]
    if l[1] == "":
      labels[source] = 3
    elif l[1] == "1":
      labels[source] = 2
    elif l[1] == "2":
      labels[source] = 1
    else:
      labels[source] = 0

for s in sorted(labels):
  print(s,labels[s])

In [None]:
path = "/content/drive/My Drive/nela-gt-2019-json/labels.csv"
 
# Raw value of the second CSV column -> class id; values not listed map to 0.
label_codes = {"": 3, "1": 2, "2": 1}

labels = dict()
with open(path) as fin:
  fin.readline()  # header row
  for line in fin:
    l = line.strip().split(",")
    source = l[0]
    labels[source] = label_codes.get(l[1], 0)

# Show the mapping in alphabetical order of source name.
for s, code in sorted(labels.items()):
  print(s,code)

# Data Preprocessing

In [None]:
# Patterns applied by preprocess(); each match is replaced with a space.
# regex1: http/https URLs.
regex1 = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',re.DOTALL)
# regex2: "{| ... |}" spans.
regex2 = re.compile(r'{\|(.*?)\|}',re.DOTALL)
# regex3: "{{cite ...}}" / "{{vcite ...}}" spans.
regex3 = re.compile(r'{{v?cite(.*?)}}',re.DOTALL)
# regex4: common punctuation.
regex4 = re.compile(r'[-.,:;_?()"/\']',re.DOTALL)
# regex5: "[[file: ...]]" spans.
regex5 = re.compile(r'\[\[file:(.*?)\]\]',re.DOTALL)
# regex6: remaining symbols. The previous class contained "%-^", which Python
# reads as the RANGE '%'..'^' and therefore also matched every digit and
# uppercase letter. The symbols are now listed explicitly (including the
# punctuation that range used to cover), so only punctuation is matched.
regex6 = re.compile(r"[~`!@#$%^&*()+=\-{}\[\]|\\<>/?.,:;']",re.DOTALL)

# regex7: any "{{ ... }}" span (not referenced by preprocess()).
regex7 = re.compile(r'{{(.*?)}}',re.DOTALL)
# regex8: "<...>" spans such as HTML tags.
regex8 = re.compile(r'<(.*?)>',re.DOTALL)

In [None]:
# All patterns are raw strings: the previous plain literals contained invalid
# escape sequences (\w, \d, \;, ...) which newer Python versions warn about.
# The compiled regular expressions match exactly the same text as before.

# "&word" / "@word" tokens and individual punctuation characters.
no_space = re.compile(r"(&(\w*))|(@(\w*))|(\;)|(\')|(#)|(\.)|(\;)|(\:)|(\!)|(\*)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])")
# Hyphen or forward slash.
space = re.compile(r"(\-)|(\/)")
# A single ASCII letter standing alone as a word (despite the name).
single_digits = re.compile(r"\b[A-Za-z]\b")
# Runs of digits.
digits = re.compile(r"\d+")
# Runs of whitespace.
extra_spaces = re.compile(r'\s+')
# A literal backslash.
backslash = re.compile(r'\\')

def preprocess_reviews(content):
    """Normalise an iterable of text lines.

    Each line is lower-cased, stripped of the no_space matches, then digit
    runs and backslashes become a space and whitespace runs collapse to one
    space.

    Args:
        content: iterable of strings.

    Returns:
        list of cleaned strings, one per input line, in the same order.
    """
    content = [no_space.sub("", line.lower()) for line in content]
    content = [digits.sub(" ", line) for line in content]
    content = [backslash.sub(" ", line) for line in content]
    content = [extra_spaces.sub(" ", line) for line in content]

    return content

In [None]:
def preprocess(title):
  """Clean one piece of text for vectorisation.

  The text is lower-cased; structured spans (URLs, "{| |}", "{{cite}}",
  "[[file:]]", "<...>") are removed first, then punctuation, digits and
  backslashes are replaced with spaces and whitespace runs are collapsed.

  Args:
      title: the raw text (str). Despite the parameter name, callers in this
        notebook pass article content.

  Returns:
      The cleaned text (str).
  """
  title = title.lower()
  # Structured spans must go BEFORE the punctuation strippers. Previously
  # regex5 ran after regex4 had removed ':' and regex8 ran after regex6 had
  # removed '<' and '>', so neither pattern could ever match.
  title = regex1.sub(' ', title)
  title = regex2.sub(' ', title)
  title = regex3.sub(' ', title)
  title = regex5.sub(' ', title)
  title = regex8.sub(' ', title)
  # Punctuation, digits and leftover symbols.
  title = regex4.sub(' ', title)
  title = regex6.sub(' ', title)
  title = digits.sub(' ', title)
  title = no_space.sub(' ', title)
  title = backslash.sub(' ', title)
  # Collapse whitespace last so that no later step re-introduces runs.
  title = extra_spaces.sub(' ', title)
  return title

## Stemming using PyStemmer

In [None]:
!pip install PyStemmer
import Stemmer
stemmer = Stemmer.Stemmer('english')
def stem(sentence):
  word = stemmer.stemWord(sentence)
  return word



## Loading dataset in a dataframe

In [None]:
# Build the dataframe used by the 4-class experiments: up to 501 articles per
# source (indices 0..500), with cleaned + stemmed content and the source label.
#
# The previous version skipped every source whose label was not 0 or 1, which
# contradicts the 4-class section that consumes this dataframe (and its
# recorded results). All sources are loaded here.
r1 = re.compile(r'\-')
src=[]
auth=[]
cont=[]
title=[]
lab=[]
imp_keys = ['author', 'content', 'id', 'title']
for files in src_data:
 print(files)
 # File name minus the 5-character ".json" suffix, hyphens removed, is the
 # key used in `labels`. Computed once per file instead of once per article.
 temp = files[0: len(files)-5]
 temp = r1.sub('', temp)
 source_label = labels[temp]
 for count, entry in enumerate(src_data[files]):
  if count > 500:
    break
  auth.append(entry['author'])
  cont.append(stem(preprocess(entry['content'])))
  title.append(entry['title'])
  lab.append(source_label)


df = pd.DataFrame(list(zip(auth, cont, title, lab)),
 columns =['author', 'content', 'title', 'labels'])

# 4 Class Classification (Reliable, Unreliable, Mixed, Unlabelled)

In [None]:
# Work on an independent deep copy so `df` itself is left untouched.
X = df.copy(deep=True)

In [None]:
# Target and feature columns. Reading from `df` (instead of re-binding X from
# itself) keeps this cell re-runnable: the old `X = X['content']` turned X into
# a Series, so a second run failed on X['labels'].
y = df['labels'].copy()
X = df['content'].copy()

## Train Test split

In [None]:
from sklearn.model_selection import train_test_split
# 80/20 split with a fixed random_state so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

## Tfidf Vectorizer

In [None]:
# Fit the TF-IDF vocabulary on the training texts only, then apply the same
# mapping to the test texts.
# NOTE(review): X_train / X_test are re-bound from text to matrices here, so
# this cell cannot be re-run without re-running the split cell first.
vectorizer = TfidfVectorizer(stop_words = 'english') #, ngram_range=(1, 2))
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Models

# Logistic Regression with Tfidf Vectorizer

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Sweep the regularisation strength C and keep the best model. Previously
# model_lr / accuracy_lr simply held whatever C came LAST in the list (0.1),
# which is what the summary table then reported.
# NOTE(review): C is selected on the test split; a validation split would give
# an unbiased estimate.
c = [0.001, 0.01, 0.05, 0.5, 0.1]
best_c_lr, best_accuracy_lr, best_model_lr = None, -1.0, None
for i in c:
    model_lr = LogisticRegression(C = i, max_iter = 1000)
    model_lr.fit(X_train, y_train)
    y_pred = model_lr.predict(X_test)
    accuracy_lr = accuracy_score(y_test, y_pred)
    print ("C = ", i, " Accuracy: ", accuracy_lr)
    if accuracy_lr > best_accuracy_lr:
        best_c_lr, best_accuracy_lr, best_model_lr = i, accuracy_lr, model_lr

# Expose the best run under the names the rest of the notebook reads.
model_lr, accuracy_lr = best_model_lr, best_accuracy_lr
y_pred = model_lr.predict(X_test)
print ("Best C = ", best_c_lr, " Accuracy: ", accuracy_lr)

C =  0.001  Accuracy:  0.3787083645710614
C =  0.01  Accuracy:  0.48779051087072933
C =  0.05  Accuracy:  0.5860822534004498
C =  0.5  Accuracy:  0.661802506158295
C =  0.1  Accuracy:  0.6160704723144479


# SVM with Tfidf Vectorizer

In [None]:
from sklearn.svm import LinearSVC

# Sweep C for a linear SVM and keep the best model. Previously model_svm /
# accuracy_svm held the result for the LAST C in the list (0.1) rather than
# the best one, and that value fed the summary table.
# NOTE(review): C is selected on the test split; a validation split would give
# an unbiased estimate.
c = [0.001, 0.01, 0.05, 0.5, 0.1]
best_c_svm, best_accuracy_svm, best_model_svm = None, -1.0, None
for i in c:
    model_svm = LinearSVC(C=i)
    model_svm.fit(X_train, y_train)
    y_pred = model_svm.predict(X_test)
    accuracy_svm = accuracy_score(y_test, y_pred)
    print ("C = ", i, " Accuracy: ", accuracy_svm)
    if accuracy_svm > best_accuracy_svm:
        best_c_svm, best_accuracy_svm, best_model_svm = i, accuracy_svm, model_svm

# Expose the best run under the names the rest of the notebook reads.
model_svm, accuracy_svm = best_model_svm, best_accuracy_svm
y_pred = model_svm.predict(X_test)
print ("Best C = ", best_c_svm, " Accuracy: ", accuracy_svm)

C =  0.001  Accuracy:  0.4578022919567313
C =  0.01  Accuracy:  0.5997108278890436
C =  0.05  Accuracy:  0.6540376994752062
C =  0.5  Accuracy:  0.6860072828531648
C =  0.1  Accuracy:  0.6700492663596445


# Logistic Regression with Count Vectorizer

In [None]:
# Rebuild the dataframe from new_dict: up to 501 articles per source file
# (indices 0..500), cleaned and stemmed, labelled via the source name.
r1 = re.compile(r'\-')
src, auth, cont, title, lab = [], [], [], [], []
imp_keys = ['author', 'content', 'id', 'title']
for files, entries in new_dict.items():
    for count, entry in enumerate(entries):
        if count > 500:
            break
        auth.append(entry['author'])
        cont.append(stem(preprocess(entry['content'])))
        title.append(entry['title'])
        # label key = file name without its last 5 characters and without '-'
        temp = r1.sub('', files[0: len(files)-5])
        lab.append(labels[temp])

df = pd.DataFrame(
    list(zip(auth, cont, title, lab)),
    columns=['author', 'content', 'title', 'labels'],
)

In [None]:
# Independent deep copy of the freshly built dataframe.
X = df.copy(deep=True)

In [None]:
# Target and feature columns, taken from `df` so the cell can be re-run (the
# old `X = X['content']` made a second run fail on X['labels']).
y = df['labels'].copy()
X = df['content'].copy()

In [None]:
from sklearn.model_selection import train_test_split
# Same 80/20 split and seed as the TF-IDF experiments.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [None]:
# Bag-of-words counts: vocabulary is fitted on the training texts only.
# NOTE(review): X_train / X_test change from text to matrices here, so the
# split cell has to be re-run before this cell can be run again.
vectorizer = CountVectorizer(stop_words = 'english')
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Logistic regression on count features. The accuracy is stored under its own
# name: writing to `accuracy_lr` here silently replaced the TF-IDF result that
# the summary table reports as "Logistic Regression using TfIdf vectorizer".
c = [0.001, 0.01, 0.05, 0.5, 0.1]
for i in c:
    model_lr = LogisticRegression(C = i, max_iter = 1000)
    model_lr.fit(X_train, y_train)
    y_pred = model_lr.predict(X_test)
    accuracy_lr_count = accuracy_score(y_test, y_pred)
    print ("C = ", i, " Accuracy: ", accuracy_lr_count)

C =  0.001  Accuracy:  0.6027944111776448
C =  0.01  Accuracy:  0.6393067523489606
C =  0.05  Accuracy:  0.6437369164110803


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


C =  0.5  Accuracy:  0.6314200866559564
C =  0.1  Accuracy:  0.6391607029842753


# SVM with Count Vectorizer

In [None]:
from sklearn.svm import LinearSVC

# Linear SVM on count features. The accuracy is stored under its own name:
# writing to `accuracy_svm` here silently replaced the TF-IDF result that the
# summary table reports as "SVM using TfIdf vectorizer".
c = [0.001, 0.01, 0.05, 0.5, 0.1]
for i in c:
    model_svm = LinearSVC(C=i)
    model_svm.fit(X_train, y_train)
    y_pred = model_svm.predict(X_test)
    accuracy_svm_count = accuracy_score(y_test, y_pred)
    print ("C = ", i, " Accuracy: ", accuracy_svm_count)

C =  0.001  Accuracy:  0.4578022919567313
C =  0.01  Accuracy:  0.5997108278890436
C =  0.05  Accuracy:  0.6540376994752062
C =  0.5  Accuracy:  0.6860072828531648
C =  0.1  Accuracy:  0.6700492663596445


# Multinomial Naive Bayes with Tfidf Vectorizer

In [None]:
# X_train / X_test are already CountVectorizer matrices at this point, so a
# TfidfVectorizer (which expects raw text) cannot be fitted on them.
# TfidfTransformer re-weights an existing count matrix; CountVectorizer
# followed by TfidfTransformer is what TfidfVectorizer does internally.
# NOTE: run this cell once per split - a second run would re-weight the
# already re-weighted matrices.
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer()
X_train = tfidf_transformer.fit_transform(X_train)
X_test = tfidf_transformer.transform(X_test)

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes with default hyper-parameters, fitted on the current
# training matrix.
MNB = MultinomialNB().fit(X_train, y_train)

In [None]:
# Predicted class ids for the test split (displayed as the cell output).
y_pred = MNB.predict(X_test)
y_pred

array([0, 0, 2, ..., 0, 0, 0])

In [None]:
from sklearn.metrics import accuracy_score
# Accuracy of the Naive Bayes predictions; read later by the summary table.
accuracy_mnb = accuracy_score(y_test, y_pred)
accuracy_mnb

0.4638527822403973

# GloVe embeddings

In [None]:
path = "/content/drive/My Drive/nela-gt-2019-json/labels.csv"
# Two-way labels for the GloVe model: a source is 0 only when its CSV label
# parses to 0; a missing label or any non-zero label becomes 1.
# NOTE(review): this re-binds the global `labels`, which later cells test
# against the values 2 and 3 - those values no longer occur after this cell.
labels = dict()
with open(path) as fin:
  fin.readline()  # header row
  for line in fin:
    l = line.strip().split(",")
    source = l[0]
    labels[source] = 1 if (l[1] == "" or int(l[1]) != 0) else int(l[1])
 
for s, collapsed in sorted(labels.items()):
 print(s,collapsed)

In [None]:
# Small sample for the GloVe model: up to 21 articles per source file
# (indices 0..20), cleaned and stemmed, labelled via the source name.
r1 = re.compile(r'\-')
src, auth, cont, title, lab = [], [], [], [], []
imp_keys = ['author', 'content', 'id', 'title']
for files, entries in src_data.items():
    for count, entry in enumerate(entries):
        if count > 20:
            break
        auth.append(entry['author'])
        cont.append(stem(preprocess(entry['content'])))
        title.append(entry['title'])
        # label key = file name without its last 5 characters and without '-'
        temp = r1.sub('', files[0: len(files)-5])
        lab.append(labels[temp])

df = pd.DataFrame(
    list(zip(auth, cont, title, lab)),
    columns=['author', 'content', 'title', 'labels'],
)

In [None]:
import copy
# Independent deep copy of the GloVe sample dataframe.
X = df.copy(deep=True)

In [None]:
# Plain Python list of the cleaned article texts, in dataframe order.
data = list(X['content'])

In [None]:
from numpy import array
import keras
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer

corpus = data
from nltk.tokenize import word_tokenize

# Informational only: vocabulary size as seen by the NLTK tokenizer.
unique_words = set()
for sent in corpus:
    unique_words.update(word_tokenize(sent))
print(len(unique_words))

# Integer-encode the corpus with the Keras tokenizer; index 0 is reserved for
# padding, hence the +1 on the vocabulary size.
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(corpus)
vocab_length = len(word_tokenizer.word_index) + 1
print("vocab_length : ",vocab_length)
embedded_sentences = word_tokenizer.texts_to_sequences(corpus)

# Pad to the longest ENCODED sequence. The length used to be measured with
# NLTK's word_tokenize, a different tokenizer from the one that produced the
# sequences, so the two could disagree (any sequence longer than that length
# is truncated by pad_sequences). Measuring the sequences directly is exact
# and avoids tokenizing the whole corpus two more times.
longest_index = max(range(len(embedded_sentences)), key=lambda k: len(embedded_sentences[k]))
longest_sentence = corpus[longest_index]
length_long_sentence = len(embedded_sentences[longest_index])
padded_sentences = pad_sequences(embedded_sentences, length_long_sentence, padding='post')
print(padded_sentences)

In [None]:
# Sanity check: container type and (documents, padded length) of the input.
print(type(padded_sentences))
padded_sentences.shape

<class 'numpy.ndarray'>


(5256, 12271)

In [None]:
# Labels of the GloVe sample, in the same row order as padded_sentences.
y = X['labels']

In [None]:
# Positional 80/20 split (no shuffling, as before). The boundary is derived
# from the data instead of the hard-coded 4200, which was only valid for one
# particular corpus size and would leave the test set empty on a smaller one.
split_at = int(len(padded_sentences) * 0.8)
X_train = np.array(padded_sentences[0:split_at])
X_test = np.array(padded_sentences[split_at:])
y_train = np.array(y[0:split_at])
y_test = np.array(y[split_at:])

In [None]:
# Shapes of the four arrays produced by the split above.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(4200, 12271)
(1056, 12271)
(4200,)
(1056,)


In [None]:
from numpy import asarray
from numpy import zeros

# Parse the GloVe text file into {word: float32 vector}. Each line is the word
# followed by its vector components, separated by whitespace.
embeddings_dictionary = dict()
# `with` closes the file even if parsing raises part-way through.
with open('/content/drive/My Drive/SMAI_Final_Assignment/Q1/glove.6B/glove.6B.100d.txt', encoding="utf8") as glove_file:
    for line in glove_file:
        records = line.split()
        if not records:
            # blank line: nothing to parse (records[0] would raise IndexError)
            continue
        word = records[0]
        vector_dimensions = asarray(records[1:], dtype='float32')
        embeddings_dictionary [word] = vector_dimensions

In [None]:
# Row i holds the 100-d GloVe vector of the word with tokenizer index i;
# words without a GloVe entry keep an all-zero row.
embedding_matrix = zeros((vocab_length, 100))
for word, index in word_tokenizer.word_index.items():
    try:
        embedding_vector = embeddings_dictionary[word]
    except KeyError:
        continue
    embedding_matrix[index] = embedding_vector

In [None]:
# Frozen GloVe embedding -> flatten -> single sigmoid unit (binary output).
model = Sequential()
embedding_layer = Embedding(vocab_length, 100, weights=[embedding_matrix], input_length=length_long_sentence, trainable=False)
model.add(embedding_layer)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summary() prints the table itself and returns None; wrapping it in print()
# added a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 12271, 100)        7757800   
_________________________________________________________________
flatten (Flatten)            (None, 1227100)           0         
_________________________________________________________________
dense (Dense)                (None, 1)                 1227101   
Total params: 8,984,901
Trainable params: 1,227,101
Non-trainable params: 7,757,800
_________________________________________________________________
None


In [None]:
# Train for 10 epochs on the training split; no validation data is passed.
model.fit(X_train, y_train, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f28340a9eb8>

In [None]:
from sklearn.metrics import accuracy_score
# Raw model outputs for the test split; the next cell reads element [0] of
# each row as the score to threshold.
y_pred = model.predict(X_test)

In [None]:
# Turn each score into a hard label: 0 when the score is at least as close to
# 0 as it is to 1, otherwise 1.
predictions = [
    0 if (1.0 - row[0]) >= (row[0] - 0.0) else 1
    for row in y_pred
]

In [None]:
# Accuracy of the thresholded GloVe predictions; read by the summary table.
accuracy_glove = accuracy_score(y_test, predictions)
accuracy_glove

0.6316287878787878

# Summary of the models

In [None]:
# One row per model: [description, accuracy].
# NOTE(review): accuracy_lr and accuracy_svm hold whatever the most recently
# executed LR / SVM cell assigned. In top-to-bottom order that is the Count
# Vectorizer section, not the TfIdf one named in these rows - confirm which
# run these values come from before quoting the table.
a1 = ["Logistic Regression using TfIdf vectorizer", accuracy_lr]
a2 = ["SVM using TfIdf vectorizer", accuracy_svm]
a3 = ["Multinomial Naive Bayes using TfIdf vectorizer", accuracy_mnb]
a4 = ["Glove 100d embeddings", accuracy_glove]
# NOTE(review): `data` and `df` are re-used here for the summary rows/table,
# replacing the corpus list and article dataframe of the same names.
data = [a1, a2, a3, a4]
df = pd.DataFrame(data, columns = ['Model', 'Accuracy'])
df



Unnamed: 0,Model,Accuracy
0,Logistic Regression using TfIdf vectorizer,0.661803
1,SVM using TfIdf vectorizer,0.686007
2,Multinomial Naive Bayes using TfIdf vectorizer,0.463853
3,Glove 100d embeddings,0.631629


# BINARY CLASSIFICATION MODELS

## We use only reliable and unreliable classes

## Train Test split

In [None]:
from sklearn.model_selection import train_test_split

# At this point X, y and `labels` still belong to the GloVe section (X is a
# whole dataframe of the 21-per-source sample and `labels` is collapsed to
# 0/1), so splitting them does not give the reliable-vs-unreliable data this
# section describes. Rebuild everything explicitly instead of relying on
# leftover kernel state.

# 1) Restore the 4-way source labels: "" -> 3, "1" -> 2, "2" -> 1, else 0.
labels = dict()
with open("/content/drive/My Drive/nela-gt-2019-json/labels.csv") as fin:
  fin.readline()  # header row
  for line in fin:
    l = line.strip().split(",")
    labels[l[0]] = {"": 3, "1": 2, "2": 1}.get(l[1], 0)

# 2) Collect up to 501 cleaned + stemmed articles from every source labelled
#    0 or 1 only.
cont = []
lab = []
for files in src_data:
  temp = files[0: len(files)-5].replace('-', '')
  if labels[temp] not in [0, 1]:
    continue
  for count, entry in enumerate(src_data[files]):
    if count > 500:
      break
    cont.append(stem(preprocess(entry['content'])))
    lab.append(labels[temp])

df_binary = pd.DataFrame({'content': cont, 'labels': lab})
X = df_binary['content']
y = df_binary['labels']

# 3) Same 80/20 split and seed as the other experiments.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

## Tfidf Vectorizer

In [None]:
# TF-IDF features for the binary task; this fitted `vectorizer` is re-used
# further down to transform the mixed/unlabelled articles.
vectorizer = TfidfVectorizer(stop_words = 'english') #, ngram_range=(1, 2))
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Logistic Regression with Tfidf Vectorizer

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

# Sweep C for logistic regression and report accuracy and micro-averaged F1
# for every value. After the loop the variables hold the last C tried.
# NOTE(review): for single-label predictions micro-F1 equals accuracy, so the
# second printed line carries no extra information.
c = [0.001, 0.01, 0.05, 0.5, 0.1]
for i in c:
    model_lr = LogisticRegression(C=i, max_iter=1000).fit(X_train, y_train)
    y_pred = model_lr.predict(X_test)
    accuracy_lr = accuracy_score(y_test, y_pred)
    f1_lr = f1_score(y_test, y_pred, average='micro')
    for metric_name, metric_value in ((" Accuracy: ", accuracy_lr), (" F1 score: ", f1_lr)):
        print ("C = ", i, metric_name, metric_value)

C =  0.001  Accuracy:  0.6498130674737405
C =  0.001  F1 score:  0.6498130674737405
C =  0.01  Accuracy:  0.6600498486736692
C =  0.01  F1 score:  0.6600498486736692
C =  0.05  Accuracy:  0.7557414990208297
C =  0.05  F1 score:  0.7557414990208297
C =  0.5  Accuracy:  0.8396831048602457
C =  0.5  F1 score:  0.8396831048602458
C =  0.1  Accuracy:  0.7902795086345024
C =  0.1  F1 score:  0.7902795086345024


# SVM with Tfidf Vectorizer

In [None]:
from sklearn.svm import LinearSVC

# Sweep C for a linear SVM and report accuracy and micro-averaged F1 for every
# value. After the loop model_svm is the model for the last C in the list.
c = [0.001, 0.01, 0.05, 0.5, 0.75, 1]
for i in c:
    model_svm = LinearSVC(C=i).fit(X_train, y_train)
    y_pred = model_svm.predict(X_test)
    accuracy_svm = accuracy_score(y_test, y_pred)
    f1_svm = f1_score(y_test, y_pred, average='micro')
    for metric_name, metric_value in ((" Accuracy: ", accuracy_svm), (" F1 score: ", f1_svm)):
        print ("C = ", i, metric_name, metric_value)

C =  0.001  Accuracy:  0.6550649813067474
C =  0.001  F1 score:  0.6550649813067474
C =  0.01  Accuracy:  0.7828912230728147
C =  0.01  F1 score:  0.7828912230728147
C =  0.05  Accuracy:  0.840484244258501
C =  0.05  F1 score:  0.840484244258501
C =  0.5  Accuracy:  0.8663877514687556
C =  0.5  F1 score:  0.8663877514687556
C =  0.75  Accuracy:  0.8654975965818053
C =  0.75  F1 score:  0.8654975965818053
C =  1  Accuracy:  0.8644294107174648
C =  1  F1 score:  0.8644294107174648


# Multinomial Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Naive Bayes baseline with default hyper-parameters on the TF-IDF features.
MNB = MultinomialNB().fit(X_train, y_train)

In [None]:
# Test-split accuracy of the Naive Bayes baseline (overwrites the earlier
# accuracy_mnb value).
y_pred = MNB.predict(X_test)
accuracy_mnb = accuracy_score(y_test, y_pred)
accuracy_mnb

0.7334876268470714

# Loading the mixed and unlabelled categories

In [None]:
# Collect the articles of every source whose label is 2 or 3 (the
# mixed / unlabelled categories named in the heading above), up to 501 per
# source. The row order produced here is relied on later, when the predicted
# labels are consumed positionally.
r1 = re.compile(r'\-')
src=[]
auth=[]
cont=[]
title=[]
lab=[]
imp_keys = ['author', 'content', 'id', 'title']
for files in src_data:
 count = 0
 # label key: file name without its last 5 characters, hyphens removed
 temp = files[0: len(files)-5]
 temp = r1.sub('', temp)
 # skip sources that already have a 0/1 label
 if labels[temp] not in {2, 3}:
   continue
 print(files)
 for entry in src_data[files]:
  # stop after 501 articles (count runs 0..500) for this source
  if count > 500:
    break
  # src.append(entry['source'])
  auth.append(entry['author'])
  text = entry['content']
  text = preprocess(text)
  # text = lemmatize_sentence(text)
  text = stem(text)
  cont.append(text)
  # id.append(entry['id'])
  title.append(entry['title'])
  # print(files)
  temp = files[0: len(files)-5]
  temp = r1.sub('', temp)
  lab.append(labels[temp])
  count += 1
 
 
df_unlabelled = pd.DataFrame(list(zip(auth, cont, title, lab)), 
 columns =['author', 'content', 'title', 'labels'])

In [None]:
# Cleaned text of the mixed/unlabelled articles.
X_unlabelled = df_unlabelled['content']

In [None]:
# Project onto the vocabulary of the already-fitted `vectorizer` (transform
# only, no re-fitting).
# NOTE(review): X_unlabelled is re-bound from text to a matrix, so the previous
# cell must be re-run before this one can be run again.
X_unlabelled = vectorizer.transform(X_unlabelled)

## Predicting the binary classes for Mixed and Unlabelled categories using our trained model

In [None]:
from sklearn.svm import LinearSVC

# Train the final binary SVM with C=0.5 and use THAT model to label the
# mixed/unlabelled articles. The prediction used to be made with `model_svm`,
# i.e. the leftover model from the last iteration of the C sweep, so
# final_model_svm was fitted but never used.
final_model_svm = LinearSVC(C=0.5)
final_model_svm.fit(X_train, y_train)
y_pred_unlabelled = final_model_svm.predict(X_unlabelled)

In [None]:
# Display the predicted binary labels for the mixed/unlabelled articles.
y_pred_unlabelled

array([0, 0, 0, ..., 0, 0, 0])

# Appending predicted labels in dataframe.
## Our final dataframe would have binary labels (0 : reliable, 1 : unreliable)

In [None]:
# Build the final dataframe over ALL sources (up to 501 articles each).
# Articles from sources labelled 0 or 1 keep that label; every other article
# takes the next value of y_pred_unlabelled.
# NOTE(review): count1 walks y_pred_unlabelled positionally, so this loop must
# visit the non-0/1 articles in exactly the order in which df_unlabelled was
# built (same src_data iteration order, same per-source cap).
r1 = re.compile(r'\-')
src=[]
auth=[]
cont=[]
title=[]
lab=[]
# index of the next unused entry of y_pred_unlabelled
count1 = 0
imp_keys = ['author', 'content', 'title']
for files in src_data:
 count = 0
 print(files)
 for entry in src_data[files]:
  # stop after 501 articles (count runs 0..500) for this source
  if count > 500:
    break
  # src.append(entry['source'])
  auth.append(entry['author'])
  text = entry['content']
  text = preprocess(text)
  # text = lemmatize_sentence(text)
  text = stem(text)
  cont.append(text)
  title.append(entry['title'])
  # print(files)
  # label key: file name without its last 5 characters, hyphens removed
  temp = files[0: len(files)-5]
  temp = r1.sub('', temp)
  if labels[temp] in {0, 1}:
    lab.append(labels[temp])
  else:
    lab.append(y_pred_unlabelled[count1])
    count1 += 1
  count += 1
 
 
df_final = pd.DataFrame(list(zip(auth, cont, title, lab)), 
 columns =['author', 'content', 'title', 'labels'])

In [None]:
# Display the final dataframe (rich preview of the first/last rows).
df_final

Unnamed: 0,author,content,title,labels
0,21wire,journalist robert inlakesh reported live from ...,WATCH: Londoners Protest UK Government’s Regim...,1
1,21wire,if you watch the sci tech news regularly you w...,"Digital Wonderland: AI, Transhumanism and Faux...",1
2,21wire,yesterday russia and china carried out what it...,South Korean Air Force Fires Warning Shots at ...,1
3,21wire,kurdish fighters have been used by the us to o...,Will Kurds merge with ISIS as U.S combines pro...,1
4,Nina Cross,image jeremy hunt and amal clooney campaignin...,What’s Behind Jeremy Hunt’s Choice of Amal Clo...,1
...,...,...,...,...
102697,Tyler Durden,a record number of ceos left their positions i...,"""Maybe The Rich See The Writing On The Wall"": ...",1
102698,Tyler Durden,perhaps anticipating the moment iran has to in...,"""Wake-Up Call"" To Europe: Iran Vows Increase O...",1
102699,Tyler Durden,the swedish government will hand out the equiv...,"Swedish Government Grants $175,000 To Fund Dra...",1
102700,Tyler Durden,french president emmanuel macron s now viral e...,"Maybe Just A Coma? Russia Reacts To Macron's ""...",1


# Testing the final model

In [None]:
# Features and targets of the final dataset. The labels column mixes original
# 0/1 source labels with labels predicted by the SVM above.
X = df_final['content']
y = df_final['labels']

In [None]:
from sklearn.model_selection import train_test_split
# Same 80/20 split and seed as the earlier experiments.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

## Tfidf Vectorizer

In [None]:
# Fresh TF-IDF vocabulary fitted on the final training texts; this replaces
# the `vectorizer` fitted for the binary task.
vectorizer = TfidfVectorizer(stop_words = 'english') #, ngram_range=(1, 2))
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Logistic Regression with Tfidf Vectorizer

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

# Logistic regression sweep on the final dataset: accuracy and micro-averaged
# F1 are printed for every C. The variables keep the values of the last C.
c = [0.001, 0.01, 0.05, 0.5, 0.1]
for i in c:
    model_lr = LogisticRegression(C=i, max_iter=1000).fit(X_train, y_train)
    y_pred = model_lr.predict(X_test)
    accuracy_lr = accuracy_score(y_test, y_pred)
    f1_lr = f1_score(y_test, y_pred, average='micro')
    for metric_name, metric_value in ((" Accuracy: ", accuracy_lr), (" F1 score: ", f1_lr)):
        print ("C = ", i, metric_name, metric_value)

C =  0.001  Accuracy:  0.6396475341998928
C =  0.001  F1 score:  0.6396475341998928
C =  0.01  Accuracy:  0.6851175697385716
C =  0.01  F1 score:  0.6851175697385716
C =  0.05  Accuracy:  0.7840416727520568
C =  0.05  F1 score:  0.7840416727520568
C =  0.5  Accuracy:  0.8669977118932866
C =  0.5  F1 score:  0.8669977118932866
C =  0.1  Accuracy:  0.8143712574850299
C =  0.1  F1 score:  0.8143712574850299


In [None]:
# (training documents, vocabulary size) of the TF-IDF matrix.
X_train.shape

(82161, 220511)

In [None]:
# (test documents, vocabulary size); the column count matches X_train.
X_test.shape

(20541, 220511)

# SVM with Tfidf Vectorizer

In [None]:
from sklearn.svm import LinearSVC

# Linear SVM sweep on the final dataset: accuracy and micro-averaged F1 are
# printed for every C. y_pred is left holding the predictions for the last C
# in the list, which is what the classification report below evaluates.
c = [0.001, 0.01, 0.05, 0.5, 0.75, 1]
for i in c:
    model_svm = LinearSVC(C=i).fit(X_train, y_train)
    y_pred = model_svm.predict(X_test)
    accuracy_svm = accuracy_score(y_test, y_pred)
    f1_svm = f1_score(y_test, y_pred, average='micro')
    for metric_name, metric_value in ((" Accuracy: ", accuracy_svm), (" F1 score: ", f1_svm)):
        print ("C = ", i, metric_name, metric_value)

C =  0.001  Accuracy:  0.67314152183438
C =  0.001  F1 score:  0.67314152183438
C =  0.01  Accuracy:  0.8090647972347987
C =  0.01  F1 score:  0.8090647972347987
C =  0.05  Accuracy:  0.866316148191422
C =  0.05  F1 score:  0.866316148191422
C =  0.5  Accuracy:  0.9068205053308018
C =  0.5  F1 score:  0.9068205053308018
C =  0.75  Accuracy:  0.9068205053308018
C =  0.75  F1 score:  0.9068205053308018
C =  1  Accuracy:  0.9057494766564432
C =  1  F1 score:  0.9057494766564432


In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 for the current y_pred, i.e. the
# predictions of the last model fitted in the SVM sweep above (C = 1).
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.94      0.93     13139
           1       0.89      0.84      0.87      7402

    accuracy                           0.91     20541
   macro avg       0.90      0.89      0.90     20541
weighted avg       0.91      0.91      0.91     20541

