# News Classification Model using Keras

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# install all packages needed

#!pip3 install sklearn --upgrade
#!pip3 install pickle --update
!pip3 install transformers
!pip3 install pymysql



In [3]:
# import all libraries needed

import pandas as pd
from pandas import read_csv
import os
import spacy
import numpy as np
import matplotlib.pyplot as plt

# nltk used for parsing and cleaning text
import nltk
import unicodedata
import string
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.probability import FreqDist
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from difflib import SequenceMatcher
from scipy import spatial
from itertools import combinations

# used to acccess the sql database
import pymysql
# library that helps turn dataframes into sql tables
from sqlalchemy import create_engine


import sklearn as sk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, matthews_corrcoef
from sklearn.linear_model import LogisticRegression

import pickle

## for deep learning
import tensorflow as tf
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding, Input, LSTM, Conv1D, MaxPool1D, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras import models, layers, preprocessing as kprocessing
from tensorflow.keras import backend as K

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
current_dir = os.getcwd()
current_dir

'/content'

In [5]:
# create connection
connection = pymysql.connect(host='news-data-rdb.cqsnaejqwcpu.ap-southeast-2.rds.amazonaws.com',
                             user='admin',
                             password='badpassword1',
                             db='news_data')

In [6]:
# SQL query to retreive our data for the model
SQL_Query = pd.read_sql_query(
        '''
        select
        *
        from new_table
        ''', connection)

In [7]:
SQL_Query

Unnamed: 0,news_id,text,label
0,0,house aide comey letter jason chaffetz tweeted...,1
1,1,flynn hillary clinton woman campus breitbart f...,0
2,2,truth fired truth fired october tension intell...,1
3,3,civilians killed single airstrike identified v...,1
4,4,iranian woman jailed fictional unpublished sto...,1
...,...,...,...
941403,941403,buzzfeed year awkward convo dude texting trump...,1
941404,941404,littlebytesnews tcot news variety facebook too...,1
941405,941405,signsinyork getting right company logo busines...,1
941406,941406,latest obama affirms continuity ties canada ht...,1


In [8]:
news_sql_df = SQL_Query[0:20000]
news_sql_df

Unnamed: 0,news_id,text,label
0,0,house aide comey letter jason chaffetz tweeted...,1
1,1,flynn hillary clinton woman campus breitbart f...,0
2,2,truth fired truth fired october tension intell...,1
3,3,civilians killed single airstrike identified v...,1
4,4,iranian woman jailed fictional unpublished sto...,1
...,...,...,...
19995,19995,rembrandt proves chapter genesis rembrandt pro...,1
19996,19996,obama rising west antichrist obama rising west...,1
19997,19997,planet nibiru slovakia planet nibiru slovakia ...,1
19998,19998,book alien races exposed book alien races expo...,1


In [9]:
news_sql_df.to_csv(f'{current_dir}/drive/MyDrive/colab_data/news_sql_df.csv', index=False)

In [5]:
news_sql_df = pd.read_csv(f'{current_dir}/drive/MyDrive/colab_data/news_sql_df.csv', index_col=False)

In [6]:
news_sql_df

Unnamed: 0,news_id,text,label
0,0,house aide comey letter jason chaffetz tweeted...,1
1,1,flynn hillary clinton woman campus breitbart f...,0
2,2,truth fired truth fired october tension intell...,1
3,3,civilians killed single airstrike identified v...,1
4,4,iranian woman jailed fictional unpublished sto...,1
...,...,...,...
19995,19995,rembrandt proves chapter genesis rembrandt pro...,1
19996,19996,obama rising west antichrist obama rising west...,1
19997,19997,planet nibiru slovakia planet nibiru slovakia ...,1
19998,19998,book alien races exposed book alien races expo...,1


In [8]:
# Obtain the total words present in the dataset
list_of_words = []
for i in news_sql_df.text:
  words = i.split(' ')
  for j in words:
      list_of_words.append(j)

In [9]:
# Obtain the total number of unique words
total_words = len(list(set(list_of_words)))
total_words

118348

In [10]:
# split data into test and train 
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(news_sql_df.text.to_list(), news_sql_df.label.to_list(), test_size = 0.1)

In [11]:
vectorizer = TfidfVectorizer(analyzer = 'word',
                            input = 'content',
                            lowercase = True,
                            token_pattern = '(?u)\\b\\w\\w+\\b',
                            min_df = 3,
                            ngram_range = (1,1))

In [12]:
vectorizer.fit(x_train)

x_train = vectorizer.transform(x_train).toarray() # shape - (3,6)
x_train = x_train[:, :, None] # shape - (3,6,1) since LSTM cells expects ndims = 3

x_test = vectorizer.transform(x_test).toarray() # shape - (3,6)
x_test = x_test[:, :, None] # shape - (3,6,1) since LSTM cells expects ndims = 3

In [13]:
# turn binary labels intp a numpy array
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

## Model

In [14]:
# Sequential Model
model = Sequential()

# embeddidng layer to do PCA 
# total_words = defined the total number of vocabs 
model.add(Embedding(total_words, output_dim = 64))

# Bi-Directional RNN and LSTM
model.add(Bidirectional(LSTM(64)))

# Dense layers
model.add(Dense(64, activation = 'relu'))
model.add(Dense(1, activation= 'sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 64)          7574272   
                                                                 
 bidirectional (Bidirectiona  (None, 128)              66048     
 l)                                                              
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 7,648,641
Trainable params: 7,648,641
Non-trainable params: 0
_________________________________________________________________


In [None]:
# train the model
#  validation_split = 0.1, use 10% of the data for cross validation
model.fit(x_train, y_train, batch_size = 16, epochs = 1)

   1/1125 [..............................] - ETA: 104:31:44 - loss: 0.6945 - acc: 0.3750

In [None]:
# make prediction
pred = model.predict(x_test)

In [None]:
# if the predicted value is >0.5 it is real else it is fake
prediction = []
for i in range(len(pred)):
    if pred[i].item() > 0.5:
        prediction.append(1)
    else:
        prediction.append(0)

In [None]:
# getting the accuracy
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(list(y_test), prediction)

print("Model Accuracy : ", accuracy)

### Transformer Library

In [3]:
from transformers import pipeline
import torch.nn.functional as F

In [None]:
from transformers.utils.dummy_pt_objects import AutoModelForSequenceClassification
model_name = AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained(model_name)