In [1]:
#import libraries
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Conv1D, SimpleRNN, Bidirectional, MaxPooling1D, GlobalMaxPool1D, LSTM, GRU
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, Dropout
from keras.layers import Dense, LSTM, Embedding
from tensorflow import keras
from tensorflow.keras.models import Model
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import to_categorical
from keras.preprocessing import sequence
from sklearn.utils.class_weight import compute_class_weight

In [2]:
# Load the JSON file into a pandas DataFrame
data = pd.read_json('./data/News_Category_Dataset_v3.json',lines=True)
data.head(5)

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [3]:
# #dropping unnecessary columns
new_df = data.drop(columns=['authors','link','date'])
new_df.head()

Unnamed: 0,headline,category,short_description
0,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...
1,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...
2,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha..."
3,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to..."
4,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...


In [4]:
# create final dataframe of combined headline and short_description
final_df = new_df.copy()
final_df['length_of_news'] = final_df['headline'] + final_df['short_description']
final_df.drop(['headline','short_description'], inplace=True, axis=1)
final_df['len_news'] = final_df['length_of_news'].apply(lambda x: len(str(x)))
final_df.head()


Unnamed: 0,category,length_of_news,len_news
0,U.S. NEWS,Over 4 Million Americans Roll Up Sleeves For O...,230
1,U.S. NEWS,"American Airlines Flyer Charged, Banned For Li...",248
2,COMEDY,23 Of The Funniest Tweets About Cats And Dogs ...,133
3,PARENTING,The Funniest Tweets From Parents This Week (Se...,215
4,U.S. NEWS,Woman Who Called Cops On Black Bird-Watcher Lo...,233


In [5]:
# final_df.drop(columns=['category'])
final_df.head()


Unnamed: 0,category,length_of_news,len_news
0,U.S. NEWS,Over 4 Million Americans Roll Up Sleeves For O...,230
1,U.S. NEWS,"American Airlines Flyer Charged, Banned For Li...",248
2,COMEDY,23 Of The Funniest Tweets About Cats And Dogs ...,133
3,PARENTING,The Funniest Tweets From Parents This Week (Se...,215
4,U.S. NEWS,Woman Who Called Cops On Black Bird-Watcher Lo...,233


In [6]:
# Preprocess the data
def preprocess_text(text):
    # Remove special characters and digits
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Convert to lowercase
    text = text.lower()
    # Split into words
    words = text.split()
    # Join the words
    preprocessed_text = ' '.join(words)
    return preprocessed_text

final_df['length_of_news'] = final_df['length_of_news'].apply(lambda x: preprocess_text(str(x)))


In [7]:
final_df['category_merged']=final_df['category'].replace({"HEALTHY LIVING": "WELLNESS",
"QUEER VOICES": "GROUPS VOICES",
"BUSINESS": "BUSINESS & FINANCES",
"PARENTS": "PARENTING",
"BLACK VOICES": "GROUPS VOICES",
"THE WORLDPOST": "WORLD NEWS",
"STYLE": "STYLE & BEAUTY",
"GREEN": "ENVIRONMENT",
"TASTE": "FOOD & DRINK",
"WORLDPOST": "WORLD NEWS",
"SCIENCE": "SCIENCE & TECH",
"TECH": "SCIENCE & TECH",
"MONEY": "BUSINESS & FINANCES",
"ARTS": "ARTS & CULTURE",
"COLLEGE": "EDUCATION",
"LATINO VOICES": "GROUPS VOICES",
"CULTURE & ARTS": "ARTS & CULTURE",
"FIFTY": "MISCELLANEOUS",
"GOOD NEWS": "MISCELLANEOUS"})

In [8]:
# one hot encoding using keras tokenizer and pad sequencing
X = final_df['length_of_news']
encoder = LabelEncoder()
y = encoder.fit_transform(final_df['category_merged'])
print("shape of input data: ", X.shape)
print("shape of target variable: ", y.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
tokenizer = Tokenizer(num_words=10000, oov_token='<00V>') 
tokenizer.fit_on_texts(X_train) # build the word index
# train_seq = tokenizer.texts_to_sequences(train_data)
# test_seq = tokenizer.texts_to_sequences(test_data)

print(type(X))

shape of input data:  (209527,)
shape of target variable:  (209527,)
<class 'pandas.core.series.Series'>


In [38]:
print(len(y))

209527


In [9]:
# padding X_train text input data
train_seq = tokenizer.texts_to_sequences(X_train) # converts strinfs into integer lists
train_padseq = pad_sequences(train_seq, maxlen=150) # pads the integer lists to 2D integer tensor 

# padding X_test text input data
test_seq = tokenizer.texts_to_sequences(X_test)
test_padseq = pad_sequences(test_seq, maxlen=150)
print(train_padseq.shape)



(167621, 150)


In [13]:
word_index = tokenizer.word_index
max_words = 50000  # total number of words to consider in embedding layer
total_words = len(word_index)
maxlen = 150 # max length of sequence 
num_classes = len(final_df['category_merged'].unique())
y_train = to_categorical(y_train, num_classes=num_classes)
y_test = to_categorical(y_test, num_classes=num_classes)

print("Length of word index:", total_words)

Length of word index: 158715


In [14]:
print(final_df['category_merged'].unique())


['U.S. NEWS' 'COMEDY' 'PARENTING' 'WORLD NEWS' 'ARTS & CULTURE'
 'SCIENCE & TECH' 'SPORTS' 'ENTERTAINMENT' 'POLITICS' 'WEIRD NEWS'
 'ENVIRONMENT' 'EDUCATION' 'CRIME' 'WELLNESS' 'BUSINESS & FINANCES'
 'STYLE & BEAUTY' 'FOOD & DRINK' 'MEDIA' 'GROUPS VOICES' 'HOME & LIVING'
 'WOMEN' 'TRAVEL' 'RELIGION' 'IMPACT' 'WEDDINGS' 'MISCELLANEOUS' 'DIVORCE']


In [15]:
print(np.shape(X_train))
print(y_train.shape)
from tensorflow.keras.layers import Input
# input_layer = Input(shape=(maxlen,))
input_layer = Input(shape=(20,))

(167621,)
(167621, 27)


In [48]:
from tensorflow.keras.layers import Reshape
# basline model using embedding layers and simpleRNN
model = Sequential()
model.add(Embedding(max_words, 70, input_length=maxlen))
model.add(Bidirectional(SimpleRNN(64, dropout=0.1, recurrent_dropout=0.20, activation='tanh', return_sequences=True)))
model.add(Bidirectional(SimpleRNN(64, dropout=0.1, recurrent_dropout=0.30, activation='tanh', return_sequences=True)))
model.add(SimpleRNN(32, activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(28, activation='softmax'))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 150, 70)           3500000   
                                                                 
 bidirectional (Bidirectiona  (None, 150, 128)         17280     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 150, 128)         24704     
 nal)                                                            
                                                                 
 simple_rnn_2 (SimpleRNN)    (None, 32)                5152      
                                                                 
 dropout (Dropout)           (None, 32)                0         
                                                                 
 dense (Dense)               (None, 28)                9

In [45]:
#Train the model
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy']
              )
# SETUP A EARLY STOPPING CALL and model check point API
earlystopping = keras.callbacks.EarlyStopping(monitor='accuracy',
                                              patience=5,
                                              verbose=1,
                                              mode='min'
                                              )
checkpointer = ModelCheckpoint(filepath='bestvalue',monitor='val_loss', verbose=0, save_best_only=True)
callback_list = [checkpointer, earlystopping]
# fit model to the data
history = model.fit(train_padseq, y_train, 
                     batch_size=128, 
                     epochs=10, 
                     validation_split=0.2,
                     shuffle=True
                    )

# evalute the model
test_loss, test_acc = model.evaluate(test_padseq, y_test, verbose=0)
print("test loss and accuracy:", test_loss, test_acc)




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
test loss and accuracy: 1.9307653903961182 0.49766144156455994


In [16]:
model_3 = Sequential()
model_3.add(Embedding(total_words, 100, input_length=maxlen))
model_3.add(Bidirectional(LSTM(128, dropout=0.1, recurrent_dropout=0.10, activation='tanh', return_sequences=True)))
model_3.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.20, activation='tanh', return_sequences=True)))
model_3.add(Bidirectional(SimpleRNN(64, dropout=0.2, recurrent_dropout=0.20, activation='tanh', return_sequences=True)))
model_3.add(Conv1D(72, 3, activation='relu'))
model_3.add(MaxPooling1D(2))
model_3.add(SimpleRNN(64, activation='tanh', dropout=0.2, recurrent_dropout=0.20, return_sequences=True))
model_3.add(GRU(64, recurrent_dropout=0.20, recurrent_regularizer='l1_l2'))
model_3.add(Dropout(0.2))
model_3.add(Dense(27, activation='sigmoid'))
model_3.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 150, 100)          15871500  
                                                                 
 bidirectional (Bidirectiona  (None, 150, 256)         234496    
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 150, 256)         394240    
 nal)                                                            
                                                                 
 bidirectional_2 (Bidirectio  (None, 150, 128)         41088     
 nal)                                                            
                                                                 
 conv1d (Conv1D)             (None, 148, 72)           27720     
                                                      

In [17]:
model_3.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy']
              )
# SETUP A EARLY STOPPING CALL and model check point API
earlystopping = keras.callbacks.EarlyStopping(monitor='accuracy',
                                              patience=5,
                                              verbose=1,
                                              mode='min'
                                              )
checkpointer = ModelCheckpoint(filepath='bestvalue1',moniter='val_loss', verbose=0, save_best_only=True)
callback_list = [checkpointer, earlystopping]

# fit model to the data
history3 = model_3.fit(train_padseq, y_train, 
                     batch_size=128, 
                     epochs=20, 
                     validation_split=0.2,
                     shuffle=True
                    )

# evalute the model
test_loss3, test_acc3 = model_3.evaluate(test_padseq, y_test, verbose=0)
print("test loss and accuracy:", test_loss3, test_acc3)

Epoch 1/20
  12/1048 [..............................] - ETA: 15:59:51 - loss: 9.5152 - accuracy: 0.1348

In [26]:
import pickle

In [None]:
#traininggggggggggg.....

In [27]:
filename = './models/model.sav'
pickle.dump(model, open(filename, 'wb'))

In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import normalize
import pandas as pd
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

In [8]:
import tensorflow as tf
model = tf.keras.models.load_model('./models/modelLastMayb.h5')

In [30]:
text_data="Google has announced the release of its new Pixel 7 tablet, which is set to hit the market in 2023. The tablet is designed to be super fast and responsive, with an adaptive battery and a range of helpful features. It also boasts Pixel's best photo and video capabilities, making it a great choice for anyone looking for a high-quality tablet. The Pixel 7 tablet was unveiled at Google's fall event, where the company also showed off its Pixel 7 and Pixel 7 Pro phones"

In [31]:
data=preprocess_text(text_data)

In [32]:
data = pd.Series(data)
data

0    google has announced the release of its new pi...
dtype: object

In [33]:
tokenizer = Tokenizer(num_words=10000, oov_token='<00V>') 
tokenizer.fit_on_texts(data)
train_seq = tokenizer.texts_to_sequences(data)
train_padseq = pad_sequences(train_seq, maxlen=130)
train_padseq

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  7,
        16, 17,  2, 18,  8,  9, 19,  3,  4, 20, 10, 21, 11, 22,  2, 23,
        24,  2,  4, 10, 25, 11, 26, 27, 28,  5, 29, 30, 31, 32, 33,  5,
         6, 34,  8, 35, 36, 12, 13, 37,  3, 14, 38, 39,  5, 40, 41, 42,
        12,  6, 43, 44, 15, 45, 46, 15,  6, 47, 48,  4,  2,  3,  4, 49,
        50, 51,  7, 14, 52, 53, 54,  2, 55, 13, 56, 57,  9,  3,  5,  3,
        58, 59]])

In [34]:
prediction=model.predict(train_padseq)



In [35]:
print(prediction)

[[3.2075366e-03 1.4590668e-03 2.5817575e-03 9.1638055e-04 6.9344585e-04
  1.4415936e-02 1.9356011e-03 4.0651558e-04 1.0345387e-03 6.3660160e-02
  1.4929048e-04 2.3221144e-01 7.4354769e-04 2.7101275e-02 1.4112805e-02
  3.5614066e-02 3.0220391e-02 5.5948342e-04 5.0797616e-04 3.7299382e-04
  1.6461028e-04 5.6560042e-05 4.3214939e-05 2.9646998e-04 1.9082578e-02
  3.6450870e-02 1.8990554e-03]]


In [36]:
import numpy as np
encoded_argmax  = np.argmax(prediction, axis=1)
# text = tokenizer.sequences_to_texts([encoded_argmax])

In [37]:
my_encoder = LabelEncoder()
y = my_encoder.fit_transform(final_df['category_merged'])

In [25]:
import pickle
with open('encoder.pkl', 'wb') as f:
    pickle.dump(my_encoder, f)

In [26]:
with open('encoder.pkl', 'rb') as f:
    encode = pickle.load(f)

In [38]:
text = my_encoder.inverse_transform(encoded_argmax)

In [39]:
print(text)

['IMPACT']
