In [32]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import keras
import pandas as pd

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, Dense, Dropout, Dense, LSTM
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.optimizers import Adam, SGD, RMSprop, Adagrad, Adadelta
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [12]:
df = pd.read_csv("dataset_balanced.csv",encoding="utf-8")
df.head()

Unnamed: 0,category,title
0,News,censor board denied certification 77 films 201516
1,News,swedish church tweets gif jesus craving tacos
2,News,50 indigo staffers go strike varanasi airport
3,News,flu curable cong leaders mental illness goyal
4,News,condom brand slammed design appearing encourag...


In [19]:
#things that have implemented to the dataset : lower(), remove punct, stopword removal...
# stopword removal have not imp. to class named Friends & Family.

def plot_graphs(history, metric):
  plt.plot(history.history[metric])
  plt.plot(history.history['val_'+metric], '')
  plt.xlabel("Epochs")
  plt.ylabel(metric)
  plt.legend([metric, 'val_'+metric])

In [14]:
X = df["title"].astype(str).tolist()
y = df["category"].tolist()

In [15]:
lencoder = LabelEncoder()
y = lencoder.fit_transform(y)

print(lencoder.classes_)

['Advertisement' 'Family & Friends' 'News' 'Work']


In [17]:
for i, category in enumerate(lencoder.classes_):
    print(f"{category} -> {i}")

Advertisement -> 0
Family & Friends -> 1
News -> 2
Work -> 3


In [21]:
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(X)

X_seq = tokenizer.texts_to_sequences(X)
X_padd = pad_sequences(X_seq, maxlen=45, padding="post",truncating="post")

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X_padd, y, test_size=0.2, random_state=10,shuffle=True)

In [23]:
import numpy as np
import pandas as pd
from collections import Counter

train_counts = Counter(y_train)
test_counts = Counter(y_test)

train_df = pd.DataFrame.from_dict(train_counts, orient='index', columns=['Train Count'])
test_df = pd.DataFrame.from_dict(test_counts, orient='index', columns=['Test Count'])

category_distribution = pd.concat([train_df, test_df], axis=1).fillna(0)
category_distribution.index.name = "Category"

print(category_distribution)


          Train Count  Test Count
Category                         
1               58133       14459
0               56878       14145
3               46605       11654
2               55883       14117


In [31]:
model = Sequential([
    Embedding(input_dim=10000, output_dim=64, input_length=45),
    Bidirectional(LSTM(128,  return_sequences=True)),
    Bidirectional(LSTM(64)),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(len(set(y)),activation="softmax")
])


In [33]:
model.compile(loss="CategoricalCrossentropy", optimizer=Adam(1e-4), metrics=['accuracy'])

In [None]:
history = model.fit(X_train, 
                    epochs=128,
                    validation_data=X_test,
                    batch_size = 32,
                    validation_steps=30)