In [12]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import keras
import pandas as pd

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, Dense, Dropout, Dense, LSTM
from tensorflow.keras.losses import CategoricalCrossentropy, SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam, SGD, RMSprop, Adagrad, Adadelta
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import regularizers

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the title/category dataset and preview the first rows.
# The first CSV column carries no header and holds 0, 1, 2, ... (it is displayed as
# "Unnamed: 0"), i.e. it looks like a saved row index. Reading it as the index keeps
# it from appearing as a spurious data column.
df = pd.read_csv("last.csv", encoding="utf-8", index_col=0)
df.head()

Unnamed: 0,category,title
0,Advertisement,poster quot literari quot art print minimalist...
1,Advertisement,gtse 100 brown plastic masonri wall plug 70mm ...
2,Advertisement,barn swallow graphit print basebal cap men wom...
3,Advertisement,women fromi flipflop
4,Advertisement,320 pc heat shrink spade connector femal male ...


In [17]:
# Preprocessing already applied to the dataset: lower(), punctuation removal, stopword removal...
# Stopword removal has not been applied to the class named "Friends & Family".

def plot_graphs(history, metric):
    """Draw the training and validation curves of one metric on the current figure.

    Args:
        history: object whose ``history`` attribute maps a metric name to its
            sequence of recorded values (for example the value returned by
            ``model.fit``).
        metric: name of the training metric, e.g. ``"accuracy"``; the validation
            series is looked up under ``"val_" + metric``.

    Raises:
        KeyError: if either series is missing from ``history.history``.
    """
    val_metric = 'val_' + metric
    recorded = history.history

    plt.plot(recorded[metric])
    plt.plot(recorded[val_metric], '')
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, val_metric])

In [3]:
# Extract the raw texts (X) and their category names (y).
# Rows with a missing title or category are dropped first: `astype(str)` would
# otherwise turn a missing title into the literal text "nan", which would then be
# tokenized and trained on like any ordinary word.
labelled = df.dropna(subset=["title", "category"])
X = labelled["title"].astype(str).tolist()
y = labelled["category"].tolist()

In [4]:
# Encode the category names as integer class ids.
# NOTE(review): `y` is overwritten with its encoded form, so re-running this cell
# without re-running the cell that builds `y` would fit the encoder on integers.
lencoder = LabelEncoder()
y = lencoder.fit(y).transform(y)

# Category names in the order of their integer codes
print(lencoder.classes_)

['Advertisement' 'News' 'Work']


In [5]:
# Print which integer code corresponds to each category name.
for code, name in enumerate(lencoder.classes_):
    print(f"{name} -> {code}")

Advertisement -> 0
News -> 1
Work -> 2


In [6]:
# Word-level tokenizer: `num_words` caps the vocabulary used when encoding and
# words outside it are encoded as the "<OOV>" token.
# NOTE(review): the tokenizer is fitted on every text in X, i.e. before the
# train/test split performed later in the notebook, so held-out titles contribute
# to the vocabulary.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=10000)
tokenizer.fit_on_texts(X)

# Integer-encode each title, then pad/truncate at the end to a fixed length of 45.
X_seq = tokenizer.texts_to_sequences(X)
X_padd = pad_sequences(
    X_seq,
    maxlen=45,
    padding="post",
    truncating="post",
)

In [7]:
# Hold out 20% of the samples for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X_padd, y, test_size=0.2, random_state=10, shuffle=True
)

# Drop samples whose feature row contains NaN. The same row mask is applied to the
# labels: filtering only the features would leave X and y with different lengths
# and pair the remaining rows with the wrong labels.
train_keep = ~np.isnan(X_train).any(axis=1)
test_keep = ~np.isnan(X_test).any(axis=1)
X_train, y_train = X_train[train_keep], np.asarray(y_train)[train_keep]
X_test, y_test = X_test[test_keep], np.asarray(y_test)[test_keep]


In [8]:
from collections import Counter

import numpy as np
import pandas as pd

# Number of samples per encoded class in each split
train_counts, test_counts = Counter(y_train), Counter(y_test)

train_df = pd.DataFrame.from_dict(train_counts, orient='index', columns=['Train Count'])
test_df = pd.DataFrame.from_dict(test_counts, orient='index', columns=['Test Count'])

# Side-by-side table; a class absent from one split gets a count of 0
category_distribution = (
    pd.concat([train_df, test_df], axis=1)
    .fillna(0)
    .rename_axis("Category")
)

print(category_distribution)


          Train Count  Test Count
Category                         
0               39922        9812
2               39691       10043
1               39748        9986


In [26]:
# Text classifier: embedding -> two stacked bidirectional LSTMs -> dense head.
# The input shape is declared with an explicit Input instead of the Embedding
# argument `input_length`, which is deprecated in Keras 3 and only triggers a warning.
model = Sequential([
    # One sample is a sequence of 45 integer token ids
    tf.keras.Input(shape=(45,), dtype="int32"),
    Embedding(input_dim=10000, output_dim=64),
    # First recurrent layer returns the full sequence so it can feed the second LSTM
    Bidirectional(LSTM(128, return_sequences=True, kernel_regularizer=regularizers.l2(0.001))),
    Dropout(0.5),
    Bidirectional(LSTM(64, kernel_regularizer=regularizers.l2(0.001))),
    Dense(32, activation='relu'),
    Dropout(0.5),
    # One softmax output unit per distinct encoded label
    Dense(len(set(y)), activation="softmax")
])


In [27]:
# Integer class ids as targets -> sparse categorical cross-entropy; Adam with a
# learning rate of 1e-5.
model.compile(
    optimizer=Adam(learning_rate=1e-5),
    loss="SparseCategoricalCrossentropy",
    metrics=['accuracy'],
)

In [28]:
# Train for 15 epochs in mini-batches of 32.
# NOTE(review): the held-out test split is also what is monitored as validation
# data during training.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=15,
    validation_data=(X_test, y_test),
    verbose=1,
)

Epoch 1/15
[1m3731/3731[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 31ms/step - accuracy: 0.5196 - loss: 1.6121 - val_accuracy: 0.7667 - val_loss: 0.9938
Epoch 2/15
[1m3731/3731[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m118s[0m 32ms/step - accuracy: 0.7929 - loss: 0.8986 - val_accuracy: 0.9048 - val_loss: 0.5291
Epoch 3/15
[1m3731/3731[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m117s[0m 31ms/step - accuracy: 0.8985 - loss: 0.5623 - val_accuracy: 0.9264 - val_loss: 0.4264
Epoch 4/15
[1m3731/3731[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m119s[0m 32ms/step - accuracy: 0.9220 - loss: 0.4597 - val_accuracy: 0.9383 - val_loss: 0.3652
Epoch 5/15
[1m3731/3731[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m118s[0m 32ms/step - accuracy: 0.9363 - loss: 0.3945 - val_accuracy: 0.9476 - val_loss: 0.3270
Epoch 6/15
[1m3731/3731[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m118s[0m 32ms/step - accuracy: 0.9463 - loss: 0.3470 - val_accuracy: 0.9539 - val_loss: 0.293

In [29]:
from keras.saving import save_model

# Persist the trained model to a single .keras file
save_model(model, 'doksanyedi-ondokuz.keras')

In [30]:
from tensorflow.keras.models import load_model

# Reload the saved model from disk under a separate name
model1 = load_model(filepath="doksanyedi-ondokuz.keras")

In [50]:
import string

from nltk.corpus import stopwords

# English stop words as a set for fast membership tests
stop_words = set(stopwords.words('english'))
# Translation table that deletes every character listed in string.punctuation
translator = str.maketrans('', '', string.punctuation)

def remove_stopwords(text):
    """Normalize ``text`` and drop its stop words.

    The text is lowercased, passed through the module-level ``translator`` table,
    split on whitespace, and every token found in the module-level ``stop_words``
    collection is discarded.

    Args:
        text: the raw input string.

    Returns:
        The remaining tokens joined by single spaces (empty string if none remain).
    """
    normalized = text.lower().translate(translator)
    return " ".join(token for token in normalized.split() if token not in stop_words)


def process_input_text(text):
    """Turn one raw text into the padded integer sequence fed to the model.

    The text is cleaned with ``remove_stopwords``, encoded with the module-level
    ``tokenizer``, and padded/truncated at the end to 45 positions.

    NOTE(review): titles in the dataset preview look stemmed (e.g. "literari",
    "masonri"), while no stemming happens here -- confirm that inference
    preprocessing matches what was applied to the training data.

    Args:
        text: the raw input string.

    Returns:
        The result of ``pad_sequences`` for a batch containing this single text.
    """
    cleaned = remove_stopwords(text)
    return pad_sequences(
        tokenizer.texts_to_sequences([cleaned]),
        maxlen=45,
        padding="post",
        truncating="post",
    )

# Sample texts to classify
input_texts = [
    "Google's artificial intelligence course is available with a %50 discount.",
    "The oldest living fish species has been discovered",
    "send me the Java files - lead",
]

# Preprocess every text and stack the padded rows into one batch, so the model is
# called once for all samples instead of once per sample inside a Python loop.
# NOTE(review): predictions come from the in-memory `model`; the reloaded `model1`
# is not used here.
processed_inputs = np.vstack([process_input_text(input_text) for input_text in input_texts])
predictions = model.predict(processed_inputs)

# One row of class probabilities per input text; report the most likely class id.
for prediction in predictions:
    predicted_category = prediction.argmax()
    print(f"Tahmin edilen kategori: {predicted_category}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Tahmin edilen kategori: 0
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
Tahmin edilen kategori: 1
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
Tahmin edilen kategori: 2
