In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the training split. The file is ';'-separated and column names are
# supplied here, so the first line of the file is read as data.
df = pd.read_csv("train.txt", sep=';', names=['sentence', 'label'])
df.head()

In [None]:
# Number of training rows per label, most frequent first.
df.value_counts(subset='label')

In [None]:
from sklearn.preprocessing import LabelEncoder
# Learn the label -> integer id mapping from the training labels, then replace
# the text labels in place with their ids. `le` stays in scope so that other
# splits can be encoded with exactly the same mapping.
# NOTE: this overwrites df['label']; re-running the cell would re-fit the
# encoder on the already-encoded integers.
le = LabelEncoder().fit(df['label'])
df['label'] = le.transform(df['label'])
df.head()

In [None]:
# Per-class weights computed with scikit-learn's 'balanced' mode, to be passed
# to Keras as `class_weight` during training.

from sklearn.utils import class_weight

class_weights = class_weight.compute_class_weight(
    'balanced', classes=np.unique(df['label']), y=df['label']
)

# Keys are positions in the sorted unique-label array returned by np.unique.
class_weights_dict = {position: weight for position, weight in enumerate(class_weights)}
print(class_weights_dict)


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical

In [None]:
# Build the word index from the training sentences, then convert each sentence
# into a list of integer word ids. num_words=10000 limits which word ids are
# emitted by texts_to_sequences (based on word frequency).
tokanizer = Tokenizer(num_words=10000)
tokanizer.fit_on_texts(texts=df['sentence'])
sequences = tokanizer.texts_to_sequences(texts=df['sentence'])


In [None]:
# Number of distinct words the tokenizer counted while fitting on the training sentences.
len(tokanizer.word_counts)

In [None]:
# Bring every id sequence to exactly 100 entries: shorter ones are zero-padded
# at the end, longer ones are cut from the front (Keras' default truncation,
# spelled out here). Labels are one-hot encoded for categorical cross-entropy.
tr_df = pad_sequences(sequences, maxlen=100, padding='post', truncating='pre')
tr_y = to_categorical(df['label'])

In [None]:
# Model hyper-parameters.

# Rows needed in the embedding table. Index 0 is reserved for padding, hence
# the +1. texts_to_sequences never emits an id >= num_words, so when the
# tokenizer has a num_words cap the table does not need to be larger than that.
vocab_size = len(tokanizer.word_index) + 1
if tokanizer.num_words:
    vocab_size = min(vocab_size, tokanizer.num_words)

embedding_dim = 128
max_length = 100

# Derived from the fitted label encoder instead of being hard-coded, so the
# output layer always matches the number of labels actually present.
num_classes = len(le.classes_)

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout,Bidirectional,GlobalAveragePooling1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Stacked bidirectional-LSTM classifier:
#   embedding -> BiLSTM(128) -> dropout -> BiLSTM(64) -> average over time
#   -> dense(64) -> dropout -> softmax over the classes.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Dropout(0.2))
# The second recurrent layer must also return the full sequence:
# GlobalAveragePooling1D averages over the time axis and only accepts 3-D
# (batch, steps, features) input. Without return_sequences=True the LSTM
# emits a 2-D tensor and the pooling layer rejects it.
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(GlobalAveragePooling1D())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(num_classes, activation='softmax'))

In [None]:
# Bidirectional-LSTM text classifier, declared as a single layer list.
# Both recurrent layers return full sequences so that the pooling layer
# receives one feature vector per time step to average.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.2),
    Bidirectional(LSTM(64, return_sequences=True)),
    GlobalAveragePooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(num_classes, activation='softmax'),
])

In [None]:
# Compile with categorical cross-entropy (the targets are one-hot vectors)
# and train using the per-class weights computed earlier.

model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Stop when validation loss has not improved for 3 consecutive epochs and
# restore the weights from the best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

history = model.fit(
    tr_df,
    tr_y,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
    class_weight=class_weights_dict,
    callbacks=[early_stopping],
)


In [None]:
# Evaluate the trained model on the held-out test split (test.txt).

from sklearn.metrics import accuracy_score

df_test = pd.read_csv("test.txt", delimiter=';', names=['sentence', 'label'])

# Encode with the encoder and tokenizer fitted on the training data so that
# label ids and word ids mean the same thing as during training.
df_test['label'] = le.transform(df_test['label'])
test_sequences = tokanizer.texts_to_sequences(df_test['sentence'])

# Pad to the same length the model was built with.
test_df = pad_sequences(test_sequences, maxlen=max_length, padding='post')

# Pin the one-hot width to the model's output size. Without num_classes the
# width is inferred from the largest label id in this split, which would be
# too narrow if the split happens not to contain the highest-numbered class.
test_y = to_categorical(df_test['label'], num_classes=num_classes)

# Accuracy as reported by Keras.
loss, accuracy = model.evaluate(test_df, test_y)
print(f"Test Accuracy: {accuracy}")

# Accuracy recomputed from the predicted class ids with scikit-learn.
y_pred = model.predict(test_df)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true_classes = np.argmax(test_y, axis=1)

accuracy = accuracy_score(y_true_classes, y_pred_classes)
print(f"Test Accuracy: {accuracy}")


In [None]:
# Evaluate the trained model on the validation split (val.txt).
# NOTE: the df_test / test_* variable names are kept for compatibility, but
# from this cell on they hold the validation data, not the test data.

from sklearn.metrics import accuracy_score

df_test = pd.read_csv("val.txt", delimiter=';', names=['sentence', 'label'])

# Encode with the encoder and tokenizer fitted on the training data so that
# label ids and word ids mean the same thing as during training.
df_test['label'] = le.transform(df_test['label'])
test_sequences = tokanizer.texts_to_sequences(df_test['sentence'])

# Pad to the same length the model was built with.
test_df = pad_sequences(test_sequences, maxlen=max_length, padding='post')

# Pin the one-hot width to the model's output size. Without num_classes the
# width is inferred from the largest label id in this split, which would be
# too narrow if the split happens not to contain the highest-numbered class.
test_y = to_categorical(df_test['label'], num_classes=num_classes)

# Accuracy as reported by Keras. The message names the split actually
# evaluated; previously it said "Test" for validation data.
loss, accuracy = model.evaluate(test_df, test_y)
print(f"Validation Accuracy: {accuracy}")

# Accuracy recomputed from the predicted class ids with scikit-learn.
y_pred = model.predict(test_df)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true_classes = np.argmax(test_y, axis=1)

accuracy = accuracy_score(y_true_classes, y_pred_classes)
print(f"Validation Accuracy: {accuracy}")


In [None]:
# Persist the trained model to an HDF5 file in the current working directory.
model.save(filepath='my_model.h5')


In [None]:
# Offer the saved model file as a browser download (Colab only), then load it
# back from disk to confirm that the saved file is usable.
from tensorflow.keras.models import load_model

# google.colab exists only inside a Colab runtime. Importing it
# unconditionally would abort this cell everywhere else and the model would
# never be reloaded.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download('my_model.h5')
else:
    print("google.colab is not available; skipping browser download of my_model.h5")

# Load the model
loaded_model = load_model('my_model.h5')
