In [None]:
pip install nltk



In [None]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

In [None]:
# Fetch the NLTK resources used by the preprocessing cell:
#   - 'stopwords' for stopwords.words(...)
#   - 'punkt' / 'punkt_tab' for word_tokenize (recent NLTK releases load 'punkt_tab';
#     older releases load 'punkt' and simply report 'punkt_tab' as unavailable).
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Read the training CSV into a DataFrame (the cells below use its 'comment' column).
training_set = pd.read_csv(filepath_or_buffer='balanced_train.csv')

In [None]:
# NOTE(review): `combined_stopwords` was used below but never defined in any cell of this
# notebook, so the cell failed with NameError on a fresh kernel. NLTK's English list is used
# as the baseline; the name suggests further lists were meant to be merged in - add them here.
combined_stopwords = set(stopwords.words('english'))

# Translation table that deletes every ASCII punctuation character; built once, not per comment.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def filtered_cmnt(comment):
    """Strip punctuation, stop words and non-alphabetic tokens from one comment.

    Args:
        comment: Raw comment text. Non-string values (e.g. NaN produced by
            ``pd.read_csv`` for empty cells) are treated as empty text.

    Returns:
        The surviving tokens joined by single spaces; original casing is kept.
    """
    if not isinstance(comment, str):
        return ''
    comment = comment.translate(_PUNCTUATION_TABLE)
    words = word_tokenize(comment)
    filtered_words = [
        word for word in words
        if word.isalpha() and word.lower() not in combined_stopwords
    ]
    return ' '.join(filtered_words)


# Apply preprocessing to the comments
training_set['filtered-comment'] = training_set['comment'].apply(filtered_cmnt)


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# pip install sentencepiece

In [None]:
import sentencepiece as spm

# Fit a SentencePiece model with a 32000-piece vocabulary on 'data.txt'; the 'm' prefix
# yields the 'm.model' file that is loaded further down.
# NOTE(review): no visible cell creates 'data.txt' - it has to exist before this cell runs.
spm.SentencePieceTrainer.Train(
    input='data.txt',
    model_prefix='m',
    vocab_size=32000,
)

In [None]:
import pandas as pd
import sentencepiece as spm

In [None]:

# Restore the tokenizer written by the SentencePiece training cell.
sp = spm.SentencePieceProcessor(model_file='m.model')

# Read the training data again; this frame (`df`) is the one the model pipeline uses.
df = pd.read_csv(filepath_or_buffer='balanced_train.csv')

# Raw comment text, one entry per row.
comments = df.loc[:, 'comment']

def tokenize_sentence(sentence):
    """Split one sentence into SentencePiece subword pieces using the global `sp` processor."""
    pieces = sp.encode(sentence, out_type=str)
    return pieces

# Store the subword pieces of every comment in a new column.
df['tokenized_comment'] = comments.map(tokenize_sentence)

# Show the first rows of the raw text next to its pieces.
print(df.head()[['comment', 'tokenized_comment']])

# Persist the frame, including the new column, without the index.
df.to_csv('tokenized_comments.csv', index=False)


                                             comment  \
0                                        jai mata de   
1  that was a very helpful lecture your teaching ...   
2       sir one quistion please of motion in a plane   
3  euglena and chlamydomonas like organisms give ...   
4                         last wala answer galat hai   

                                   tokenized_comment  
0                                 [▁jai, ▁mata, ▁de]  
1  [▁that, ▁was, ▁a, ▁very, ▁helpful, ▁lecture, ▁...  
2  [▁sir, ▁one, ▁qui, stion, ▁please, ▁of, ▁motio...  
3  [▁euglen, a, ▁and, ▁chlamydomonas, ▁like, ▁org...  
4              [▁last, ▁wala, ▁answer, ▁galat, ▁hai]  


In [None]:
from keras.preprocessing.sequence import pad_sequences

In [None]:
def tokenize_and_pad(comments, sp, max_len=30):
    """Encode raw comment strings as SentencePiece ids and pad them to a fixed length.

    Args:
        comments: Iterable of raw comment strings (not pre-tokenized pieces).
        sp: SentencePiece processor used for encoding.
        max_len: Length every sequence is padded (with trailing zeros) or cut to.

    Returns:
        A 2-D NumPy array with one row of `max_len` token ids per comment.
    """
    id_sequences = list(map(sp.encode_as_ids, comments))
    fixed_length = pad_sequences(id_sequences, maxlen=max_len, padding='post')
    return np.array(fixed_length)

# Build the feature matrix from the raw comment column
X = tokenize_and_pad(comments=df['comment'], sp=sp)


In [None]:
# Encode the string labels as integers. 'irrelevant' is -1 at this stage; it is
# remapped to 2 for the loss function in a later cell.
label_mapping = {
    'doubt': 0,
    'feedback': 1,
    'irrelevant': -1
}

# Encode only while the column still holds the raw strings. The result is written back
# into the same column, so without this guard a second run of the cell would look up
# the already-numeric values in the mapping and turn every label into NaN.
if not pd.api.types.is_numeric_dtype(df['label']):
    df['label'] = df['label'].map(label_mapping)

# Labels missing from the mapping become NaN; report them instead of hiding them.
if df['label'].isnull().any():
    print("Warning: Some labels could not be mapped. Please check your dataset.")
    print(df[df['label'].isnull()])


In [None]:
# Target vector: the integer-encoded labels as a NumPy array
y = df['label'].to_numpy()

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Dimensions of the padded feature matrix: (number of comments, padded sequence length).
X.shape

(165726, 30)

In [None]:

# Check the shape of X_train and y_train
print("Shape of X_train:", X_train.shape)  # Should be (num_train_samples, 30): tokenize_and_pad pads to its default max_len=30
print("Shape of y_train:", y_train.shape)   # Should be (num_train_samples,)

# Display sample data from X_train and y_train
print("Sample from X_train:", X_train[0])   # Display the first training sample (token ids followed by zero padding)
print("Corresponding label in y_train:", y_train[0])  # Display the corresponding label (0, 1 or -1 at this stage)


Shape of X_train: (132580, 30)
Shape of y_train: (132580,)
Sample from X_train: [   44  1076    65 17352  2100   136  1207   756   363    24     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0]
Corresponding label in y_train: -1


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Flatten
from tensorflow.keras.utils import to_categorical

In [None]:
embedding_dim = 128                # width of each learned token embedding
vocab_size = sp.get_piece_size()   # number of pieces in the loaded SentencePiece model

In [None]:
# sparse_categorical_crossentropy needs non-negative class ids, so the -1 id
# is rewritten as 2 on a copy (y_train itself stays untouched).
y_train_adjusted = y_train.copy()
y_train_adjusted[y_train_adjusted == -1] = 2

In [None]:
# Build the classifier: token ids -> learned embeddings -> LSTM -> dropout -> softmax.
model = Sequential()
# No `input_length` is passed. The sequences given to fit() are padded to 30 tokens
# (see X.shape), so the previously hard-coded 50 did not describe the real input, and
# Keras 3 deprecates the argument. These layers accept whatever length is supplied.
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))
model.add(LSTM(128, return_sequences=False))  # keep only the final hidden state
model.add(Dropout(0.5))                       # regularization before the output layer
model.add(Dense(3, activation='softmax'))     # one probability per class id (0, 1, 2)

# Targets are integer class ids, hence the sparse categorical loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])




In [None]:
# Train for 4 epochs in batches of 64; the last 20% of the training rows are
# held back by Keras for per-epoch validation.
history = model.fit(
    x=X_train,
    y=y_train_adjusted,
    batch_size=64,
    epochs=4,
    validation_split=0.2,
)


Epoch 1/4
[1m1658/1658[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m225s[0m 133ms/step - accuracy: 0.5842 - loss: 0.8605 - val_accuracy: 0.6953 - val_loss: 0.7006
Epoch 2/4
[1m1658/1658[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m269s[0m 138ms/step - accuracy: 0.7271 - loss: 0.6459 - val_accuracy: 0.6919 - val_loss: 0.7126
Epoch 3/4
[1m1658/1658[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m252s[0m 132ms/step - accuracy: 0.7588 - loss: 0.5774 - val_accuracy: 0.6920 - val_loss: 0.7121
Epoch 4/4
[1m1658/1658[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m259s[0m 130ms/step - accuracy: 0.7777 - loss: 0.5207 - val_accuracy: 0.6879 - val_loss: 0.7684


In [None]:
import pandas as pd
import numpy as np
from keras.preprocessing.sequence import pad_sequences

# Inverse of the training-time encoding: class index -> label string.
# Kept under its own name so the str -> int `label_mapping` used for training
# is not replaced by a dictionary of a different shape.
id_to_label = {
    0: 'doubt',
    1: 'feedback',
    2: 'irrelevant'  # -1 was rewritten as 2 before training
}

# Load the unlabeled comments to predict on
test_df = pd.read_csv('test.csv')  # Replace with your actual test file path

# Encode every comment as SentencePiece ids with the same processor used for training
test_comments = test_df['comment']
test_df['tokenized_comment'] = test_comments.apply(sp.encode_as_ids)

# Pad to the exact sequence length the model was trained on. The previous hard-coded 50
# disagreed with the 30-token training inputs; because padding is not masked, the extra
# trailing zero steps would alter the LSTM's final state and therefore the predictions.
max_length = X_train.shape[1]
# Stored as X_submission so the held-out X_test from train_test_split (paired with
# y_test) is not overwritten.
X_submission = pad_sequences(test_df['tokenized_comment'].tolist(), maxlen=max_length, padding='post')

# Despite the name, this holds the per-class softmax probabilities, one row per comment
predicted_labels = model.predict(X_submission)

# Most probable class index (0, 1, or 2) for every comment
predicted_indices = np.argmax(predicted_labels, axis=1)

# Convert numerical predictions back to string labels
predicted_strings = [id_to_label[index] for index in predicted_indices]

# Prepare the DataFrame to save predictions
output_df = pd.DataFrame({
    'id': test_df['id'],  # Assuming 'id' is a column in your test set
    'label': predicted_strings
})

# Save the predictions to a CSV file
output_df.to_csv('thakgayahoon02.csv', index=False)

# Show the first few predictions
print(output_df.head())


[1m2863/2863[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 34ms/step
   id       label
0   0  irrelevant
1   1  irrelevant
2   2    feedback
3   3    feedback
4   4       doubt
