In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Dense, Dropout, GlobalMaxPooling1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed

%run general_preprocessing.ipynb
# Assuming your dataset has been processed
# data_set = process_data(data_set)  # As you already did this
data_set = pd.read_csv('F:/MY_Projects/Deep_Learning_project/data/train.csv')
label_mapping = {
    "Politics": 0,
    "Sports": 1,
    "Media": 2,
    "Market & Economy": 3,
    "STEM": 4
}

# Apply the mapping to the label column
data_set['Category'] = data_set['Category'].map(label_mapping)
data_set = process_data(data_set)

X = data_set['Discussion']
y = data_set['Category']

# Step 1: Split the data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Tokenize the text
tokenizer = Tokenizer(num_words=5000)  # Set the vocabulary size to 5000 words
tokenizer.fit_on_texts(X_train)

# Convert text to sequences of integers
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Step 3: Pad the sequences to ensure uniform length
max_len = 100  # You can adjust this based on the average length of your text
X_train_pad = pad_sequences(X_train_seq, padding='post', maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, padding='post', maxlen=max_len)

# Step 4: Build the TextCNN model
model = Sequential()

# Embedding Layer: Converts words into word vectors
model.add(Embedding(input_dim=5000, output_dim=128, input_length=max_len))

# Convolutional Layer: Filters and captures local dependencies in the text
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))

# MaxPooling Layer: Reduces the dimensionality by taking the max value over a pool of words
model.add(MaxPooling1D(pool_size=2))

# Add a second convolutional layer
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=2))

# Global MaxPooling: Extracts the most important feature from the entire sequence
model.add(GlobalMaxPooling1D())

# Dense Layer: Fully connected layer for classification
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))  # Dropout to prevent overfitting
model.add(Dense(5, activation='softmax'))  # 5 categories

# Step 5: Compile the model
model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Step 6: Train the model
history = model.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_data=(X_test_pad, y_test))

# Step 7: Evaluate the model
test_loss, test_acc = model.evaluate(X_test_pad, y_test)
print(f"Test Accuracy: {test_acc:.4f}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_set['Discussion'] = lst


Epoch 1/5




[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 25ms/step - accuracy: 0.3875 - loss: 1.3612 - val_accuracy: 0.6475 - val_loss: 0.9115
Epoch 2/5
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 25ms/step - accuracy: 0.7199 - loss: 0.7705 - val_accuracy: 0.6744 - val_loss: 0.8682
Epoch 3/5
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 23ms/step - accuracy: 0.7846 - loss: 0.6032 - val_accuracy: 0.6667 - val_loss: 0.9457
Epoch 4/5
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 23ms/step - accuracy: 0.8323 - loss: 0.4582 - val_accuracy: 0.6513 - val_loss: 1.1082
Epoch 5/5
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 23ms/step - accuracy: 0.8729 - loss: 0.3472 - val_accuracy: 0.6434 - val_loss: 1.3571
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.6481 - loss: 1.3546
Test Accuracy: 0.6434


In [2]:
def predict_and_save_as_numbers(model, tokenizer, input_csv, output_csv,
                                max_len=100, truncating='pre'):
    """
    Predict categories for discussions in a CSV file and save SampleID with numeric labels.

    The text goes through the same steps as at training time: process_data(),
    tokenizer.texts_to_sequences(), then pad_sequences(). Padding and
    truncation must match the settings the model was trained with, otherwise
    long discussions are presented to the model as a different token window
    than the one it learned from.

    Parameters:
    - model: Trained TextCNN model.
    - tokenizer: Tokenizer used during training.
    - input_csv: Path to the input CSV file containing 'SampleID' and 'Discussion' columns.
    - output_csv: Path to save the output CSV file with SampleID and predicted numeric labels.
    - max_len: Sequence length the model was trained with (default 100).
    - truncating: 'pre' or 'post'; which end of an over-long sequence is cut.
      Defaults to 'pre', the pad_sequences default that the training cell
      relies on.

    Returns:
    - None. The predictions are written to output_csv and a confirmation
      line is printed.

    Raises:
    - KeyError: if the input CSV lacks the 'SampleID' or 'Discussion' column.
    - ValueError: if the number of SampleIDs does not match the number of
      preprocessed discussions.
    """
    # Step 1: Load the input CSV file and make sure the needed columns exist
    input_data = pd.read_csv(input_csv)
    missing_columns = {'SampleID', 'Discussion'} - set(input_data.columns)
    if missing_columns:
        raise KeyError(f"{input_csv} is missing required column(s): {sorted(missing_columns)}")

    # Step 2: Preprocess the Discussion column
    # process_data() is defined outside this cell; it may drop or reorder
    # rows, so the ids are read from its result to stay aligned with the text.
    processed = process_data(input_data)
    if 'SampleID' in processed.columns:
        sample_ids = processed['SampleID'].to_numpy()
    else:
        # Fallback for a helper that strips the id column; the length check
        # below still catches any row-count mismatch.
        sample_ids = input_data['SampleID'].to_numpy()
    discussions = processed['Discussion'].values

    if len(sample_ids) != len(discussions):
        raise ValueError(
            f"Row mismatch after preprocessing: {len(sample_ids)} SampleIDs "
            f"vs {len(discussions)} discussions"
        )

    sequences = tokenizer.texts_to_sequences(discussions)
    padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post', truncating=truncating)

    # Step 3: Predict labels using the model
    predictions = model.predict(padded_sequences)
    predicted_labels = predictions.argmax(axis=1)  # index of the highest-scoring class per row

    # Step 4: Create a new DataFrame with SampleID and predicted labels
    output_data = pd.DataFrame({
        'SampleID': sample_ids,
        'Category': predicted_labels
    })

    # Step 5: Save the output DataFrame to a new CSV file
    output_data.to_csv(output_csv, index=False)
    print(f"Predictions saved to {output_csv}")


In [3]:
# Where the unlabeled test discussions are read from, and where the
# SampleID -> predicted Category table is written.
input_csv = "F:/MY_Projects/Deep_Learning_project/data/test.csv"
output_csv = "F:/MY_Projects/Deep_Learning_project/data/TextCNN_predictions.csv"

# Score the test file with the model and tokenizer built in the training cell.
predict_and_save_as_numbers(
    model=model,
    tokenizer=tokenizer,
    input_csv=input_csv,
    output_csv=output_csv,
)

[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step
Predictions saved to F:/MY_Projects/Deep_Learning_project/data/TextCNN_predictions.csv
