#**Task 1: Data Exploration and Preprocessing**

**Step 1.1: Load the Dataset**

In [15]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

data = pd.read_csv("/content/Document_Classifier_Dataset(Document_Classifier_Dataset).csv")

# Display the first few rows to understand the structure
print(data.head())
print(data.info())

   ID                                               Text    Category
0   1  Abstract: Wish up music want go prove happy. V...  Scientific
1   2  This agreement is made between Grimes, Kaufman...       Legal
2   3  I recently purchased land and it exceeded my e...  E-commerce
3   4  This agreement is made between Brown PLC and V...       Legal
4   5  I recently purchased wrong and it was disappoi...  E-commerce
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   ID        3000 non-null   int64 
 1   Text      3000 non-null   object
 2   Category  3000 non-null   object
dtypes: int64(1), object(2)
memory usage: 70.4+ KB
None


**Step 1.2: Understand the Dataset**
From the information given:

The dataset contains three columns: ID, Text, and Category.

Text is the feature column containing the document content.

Category is the label column with predefined categories (e.g., News Articles, Scientific Papers, Legal Documents, etc.).

**Step 1.3: Text Preprocessing**

Tokenization: Splitting text into individual words.

Stop-word Removal: Removing common words like “the,” “is,” etc.

Lemmatization: Reducing words to their base forms (e.g., “running” → “run”).

Vectorization: Converting text into numerical format using TF-IDF.

In [16]:
# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize preprocessing tools
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Preprocessing function
def preprocess_text(text):
    # Remove non-alphabetic characters and convert text to lowercase
    text = re.sub(r'\W', ' ', text.lower())
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text)
    # Tokenize, remove stop words, and lemmatize
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])
    return text

# Apply preprocessing to the Text column
data['Processed_Text'] = data['Text'].apply(preprocess_text)

# Display a sample of the preprocessed text
print("Sample Preprocessed Text:", data['Processed_Text'].iloc[0])


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Sample Preprocessed Text: abstract wish music want go prove happy various information begin wear decision speech attention plan building mission building collection speak difference worry approach source rock business side green structure section wish network remember material none particularly keywords thus join fish scene national


**Step 1.4: Vectorization**

Use TF-IDF to convert text into numerical features.

In [17]:
# Vectorize the preprocessed text using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)  # Limit to top 5000 features
X = vectorizer.fit_transform(data['Processed_Text']).toarray()

# Encode the labels
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y = encoder.fit_transform(data['Category'])

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training data shape:", X_train.shape)
print("Testing data shape:", X_test.shape)


Training data shape: (2400, 1685)
Testing data shape: (600, 1685)


#**Task 2: Model Development**

**Step 2.1: Train Logistic Regression**

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Train Logistic Regression model
lr_model = LogisticRegression(max_iter=100)
lr_model.fit(X_train, y_train)

# Evaluate Logistic Regression
y_pred_lr = lr_model.predict(X_test)
print("Logistic Regression Evaluation:\n", classification_report(y_test, y_pred_lr))


Logistic Regression Evaluation:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       118
           1       1.00      1.00      1.00       119
           2       1.00      1.00      1.00       131
           3       1.00      0.98      0.99       131
           4       1.00      1.00      1.00       101

    accuracy                           0.99       600
   macro avg       1.00      1.00      1.00       600
weighted avg       1.00      0.99      1.00       600



**Step 2.2: Train LSTM Model**


In [20]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
import numpy as np

# Parameters for LSTM
vocab_size = 5000  # Number of unique tokens in the vocabulary
embed_dim = 128    # Embedding dimension size
max_length = X.shape[1]  # Number of timesteps (matches the number of features from TF-IDF)

# Convert data to the correct shape (batch_size, timesteps)
X_train_lstm = X_train  # Already in 2D from TF-IDF
X_test_lstm = X_test    # Already in 2D from TF-IDF

# Build LSTM model
lstm_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embed_dim, input_length=max_length),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),  # LSTM layer with 128 units
    Dropout(0.5),
    Dense(len(encoder.classes_), activation='softmax')  # Output layer for classification
])

# Compile the model
lstm_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(lstm_model.summary())

# Train LSTM
lstm_model.fit(X_train_lstm, y_train, epochs=5, batch_size=32, validation_split=0.2)

# Evaluate LSTM
y_pred_lstm = lstm_model.predict(X_test_lstm).argmax(axis=1)
print("LSTM Evaluation:\n", classification_report(y_test, y_pred_lstm))


None
Epoch 1/5
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m250s[0m 4s/step - accuracy: 0.2133 - loss: 1.6147 - val_accuracy: 0.1958 - val_loss: 1.6133
Epoch 2/5
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m253s[0m 4s/step - accuracy: 0.1896 - loss: 1.6116 - val_accuracy: 0.2375 - val_loss: 1.6118
Epoch 3/5
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m267s[0m 4s/step - accuracy: 0.2113 - loss: 1.6097 - val_accuracy: 0.1958 - val_loss: 1.6161
Epoch 4/5
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 4s/step - accuracy: 0.2113 - loss: 1.6062 - val_accuracy: 0.2375 - val_loss: 1.6081
Epoch 5/5
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m322s[0m 5s/step - accuracy: 0.1973 - loss: 1.6107 - val_accuracy: 0.1958 - val_loss: 1.6107
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 958ms/step
LSTM Evaluation:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# **Task 3: Evaluation**

To evaluate both the Logistic Regression and LSTM models, you need to use metrics such as accuracy, precision, recall, and F1-score. These metrics help in understanding the performance of each model on the test dataset.



**Step 3.1: Import Required Libraries**

In [21]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

**`Step 3.2: Evaluate Logistic Regression`**

In [22]:
# Predictions for Logistic Regression
y_pred_lr = lr_model.predict(X_test)

# Calculate metrics for Logistic Regression
accuracy_lr = accuracy_score(y_test, y_pred_lr)
precision_lr = precision_score(y_test, y_pred_lr, average='weighted')
recall_lr = recall_score(y_test, y_pred_lr, average='weighted')
f1_lr = f1_score(y_test, y_pred_lr, average='weighted')

print("Logistic Regression Evaluation:")
print(f"Accuracy: {accuracy_lr}")
print(f"Precision: {precision_lr}")
print(f"Recall: {recall_lr}")
print(f"F1-Score: {f1_lr}")
print("\nClassification Report:\n", classification_report(y_test, y_pred_lr))


Logistic Regression Evaluation:
Accuracy: 0.995
Precision: 0.9951239669421489
Recall: 0.995
F1-Score: 0.9950024232241806

Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       118
           1       1.00      1.00      1.00       119
           2       1.00      1.00      1.00       131
           3       1.00      0.98      0.99       131
           4       1.00      1.00      1.00       101

    accuracy                           0.99       600
   macro avg       1.00      1.00      1.00       600
weighted avg       1.00      0.99      1.00       600



**Step 3.3: Evaluate LSTM**

In [23]:
# Predictions for LSTM
y_pred_lstm = lstm_model.predict(X_test_lstm).argmax(axis=1)

# Calculate metrics for LSTM
accuracy_lstm = accuracy_score(y_test, y_pred_lstm)
precision_lstm = precision_score(y_test, y_pred_lstm, average='weighted')
recall_lstm = recall_score(y_test, y_pred_lstm, average='weighted')
f1_lstm = f1_score(y_test, y_pred_lstm, average='weighted')

print("LSTM Evaluation:")
print(f"Accuracy: {accuracy_lstm}")
print(f"Precision: {precision_lstm}")
print(f"Recall: {recall_lstm}")
print(f"F1-Score: {f1_lstm}")
print("\nClassification Report:\n", classification_report(y_test, y_pred_lstm))


[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 1s/step
LSTM Evaluation:
Accuracy: 0.21833333333333332
Precision: 0.047669444444444445
Recall: 0.21833333333333332
F1-Score: 0.07825353397172824

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       118
           1       0.00      0.00      0.00       119
           2       0.22      1.00      0.36       131
           3       0.00      0.00      0.00       131
           4       0.00      0.00      0.00       101

    accuracy                           0.22       600
   macro avg       0.04      0.20      0.07       600
weighted avg       0.05      0.22      0.08       600



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**Step 3.4: Compare Results**

Create a comparison table to summarize the performance of both models.

In [24]:
# Print comparison
print("Comparison of Models:")
print(f"{'Metric':<15}{'Logistic Regression':<20}{'LSTM':<20}")
print(f"{'Accuracy':<15}{accuracy_lr:<20.2f}{accuracy_lstm:<20.2f}")
print(f"{'Precision':<15}{precision_lr:<20.2f}{precision_lstm:<20.2f}")
print(f"{'Recall':<15}{recall_lr:<20.2f}{recall_lstm:<20.2f}")
print(f"{'F1-Score':<15}{f1_lr:<20.2f}{f1_lstm:<20.2f}")

Comparison of Models:
Metric         Logistic Regression LSTM                
Accuracy       0.99                0.22                
Precision      1.00                0.05                
Recall         0.99                0.22                
F1-Score       1.00                0.08                


# **Task 4: Optimization**

**Step 4.1: Optimize Logistic Regression (Grid Search)**

Grid Search systematically tests combinations of hyperparameters to find the best-performing configuration. For Logistic Regression, we can optimize:

C: Regularization strength.

solver: Optimization algorithm.

In [26]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
    'solver': ['liblinear', 'lbfgs']  # Optimization solvers
}

# Create the GridSearchCV object
grid_search = GridSearchCV(
    estimator=LogisticRegression(max_iter=200),
    param_grid=param_grid,
    scoring='f1_weighted',  # Optimizing for F1-score
    cv=3  # 3-fold cross-validation
)

# Perform the search
grid_search.fit(X_train, y_train)

# Best hyperparameters and corresponding score
print("Best Hyperparameters for Logistic Regression:", grid_search.best_params_)
print("Best F1-Score:", grid_search.best_score_)

# Use the best model for predictions
best_lr_model = grid_search.best_estimator_
y_pred_lr_optimized = best_lr_model.predict(X_test)


Best Hyperparameters for Logistic Regression: {'C': 100, 'solver': 'liblinear'}
Best F1-Score: 0.9983338389889505
