In [2]:
import json
import numpy as np
import pandas as pd
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
# loading the data
def load_data(path):
    """Read the CSV file at ``path`` and return it as a pandas DataFrame."""
    return pd.read_csv(path)
# Normalize text: lowercase, then strip punctuation and digits
def preprocess_text(text):
    """Normalize ``text`` before vectorization.

    Non-string input is first converted with ``str()``. The text is then
    lowercased, every character listed in ``string.punctuation`` is deleted
    (not replaced by a space, so neighbouring words may fuse), and finally
    every run of digits is deleted.

    Returns the normalized string.
    """
    raw = text if isinstance(text, str) else str(text)
    # One translate pass deletes all punctuation characters at once.
    punctuation_deleter = str.maketrans("", "", string.punctuation)
    without_punctuation = raw.lower().translate(punctuation_deleter)
    return re.sub(r"\d+", "", without_punctuation)

def combine_text_columns(df, text_columns):
    """Join several text columns of ``df`` into one space-separated Series.

    Args:
        df: DataFrame holding the columns to combine.
        text_columns: Non-empty sequence of column names, joined in order.

    Returns:
        A new Series (``df`` is not modified) whose values are the columns'
        string forms joined by single spaces. The Series keeps the name of
        the first column.

    Raises:
        IndexError: If ``text_columns`` is empty.
    """
    def _as_text(series):
        # Missing values become empty text; a plain astype(str) would turn
        # them into the literal tokens "nan"/"None" and pollute the corpus.
        return series.astype(object).where(series.notna(), "").astype(str)

    parts = [_as_text(df[col]) for col in text_columns]
    combined = parts[0]
    for extra in parts[1:]:
        # Build a new Series each step instead of accumulating in place.
        combined = combined + " " + extra
    return combined.rename(parts[0].name)
# Vectorize the texts with TF-IDF; adjust max_features as needed
def vectorize_texts(texts, max_features=10000):
    """Fit a TF-IDF vectorizer on ``texts`` and transform them.

    Args:
        texts: Iterable of documents (strings) to fit on and transform.
        max_features: Value passed to ``TfidfVectorizer(max_features=...)``.
            Defaults to 10000, the value previously hard-coded here.

    Returns:
        A tuple ``(X, vectorizer)``: ``X`` is the result of
        ``fit_transform(texts)`` and ``vectorizer`` is the fitted
        ``TfidfVectorizer``, to be reused for transforming new texts.
    """
    vectorizer = TfidfVectorizer(max_features=max_features)
    X = vectorizer.fit_transform(texts)
    return X, vectorizer
# Model for News Article Categorization
def build_model(input_dim, num_classes):
    """Build and compile the dense classifier used for categorization.

    The network is Dense(256, relu) -> Dropout(0.5) -> Dense(128, relu)
    -> Dropout(0.5) -> Dense(num_classes, softmax), compiled with the Adam
    optimizer, categorical cross-entropy loss and an accuracy metric.

    Args:
        input_dim: Number of input features per sample.
        num_classes: Number of output units of the softmax layer.

    Returns:
        The compiled ``Sequential`` model.
    """
    layer_stack = [
        Dense(256, input_shape=(input_dim,), activation='relu'),
        Dropout(0.5),  # dropout to reduce overfitting
        Dense(128, activation='relu'),
        Dropout(0.5),  # dropout to reduce overfitting
        Dense(num_classes, activation='softmax'),
    ]
    classifier = Sequential(layer_stack)
    classifier.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return classifier

def evaluate_model(model, X_test, y_test):
    """Evaluate ``model`` on the given data and print its accuracy.

    ``model.evaluate(X_test, y_test)`` must return a ``(loss, accuracy)``
    pair; the accuracy is printed as a percentage with two decimals.
    Nothing is returned.
    """
    _loss, acc = model.evaluate(X_test, y_test)
    percent = acc * 100
    print(f'Test Accuracy: {percent:.2f}%')

def predict_category(model, vectorizer, texts, label_encoder):
    """Predict a category label for each entry of ``texts``.

    The texts are transformed with ``vectorizer`` and densified, scored with
    ``model.predict``, and the highest-scoring class index of each row is
    mapped back to its label via ``label_encoder.inverse_transform``.

    Returns whatever ``inverse_transform`` returns: one label per text.
    """
    features = vectorizer.transform(texts).toarray()
    scores = model.predict(features)
    best_class_ids = scores.argmax(axis=1)
    return label_encoder.inverse_transform(best_class_ids)

def main(data_path='Task1/news-article-categories.csv'):
    """Train the news categorizer and serve predictions interactively.

    Loads the CSV at ``data_path``, normalizes and combines the ``title``
    and ``body`` columns, fits a TF-IDF vectorizer, trains the dense model
    on an 80/20 split, prints the test accuracy and then reads article
    text from stdin, printing one JSON object per prediction. The loop
    stops when the user types ``exit`` or stdin is closed.

    Args:
        data_path: Path of the CSV file holding ``title``, ``body`` and
            ``category`` columns.
    """
    # Load the data
    df = load_data(data_path)

    # Specify the text columns and label column
    text_columns = ['title', 'body']
    label_column = 'category'

    # Preprocess every text column exactly once
    for col in text_columns:
        df[col] = df[col].apply(preprocess_text)

    # Combine the text columns
    combined_texts = combine_text_columns(df, text_columns)

    # Vectorize the texts using TF-IDF
    X, vectorizer = vectorize_texts(combined_texts)

    # Encode labels. NOTE: labels go through preprocess_text too, so
    # punctuation inside a category name is stripped from reported labels.
    label_encoder = LabelEncoder()
    labels = df[label_column].apply(preprocess_text)
    y = to_categorical(label_encoder.fit_transform(labels))

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Convert sparse matrices to dense
    X_train = X_train.toarray()
    X_test = X_test.toarray()

    # Build and train model.
    # NOTE: the test split doubles as validation data, so the per-epoch
    # validation metrics are computed on the same data as the final score.
    model = build_model(X_train.shape[1], y_train.shape[1])
    model.fit(
        X_train,
        y_train,
        validation_data=(X_test, y_test),
        epochs=5,
        batch_size=64,
    )

    # Evaluate model
    evaluate_model(model, X_test, y_test)

    # CLI for predicting categories
    while True:
        try:
            input_text = input("Enter news article text (or type 'exit' to quit): ")
        except EOFError:
            # stdin was closed (e.g. piped input ran out): stop cleanly.
            break
        if input_text.strip().lower() == 'exit':
            break
        # Apply the same normalization used on the training text so the
        # vectorizer sees tokens in the form it was fitted on.
        cleaned_text = preprocess_text(input_text)
        predicted_category = predict_category(
            model, vectorizer, [cleaned_text], label_encoder
        )
        print(json.dumps({'text': input_text, 'category': predicted_category[0]}))

if __name__ == '__main__':
    main()

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Accuracy: 82.49%
{"text": "With the presentation of the interim budget on February 1", "category": "entertainment"}
{"text": "Desai, as finance minister, had presented five annual budgets and one interim budget between 1959-1964", "category": "arts  culture"}
{"text": "ATMs to become virtual bank branches, accept deposits with instant credit", "category": "business"}
{"text": "Allan Border opens up about Parkinson\u2019s disease: \u2018I\u2019m not scared, but I am worried about the slow decline process\u2019\",\"Former Australia captain says the disease has softened him a bit, which is embarrassing and good at the same time.", "category": "comedy"}
{"text": "Unlocking the science of E Ink displays: Why we believe they must catch on", "category": "science"}
