## Data Loading

In [9]:
import pandas as pd


train_file_path = '/Users/prabaljitwalia/Downloads/AG-NEWS/train.csv'
test_file_path = '/Users/prabaljitwalia/Downloads/AG-NEWS/test.csv'

train_df = pd.read_csv(train_file_path)
test_df = pd.read_csv(test_file_path)

In [10]:
pip install tensorflow keras scikit-learn pandas numpy


Note: you may need to restart the kernel to use updated packages.


In [11]:
pip install nlpaug


Note: you may need to restart the kernel to use updated packages.


In [12]:
train_df.head()


Unnamed: 0,Class Index,Title,Description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


In [14]:
test_df.head()

Unnamed: 0,Class Index,Title,Description
0,3,Fears for T N pension after talks,Unions representing workers at Turner Newall...
1,4,The Race is On: Second Private Team Sets Launc...,"SPACE.com - TORONTO, Canada -- A second\team o..."
2,4,Ky. Company Wins Grant to Study Peptides (AP),AP - A company founded by a chemistry research...
3,4,Prediction Unit Helps Forecast Wildfires (AP),AP - It's barely dawn when Mike Fitzpatrick st...
4,4,Calif. Aims to Limit Farm-Related Smog (AP),AP - Southern California's smog-fighting agenc...


## Preprocessing

In [22]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

train_df['text'] = train_df['Title'] + ' ' + train_df['Description']
test_df['text'] = test_df['Title'] + ' ' + test_df['Description']

# Tokenization with N-grams
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_df['text'])
word_index = tokenizer.word_index


X_train_seq = tokenizer.texts_to_sequences(train_df['text'])
X_test_seq = tokenizer.texts_to_sequences(test_df['text'])

# Padding
X_train_padded = pad_sequences(X_train_seq, maxlen=500)
X_test_padded = pad_sequences(X_test_seq, maxlen=500)

# Prepare labels
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(train_df['Class Index'])
y_train = to_categorical(integer_encoded)
labels = to_categorical(integer_encoded)


# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_padded, y_train, test_size=0.2)



## Training

In [23]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Dense

VOCAB_SIZE = len(word_index) + 1
EMBEDDING_DIM = 100
MAX_SEQUENCE_LENGTH = 500

model = Sequential()
model.add(Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
model.add(Conv1D(64, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(LSTM(64))
model.add(Dense(labels.shape[1], activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()


Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 500, 100)          7033800   
                                                                 
 conv1d_1 (Conv1D)           (None, 496, 64)           32064     
                                                                 
 max_pooling1d_1 (MaxPoolin  (None, 99, 64)            0         
 g1D)                                                            
                                                                 
 lstm_1 (LSTM)               (None, 64)                33024     
                                                                 
 dense (Dense)               (None, 4)                 260       
                                                                 
Total params: 7099148 (27.08 MB)
Trainable params: 7099148 (27.08 MB)
Non-trainable params: 0 (0.00 Byte)
______________

In [24]:
history = model.fit(X_train, y_train, batch_size=32, epochs=3, validation_data=(X_val, y_val))


Epoch 1/3
Epoch 2/3
Epoch 3/3


## Evaluation

In [33]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

test_sequences = tokenizer.texts_to_sequences(test_df['text'])
X_test = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
y_test = to_categorical(label_encoder.transform(test_df['Class Index']))

test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy*100:.2f}%, Test Loss: {test_loss:.4f}")

predictions = model.predict(X_test)
predicted_classes = np.argmax(predictions, axis=1)
true_classes = np.argmax(y_test, axis=1)

print("Confusion Matrix:")
print(confusion_matrix(true_classes, predicted_classes))

print("Classification Report:")
print(classification_report(true_classes, predicted_classes))


Test Accuracy: 90.11%, Test Loss: 0.2923
Confusion Matrix:
[[1696   60   97   47]
 [  25 1831   23   21]
 [  49   14 1724  113]
 [  69   15  219 1597]]
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.89      0.91      1900
           1       0.95      0.96      0.96      1900
           2       0.84      0.91      0.87      1900
           3       0.90      0.84      0.87      1900

    accuracy                           0.90      7600
   macro avg       0.90      0.90      0.90      7600
weighted avg       0.90      0.90      0.90      7600



## CoreML
this is a technique to transform our model to coreml acceptable format for the app (We tried to use this instead of building custom backend)

In [None]:
model.save('text_classification_model.h5')


  saving_api.save_model(


In [None]:
pip install coremltools

Note: you may need to restart the kernel to use updated packages.


In [None]:
import tensorflow as tf
import coremltools as ct

# Load the Keras model
keras_model = tf.keras.models.load_model('text_classification_model.h5')

# Convert to Core ML
coreml_model = ct.convert(keras_model, source='tensorflow')
coreml_model.save('TextClassification.mlpackage')


When both 'convert_to' and 'minimum_deployment_target' not specified, 'convert_to' is set to "mlprogram" and 'minimum_deployment_targer' is set to ct.target.iOS15 (which is same as ct.target.macOS12). Note: the model will not run on systems older than iOS15/macOS12/watchOS8/tvOS15. In order to make your model run on older system, please set the 'minimum_deployment_target' to iOS14/iOS13. Details please see the link: https://coremltools.readme.io/docs/unified-conversion-api#target-conversion-formats
Running TensorFlow Graph Passes: 100%|███████| 6/6 [00:00<00:00,  9.98 passes/s]
Converting TF Frontend ==> MIL Ops:   0%|              | 0/63 [00:00<?, ? ops/s]Saving value type of int64 into a builtin type of int32, might lose precision!
Converting TF Frontend ==> MIL Ops: 100%|█| 14/14 [00:00<00:00, 126280.12 ops/s]
Input ls elem type unknown. Override with <class 'coremltools.converters.mil.mil.types.type_tensor.tensor.<locals>.tensor'>
Converting TF Frontend ==> MIL Ops: 100%|██| 41/41 

## Summarization

In [53]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import re

model_name = "t5-base"  
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  
    text = text.strip()  
    return text

def summarize_text(text, max_length=250, min_length=50, length_penalty=2.0, num_beams=4):
    text = clean_text(text) 
    input_text = "summarize: " + text
    inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)

    summary_ids = model.generate(inputs, max_length=max_length, min_length=min_length, length_penalty=length_penalty, num_beams=num_beams, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Example 
example_text = """ Sam Altman is returning to OpenAI as its chief executive, the high-profile AI startup said Wednesday, a dramatic reversal that caps an intense five days of discussions, debates and convincing following the sudden dismissal of Altman last week from the startup he co-founded.
OpenAI, which is the most valuable U.S. startup, said it has reached an “agreement in principle” for Altman’s return. The startup is also reforming its board, eliminating several members who faced intense scrutiny for their decision last week.
Former Salesforce co-chief executive Bret Taylor, former U.S. Secretary of the Treasury Larry Summers, and Quora founder Adam D’Angelo will be part of the new board at the AI startup. Taylor will serve as the chair of the board, the startup said.
Microsoft, which has invested over $11 billion in OpenAI and owns about 49% of the startup, was taken aback by OpenAI’s decision last week and rushed to hire Altman to lead a new AI group at the software conglomerate. Greg Brockman, former President of OpenAI, and countless other members of the startup resigned in protest of the earlier OpenAI board’s decision. Brockman, who had also joined Microsoft, said he was also returning to the startup.
In response to OpenAI’s move Wednesday, Altman said: “I love OpenAI, and everything I’ve done over the past few days has been in service of keeping this team and its mission together. when I decided to join Microsoft on Sunday evening, it was clear that was the best path for me and the team. With the new board and with Satya’s support, I’m looking forward to returning to OpenAI, and building on our strong partnership with Microsoft.”
Microsoft chief Satya Nadella, who also expressed disappointment in OpenAI board’s decision last week and pledged to ensure that Microsoft would never be “surprised” again, said Wednesday that he was encouraged by today’s changes to the OpenAI board.
“We believe this is a first essential step on a path to more stable, well-informed, and effective governance. Sam, Greg, and I have talked and agreed they have a key role to play along with the OAI leadership team in ensuring OAI continues to thrive and build on its mission. We look forward to building on our strong partnership and delivering the value of this next generation of AI to our customers and partners.”
Nadella said in television interviews earlier this week that he had earlier relayed to the OpenAI board of directors that Microsoft will be working with Altman and Brockman “either way.” He also didn’t rule out the possibility of Altman and Brockman returning to OpenAI and said Microsoft will remain committed to the startup, which through its ChatGPT platform has captured the attention of the world in a way very few technologies have in the past.
OpenAI isn’t only widely estimated to be leading the current AI race but also has in less than a year assumed the position of kingmaker for thousands of other startups that are building atop its software offerings. Investment in OpenAI has also supercharged Microsoft’s AI efforts, helping it court many businesses and bolstering Wall Street’s positive outlook on Microsoft’s future.
OpenAI’s earlier board — which included its chief scientist Ilya Sutskever, independent directors D’Angelo, technology entrepreneur Tasha McCauley, and Georgetown Center for Security and Emerging Technology’s Helen Toner — faced intense public scrutiny for their abrupt decision, for which they never offered a comprehensive explanation. Growing frustrated with the earlier OpenAI board, several OpenAI investors began exploring options to sue the board members, Reuters reported Tuesday.
Joshua Kushner, founder of Thrive Capital, a backer of OpenAI, who had pushed for Altman’s return, said Wednesday that the startup has the “potential to be one of the most consequential companies in the history of computing.” Altman and Brockman “possess a profound commitment to the company’s integrity, and an unmatched ability to inspire and lead. We couldn’t be more excited for them to come back to the company they founded and helped build into what it is today.”
“The resilience and strength we have seen from the entire OpenAI team in the past few days has been extraordinary, and we consider it a true honor to be their partners now and in the future. We believe this is the best outcome for the company, its employees, those who build on their technologies, and the world at large.”
Emmett Shear, the former Twitch chief executive who was appointed as interim leader of OpenAI on Sunday, said he was pleased with OpenAI’s new decision. “Coming into OpenAI, I wasn’t sure what the right path would be. This was the pathway that maximized safety alongside doing right by all stakeholders involved. I’m glad to have been a part of the solution,” he posted on X.
"""
summary = summarize_text(example_text)
print("Summary:", summary)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Summary: openAI has reached an "agreement in principle" for Altman’s return. the startup is also reforming its board, eliminating members who faced scrutiny. former Salesforce co-chief executive Bret Taylor, former secretary of the u.s. Treasury Larry Summers and Quora founder Adam D’Angelo will be part of the new board.


## Testing raw input with the model

In [30]:
def preprocess_text(raw_text, tokenizer, max_sequence_length):
    sequence = tokenizer.texts_to_sequences([raw_text])
    padded_sequence = pad_sequences(sequence, maxlen=max_sequence_length)
    return padded_sequence


In [31]:
def predict_text_class(model, processed_text, label_encoder):
    prediction = model.predict(processed_text)
    predicted_class_index = np.argmax(prediction, axis=1)
    predicted_class = label_encoder.inverse_transform(predicted_class_index)
    return predicted_class


In [37]:
input_text = "Michigana-Ohio State: Wolverines outlast Buckeyes for third win in a row against rivals"

processed_text = preprocess_text(input_text, tokenizer, MAX_SEQUENCE_LENGTH)

predicted_class = predict_text_class(model, processed_text, label_encoder)
print("Predicted class:", predicted_class[0])


Predicted class: 2
