In [10]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import set_random_seed

In [3]:

# 1. Load the labelled training headlines
# The preview below shows the columns actually present in the file:
# `Unnamed: 0`, `Id`, `Headline` and `Category`. The cells visible in this
# notebook only read `Headline` and `Category`.
df = pd.read_csv(filepath_or_buffer='data/Train.csv')
# Leave the preview as the cell's final expression so Jupyter renders it as a table.
df.head(n=5)

Unnamed: 0,Id,Headline,Category
0,1,Breakthrough discovery in cyber research,Science
1,2,Government focuses on youth development programs,Society
2,3,health party wins majority in assembly elections,Politics
3,4,Startups in digital sector attract global inve...,Business
4,5,Sensex surges by technology points amid market...,Business


In [5]:
# Separate the model input (headline text) from the target (category label).
X_train_raw, y_train_raw = df['Headline'], df['Category']

In [6]:
# 2. Encode the labels (category names -> integer ids)
# Fit the encoder first, then transform. The fitted `label_encoder` is kept
# because it is used later to map predicted ids back to category names.
label_encoder = LabelEncoder().fit(y_train_raw)
y_train = label_encoder.transform(y_train_raw)

In [None]:
# 3. Vectorize the text (headlines -> TF-IDF matrix)
# The vocabulary is capped at 5000 terms and English stop words are dropped,
# which keeps the feature matrix small enough for quick CPU training.
tfidf = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
)
# The vectorizer returns a sparse matrix; it is densified here before being fed to Keras.
X_train = tfidf.fit_transform(X_train_raw).toarray()

In [None]:
# 4. Build the Neural Network
# Seed Python, NumPy and the Keras backend before any layer is created, so the
# initial weights are the same every time this cell is (re-)run.
set_random_seed(42)

## Design the model layers
model = Sequential([
    # Input width = number of columns in the TF-IDF matrix
    Input(shape=(X_train.shape[1],)),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    # Output layer: one softmax unit per unique category
    Dense(len(label_encoder.classes_), activation='softmax'),
])

## Compile the model
# `sparse_categorical_crossentropy` is used because `y_train` holds the integer
# class ids produced by the LabelEncoder rather than one-hot vectors.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [13]:
# 5. Train the model
# Keep the returned History object: `history.history` holds the per-epoch
# loss/accuracy values, which are otherwise only visible in the progress log.
# NOTE(review): per the Keras docs, `validation_split` holds out the *last* 10% of
# the rows (taken before shuffling) - if Train.csv is ordered, confirm that this
# tail is representative of all categories.
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.1,
)

Epoch 1/10
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8931 - loss: 1.1157 - val_accuracy: 1.0000 - val_loss: 0.0454
Epoch 2/10
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 1.0000 - loss: 0.0167 - val_accuracy: 1.0000 - val_loss: 0.0066
Epoch 3/10
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 1.0000 - loss: 0.0043 - val_accuracy: 1.0000 - val_loss: 0.0027
Epoch 4/10
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 1.0000 - loss: 0.0021 - val_accuracy: 1.0000 - val_loss: 0.0015
Epoch 5/10
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 1.0000 - loss: 0.0012 - val_accuracy: 1.0000 - val_loss: 9.5639e-04
Epoch 6/10
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 1.0000 - loss: 8.0095e-04 - val_accuracy: 1.0000 - val_loss: 6.5514e-04
Epoch 7/10
[1m1

<keras.src.callbacks.history.History at 0x79fbf0243390>

In [14]:
# 6. Vectorize the test headlines
# Reuse the TF-IDF vectorizer fitted on the training data (transform only, no
# re-fit) so the test matrix has the same columns the network was trained on.
# NOTE(review): the results preview at the end of the notebook shows text such as
# "Indiaâ€™s", which looks like mis-decoded UTF-8 - confirm the CSV encoding.
X_test_raw = pd.read_csv(filepath_or_buffer='data/Test.csv').loc[:, 'Headline']
X_test = tfidf.transform(X_test_raw).toarray()

In [20]:
# 7. Predict the categories of the new headlines
# `model.predict` yields one row of softmax scores per headline; `argmax` over
# axis 1 picks the highest-scoring class id, and the label encoder turns those
# ids back into category names.
predictions = label_encoder.inverse_transform(
    model.predict(X_test).argmax(axis=1)
)

[1m 1/63[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 14ms/step

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 983us/step


In [18]:
# Quick look at the decoded labels (the array repr abbreviates the middle with `...`).
print(str(predictions))

['Business' 'Politics' 'Education' ... 'Technology' 'Science' 'Sports']


In [21]:
# 8. Combine X_test_raw and predictions into a single pandas DataFrame
results_df = pd.DataFrame(
    data={
        'Headline': X_test_raw,
        'Predicted Category': predictions,
    }
)
print(results_df)

                                               Headline Predicted Category
0              RBI announces new women policy framework           Business
1               PM inaugurates climate project in Delhi           Politics
2     Government launches finance scholarship for st...          Education
3      WHO praises Indiaâ€™s efforts in combating space             Health
4     Indian music festival in sports draws massive ...      Entertainment
...                                                 ...                ...
1995    Indian startup develops river app for education         Technology
1996      CSIR scientists discover new finance compound            Science
1997  Infosys partners with digital for technology a...         Technology
1998       CSIR scientists discover new health compound            Science
1999  Indian womenâ€™s team secures victory in women...             Sports

[2000 rows x 2 columns]
