# NLP Task1: News Category Prediction

For news category prediction, I implemented three models:

*   Logistic Regression
*   Naive Bayes
*   SVM classifier



In [2]:
# !pip install transformers

In [3]:
# import libraries:

import json
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from transformers import TFBertForSequenceClassification, BertTokenizer
import tensorflow as tf

In [4]:
# Path to the JSON file
json_file = '/content/sample_data/News_Category_Dataset_v3.json'

# The file is read as JSON Lines: one JSON object per line.
# Malformed lines are counted (not silently dropped) so that a truncated or
# corrupted file is visible to the reader instead of quietly shrinking the data.
records = []
skipped_lines = 0
with open(json_file, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if not line:
            # Blank lines carry no record and are not an error
            continue
        try:
            records.append(json.loads(line))
        except json.JSONDecodeError:
            skipped_lines += 1

if skipped_lines:
    print(f"Warning: skipped {skipped_lines} malformed line(s) in {json_file}")

# Convert the list of JSON objects to a DataFrame (nested keys are flattened)
data = pd.json_normalize(records)
data.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


Data Pre-processing

In [5]:
# Inspect the dataset dimensions as (rows, columns)
data.shape

(52196, 6)

In [6]:
# List the column names available in the loaded dataset
data.columns

Index(['link', 'headline', 'category', 'short_description', 'authors', 'date'], dtype='object')

In [7]:
# Count the distinct values in the 'category' column, i.e. the number of target classes
data['category'].nunique()

41

Feature Selection

In [8]:
# Keep only the model input (headline text) and the target (category).
# Note: this rebinds `data`, so the remaining columns are no longer available below.
data = data.loc[:, ['headline', 'category']]

## Logistic Regression

In [24]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [25]:
# Targets: the category strings are passed to the classifier unchanged
y_train = train_data['category']
y_test = test_data['category']

# Features: TF-IDF weights over the headline text. The vocabulary is fitted on
# the training headlines only and then reused to transform the test headlines.
vectorizer = TfidfVectorizer(max_features=10000)  # cap the vocabulary size at 10,000 terms
X_train = vectorizer.fit_transform(train_data['headline'])
X_test = vectorizer.transform(test_data['headline'])

In [26]:
# Build and train the logistic regression model on the TF-IDF features.
# max_iter is set to 1000 so the solver is allowed more iterations to converge.
model = LogisticRegression(max_iter=1000)  # Increase the maximum number of iterations
model.fit(X_train, y_train)

In [27]:
# Predict a category for every headline in the held-out test split
y_pred = model.predict(X_test)

In [28]:
# Side-by-side view of actual vs. predicted categories for the test split.
# `assign` returns a new frame, so `test_data` itself is left untouched.
results = test_data.assign(Predicted=y_pred)

In [29]:
# Fraction of test headlines whose predicted category matches the true one
accuracy = accuracy_score(y_test, y_pred)
accuracy_pct = accuracy * 100
print("Model Accuracy:{:.2f}%".format(accuracy_pct))

Model Accuracy:55.71%


## Naive Bayes

In [None]:
# Hold out 20% of the rows for evaluation. Same test size and seed as the
# logistic regression section, so both models are evaluated on the same split.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Tokenization and feature extraction: bag-of-words term counts.
# The vocabulary is fitted on the training headlines only and then reused
# to transform the test headlines.
vectorizer = CountVectorizer()
train_features = vectorizer.fit_transform(train_data['headline'])
test_features = vectorizer.transform(test_data['headline'])

In [None]:
# Encoding the labels: every category seen in the training split gets an
# integer id equal to its position in `label_list` (order of first appearance).
label_list = train_data['category'].unique().tolist()
label_map = dict(zip(label_list, range(len(label_list))))


def get_label(label):
    """Return the integer id of ``label``, or -1 when it is not in ``label_map``.

    The -1 sentinel covers categories that appear in the test split but were
    never seen in the training split.
    """
    return label_map.get(label, -1)


train_labels = np.array(list(map(get_label, train_data['category'])))
test_labels = np.array(list(map(get_label, test_data['category'])))

In [None]:
# Creating the Naive Bayes model
# NOTE: this rebinds `model`, the same name the logistic regression section uses.
model = MultinomialNB()

# Train the model on the bag-of-words counts and the integer-encoded labels
model.fit(train_features, train_labels)

In [None]:
# Predict integer label ids for the test headlines
predictions = model.predict(test_features)

# Map each predicted id back to its category name via its position in label_list
predicted_labels = [label_list[label_id] for label_id in predictions]

In [None]:
# Side-by-side view of actual vs. predicted categories for the test split.
# `assign` returns a new frame, so `test_data` itself is left untouched.
results_df = test_data.assign(Predicted=predicted_labels)

In [None]:
# Accuracy is computed on the integer-encoded labels: true ids vs. predicted ids
accuracy = accuracy_score(test_labels, predictions)
accuracy_pct = accuracy * 100
print("Model Accuracy:{:.2f}%".format(accuracy_pct))

Model Accuracy:55.21%


## SVM

In [9]:
# Hold out 20% of the rows for evaluation. Same test size and seed as the
# other model sections, so all models are evaluated on the same split.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Tokenization and feature extraction: bag-of-words term counts.
# The vocabulary is fitted on the training headlines only and then reused
# to transform the test headlines.
vectorizer = CountVectorizer()
train_features = vectorizer.fit_transform(train_data['headline'])
test_features = vectorizer.transform(test_data['headline'])

In [11]:
# Encoding the labels: every category seen in the training split gets an
# integer id equal to its position in `label_list` (order of first appearance).
# This cell is self-contained so the SVM section can be run on its own.
label_list = train_data['category'].unique().tolist()
label_map = dict(zip(label_list, range(len(label_list))))


def get_label(label):
    """Return the integer id of ``label``, or -1 when it is not in ``label_map``.

    The -1 sentinel covers categories that appear in the test split but were
    never seen in the training split.
    """
    return label_map.get(label, -1)


train_labels = np.array(list(map(get_label, train_data['category'])))
test_labels = np.array(list(map(get_label, test_data['category'])))

In [12]:
# Creating the SVM model with a linear kernel
# NOTE(review): scikit-learn's SVC docs warn that fit time grows at least
# quadratically with the number of samples and suggest LinearSVC for large
# datasets -- consider it if this cell is slow.
svm_model = SVC(kernel='linear')

# Train the model on the bag-of-words counts and the integer-encoded labels
svm_model.fit(train_features, train_labels)

In [13]:
# Predict integer label ids for the test headlines
predictions = svm_model.predict(test_features)

# Map each predicted id back to its category name via its position in label_list
predicted_labels = [label_list[label_id] for label_id in predictions]

In [14]:
# Side-by-side view of actual vs. predicted categories for the test split.
# `assign` returns a new frame, so `test_data` itself is left untouched.
results_df = test_data.assign(Predicted=predicted_labels)
results_df

Unnamed: 0,headline,category,Predicted
24672,Jerry Seinfeld Has Something To Say About A 'S...,ENTERTAINMENT,ENTERTAINMENT
50616,Green Day Rocks The American Music Awards With...,ENTERTAINMENT,ENTERTAINMENT
40627,Reality Check: What Vouchers Can -- And Can’t ...,POLITICS,POLITICS
47162,Scientists Stumped By Thousands Of Dead Fish O...,GREEN,WORLD NEWS
36396,Looks Like Liam Payne And Cheryl Cole Named Th...,ENTERTAINMENT,ENTERTAINMENT
...,...,...,...
32242,"'We Mourn, We Fight, We Love': Moving Forward ...",QUEER VOICES,QUEER VOICES
43033,This Golden Retriever Has An Adorable Obsession,WEIRD NEWS,ENTERTAINMENT
23407,Radical Self-Care: 6 Ways Activism Is Good For...,IMPACT,HEALTHY LIVING
20052,Harrison Ford Helps Rescue Woman After She Cra...,ENTERTAINMENT,CRIME


In [52]:
# Accuracy is computed on category names: true category vs. decoded prediction
accuracy = accuracy_score(test_data['category'], results_df['Predicted'])
accuracy_pct = accuracy * 100
print("Accuracy: {:.2f}%".format(accuracy_pct))

Accuracy: 54.25%


### Insights:

Accuracy for above models

> Logistic Regression - 55.71 %,

> Naive Bayes - 55.21 %,

> SVM - 54.25 %

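*   Logistic regression achieved slightly higher accuracy than Naive Bayes and SVM, although all three results are within about 1.5 percentage points of each other. Note that logistic regression is itself a linear model, and it was trained on TF-IDF features (capped at 10,000 terms) while the other two models used raw word counts, so the gap may reflect the feature representation as much as the classifier.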



To improve the accuracies:



*   Advanced text preprocessing techniques can be used, such as stemming, lemmatization, or removing stop words, to enhance the quality of the input.
*   Experiment with different feature extraction methods, such as applying TF-IDF to all three models (currently only logistic regression uses it) or using word embeddings, to capture more meaningful representations of the headlines.
*   Use more advanced machine learning algorithms, such as random forests or gradient boosting, which may capture more complex patterns in the data.



