In [51]:
# Import libraries
import pandas as pd
import joblib 
import numpy as np
from imblearn.over_sampling import RandomOverSampler
import sklearn
from sklearn.model_selection import train_test_split

In [52]:
# Load json file
df = pd.read_json('News_Category_Dataset_v3.json', lines=True)

# Keep only relevant columns
df = df[['headline', 'category']]

# Factorize category_id
df['category_id'], categories = pd.factorize(df['category'])
df['category_id'] = df['category_id'] + 1

In [53]:
# Oversample the minority class to address class imbalance
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(df[['headline']], df['category_id'])

# Get the category needed for testing
X = X_resampled['headline']
y = y_resampled

# Test train split
X_train, X_test, y_train, y_test = \
train_test_split(X, y, test_size = 0.30, random_state = 90)
print(X_train.shape)
print(X_test.shape)

(1046698,)
(448586,)


In [54]:
# Load the model
lr = joblib.load('lr_model.joblib')

In [55]:
# Test model on random healdines (only one category)
y_test = ["Biden to Sign Executive Order That Aims to Make Child Care Cheaper",
       "Google Stock Loses $57 Billion Amid Microsoft's AI 'Lead'—And \
       Reports It Could Be Replaced By Bing On Some Smartphones",
       "Poland suspends food imports from Ukraine to assist its farmers",
       "Can AI Solve The Air Traffic Control Problem? Let's Find Out",
       "Woman From Odisha Runs 42.5 KM In UK Marathon Wearing A Saree",
       "Hillary Clinton: Trump cannot win the election - but Biden will",
       "Jennifer Aniston and Adam Sandler starrer movie 'Murder Mystery 2' \
       got released on March 24, this year"]

y_pred = lr.predict(y_test)
     
for i in range(len(y_test)): 
    print(f"{categories[y_pred[i]-1]} : {y_test[i]}")

POLITICS : Biden to Sign Executive Order That Aims to Make Child Care Cheaper
TECH : Google Stock Loses $57 Billion Amid Microsoft's AI 'Lead'—And        Reports It Could Be Replaced By Bing On Some Smartphones
WORLD NEWS : Poland suspends food imports from Ukraine to assist its farmers
TECH : Can AI Solve The Air Traffic Control Problem? Let's Find Out
WORLD NEWS : Woman From Odisha Runs 42.5 KM In UK Marathon Wearing A Saree
POLITICS : Hillary Clinton: Trump cannot win the election - but Biden will
ENTERTAINMENT : Jennifer Aniston and Adam Sandler starrer movie 'Murder Mystery 2'        got released on March 24, this year


In [56]:
# Test model on random healdines (three categories with match percentage)
y_pred_prob = lr.predict_proba(y_test)

# Get the top three categories and probabilities for each prediction
top_three_indices = np.argsort(y_pred_prob, axis=1)[:, -3:]
top_three_probs = np.sort(y_pred_prob, axis=1)[:, -3:]
top_three_categories = categories[top_three_indices]

# Print the results for each prediction
for j in range(len(y_test)):
    print(f"Headline: {y_test[j]}")
    for i in range(2, -1, -1):
        print(f"{top_three_categories[j, i]}: {top_three_probs[j, i] * 100:.2f}%")
    print()

Headline: Biden to Sign Executive Order That Aims to Make Child Care Cheaper
POLITICS: 41.11%
U.S. NEWS: 8.69%
PARENTING: 4.86%

Headline: Google Stock Loses $57 Billion Amid Microsoft's AI 'Lead'—And        Reports It Could Be Replaced By Bing On Some Smartphones
TECH: 99.71%
BUSINESS: 0.09%
THE WORLDPOST: 0.02%

Headline: Poland suspends food imports from Ukraine to assist its farmers
WORLD NEWS: 74.80%
WORLDPOST: 9.81%
TASTE: 2.94%

Headline: Can AI Solve The Air Traffic Control Problem? Let's Find Out
TECH: 11.17%
HEALTHY LIVING: 9.19%
SCIENCE: 8.60%

Headline: Woman From Odisha Runs 42.5 KM In UK Marathon Wearing A Saree
WORLD NEWS: 25.15%
SPORTS: 22.70%
WEIRD NEWS: 19.28%

Headline: Hillary Clinton: Trump cannot win the election - but Biden will
POLITICS: 99.79%
WOMEN: 0.12%
COMEDY: 0.02%

Headline: Jennifer Aniston and Adam Sandler starrer movie 'Murder Mystery 2'        got released on March 24, this year
ENTERTAINMENT: 95.40%
CRIME: 0.76%
COMEDY: 0.52%



  top_three_categories = categories[top_three_indices]


In [57]:
# Test model on random headlines (categories with more than 5% probability)
y_pred_prob = lr.predict_proba(y_test)

# Get the categories and probabilities for each prediction that are above 5%
above_five_indices = np.argwhere(y_pred_prob > 0.05)
above_five_probs = y_pred_prob[above_five_indices[:, 0], above_five_indices[:, 1]]
above_five_categories = categories[above_five_indices[:, 1]]

# Print the results for each prediction
for j in range(len(y_test)):
    print(f"Headline: {y_test[j]}")
    print(f"Categories and probabilities:")
    # Find the indices of the categories and probabilities for the current prediction
    current_indices = np.where(above_five_indices[:, 0] == j)[0]
    # Sort them in descending order of probability
    sorted_indices = np.argsort(above_five_probs[current_indices])[::-1]
    # Print them in order
    for i in sorted_indices:
        print(f"{above_five_categories[current_indices[i]]}: {above_five_probs[current_indices[i]] * 100:.2f}%")
    print()

Headline: Biden to Sign Executive Order That Aims to Make Child Care Cheaper
Categories and probabilities:
POLITICS: 41.11%
U.S. NEWS: 8.69%

Headline: Google Stock Loses $57 Billion Amid Microsoft's AI 'Lead'—And        Reports It Could Be Replaced By Bing On Some Smartphones
Categories and probabilities:
TECH: 99.71%

Headline: Poland suspends food imports from Ukraine to assist its farmers
Categories and probabilities:
WORLD NEWS: 74.80%
WORLDPOST: 9.81%

Headline: Can AI Solve The Air Traffic Control Problem? Let's Find Out
Categories and probabilities:
TECH: 11.17%
HEALTHY LIVING: 9.19%
SCIENCE: 8.60%
BUSINESS: 7.98%
WEIRD NEWS: 7.48%
WOMEN: 5.96%
WORLD NEWS: 5.77%

Headline: Woman From Odisha Runs 42.5 KM In UK Marathon Wearing A Saree
Categories and probabilities:
WORLD NEWS: 25.15%
SPORTS: 22.70%
WEIRD NEWS: 19.28%
WOMEN: 6.12%

Headline: Hillary Clinton: Trump cannot win the election - but Biden will
Categories and probabilities:
POLITICS: 99.79%

Headline: Jennifer Aniston an