In [None]:
# Import libraries
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import joblib 
import pandas as pd

In [None]:
# Load the dataset; lines=True reads the file as one JSON record per line
df = pd.read_json('News_Category_Dataset_v3.json', lines=True)

# Keep only the relevant columns and drop incomplete rows BEFORE encoding.
# Encoding first would derive label ids from rows that are discarded
# afterwards and would give missing categories the factorize sentinel.
df = df[['headline', 'category']].dropna()

# Encode each category as an integer id. `categories` keeps the original
# names in id order; ids are shifted to start at 1, so
# categories[category_id - 1] recovers the category name.
df['category_id'], categories = pd.factorize(df['category'])
df['category_id'] = df['category_id'] + 1

In [None]:
# Features (headline text) and labels (integer category ids)
X = df['headline']
y = df['category_id']

# Create the test/train split BEFORE oversampling. Oversampling the whole
# dataset first duplicates rows, and those duplicates then end up on both
# sides of the split: the model is tested on headlines it was trained on and
# the reported accuracy is inflated.
X_train_raw, X_test, y_train_raw, y_test = \
train_test_split(X, y, test_size = 0.30, random_state = 90)

# Oversample the minority classes in the training partition only, so the
# test set keeps the original class distribution and contains no copies of
# training rows. The sampler needs 2-D input, hence the one-column frame.
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train_raw.to_frame(), y_train_raw)

# Back to a 1-D series of headlines, which is what the text vectorizer expects
X_train = X_resampled['headline']
y_train = y_resampled

print(X_train.shape)
print(X_test.shape)

In [None]:
# Text-classification pipeline: the 'cv' step turns headlines into TF-IDF
# features and the 'clf' step is a logistic regression with max_iter=10000.
lr = Pipeline(
    steps=[
        ('cv', TfidfVectorizer()),
        ('clf', LogisticRegression(max_iter=10000)),
    ]
)

# Fit both stages on the training headlines and their category ids
lr.fit(X_train, y_train)

In [None]:
# Predict a category id for every held-out headline
y_pred = lr.predict(X_test)

# Report the fraction of correct predictions. Arguments follow the documented
# (y_true, y_pred) order; accuracy itself is symmetric, but keeping the
# order right matters as soon as a non-symmetric metric is used here.
print(f"Accuracy is: {accuracy_score(y_test, y_pred)}")

In [None]:
# Persist the fitted pipeline to disk.
# NOTE: only `lr` is written; the `categories` lookup needed to turn
# predicted ids back into names is not part of this file.
joblib.dump(value=lr, filename='lr_model.joblib')

In [None]:
# Test the model on a handful of unseen headlines.
# Long headlines are split with adjacent string literals, which Python joins
# at compile time. A backslash-newline inside the quotes would instead embed
# the next line's indentation in the headline text.
news = ["Biden to Sign Executive Order That Aims to Make Child Care Cheaper",
        "Google Stock Loses $57 Billion Amid Microsoft's AI 'Lead'—And "
        "Reports It Could Be Replaced By Bing On Some Smartphones",
        "Poland suspends food imports from Ukraine to assist its farmers",
        "Can AI Solve The Air Traffic Control Problem? Let's Find Out",
        "Woman From Odisha Runs 42.5 KM In UK Marathon Wearing A Saree",
        "Hillary Clinton: Trump cannot win the election - but Biden will",
        "Jennifer Aniston and Adam Sandler starrer movie 'Murder Mystery 2' "
        "got released on March 24, this year"]

predicted = lr.predict(news)

# Category ids are 1-based, so subtract 1 to index into `categories`
for category_id, headline in zip(predicted, news):
    print(f"{categories[category_id - 1]} : {headline}")