<a href="https://colab.research.google.com/github/MrudulaJujjuru/AI_ML/blob/master/nlp/news_categorizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **News Article Categorizer**
*   Media Monitoring: Quickly track news on specific topics.
*   Content Recommendations: Recommend articles based on users' interests.
*   Sentiment Analysis: Determine public sentiment towards political events, companies, etc.



# Step 1: Importing Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import io
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Step 2: Loading the Dataset from CSV file

In [3]:
# --- Optional (Google Colab): upload the dataset from your machine ----------
# Use the CSV stored in the Data folder: download it as a local file, then
# uncomment the two lines below to upload it into the Colab session.
# from google.colab import files
# uploaded = files.upload()

# --- Optional: delete a previously uploaded copy ----------------------------
# If there is any issue with the uploaded file, uncomment this block to
# remove it before uploading again.
# import os
# file_to_delete = "/content/bbc-news-data.csv"
# if os.path.exists(file_to_delete):
#     os.remove(file_to_delete)
#     print(f"File '{file_to_delete}' deleted.")
# else:
#     print(f"File '{file_to_delete}' not found.")

# Location of the dataset. Adjust this single constant if the file lives
# somewhere other than /content (e.g. when running outside Colab).
DATA_PATH = '/content/bbc-news-data.csv'

# The file is tab-separated, hence sep='\t' instead of the default comma.
df = pd.read_csv(DATA_PATH, sep='\t')

# Last expression of the cell: displays the distinct category labels.
df["category"].unique()


Saving bbc-news-data.csv to bbc-news-data.csv


array(['business', 'entertainment', 'politics', 'sport', 'tech'],
      dtype=object)

# Step 3: Downloading NLTK Resources
*   punkt
*   stopwords
*   punkt_tab




In [4]:
# Tokenizer data; this notebook tokenizes text with word_tokenize.
nltk.download('punkt')
# Stop-word lists; the preprocessing step reads stopwords.words('english').
nltk.download('stopwords')
# Additional tokenizer data (unzipped under tokenizers/, like punkt);
# presumably required by word_tokenize on newer NLTK releases — TODO confirm.
# This call is the cell's last expression, so its return value is what the
# cell displays.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

# Step 4: Preprocessing the Text


*   Tokenization
*   Stopword Removal



In [5]:
# English stop-word set, loaded lazily on first use and then reused so the
# corpus is not re-read for every document.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text):
    """Tokenize a document and drop non-alphabetic tokens and stop words.

    The text is lower-cased, split with NLTK's ``word_tokenize``, and then
    filtered so that only purely alphabetic tokens that are not English
    stop words remain.

    Args:
        text: The raw document as a string. Missing values (``None`` or a
            float NaN, as pandas uses for empty cells) are accepted.

    Returns:
        A list of lower-cased tokens in their original order. An empty list
        is returned for missing values.

    Raises:
        AttributeError: If ``text`` is neither a string nor a missing value
            (it has no ``.lower()``), as before.
    """
    # `text != text` is only true for NaN; treat missing cells as empty docs
    # instead of failing on `.lower()`.
    if text is None or (isinstance(text, float) and text != text):
        return []

    stop_words = _english_stop_words()
    return [
        word
        for word in word_tokenize(text.lower())
        if word.isalpha() and word not in stop_words
    ]

# Step 5: Applying Preprocessing to Dataset

In [6]:
# Run the cleaning function over every article body and keep the resulting
# token lists next to the raw text.
token_lists = df['content'].apply(preprocess_text)
df['processed_content'] = token_lists

# Preview the first few token lists.
df['processed_content'].head()

Unnamed: 0,processed_content
0,"[quarterly, profits, us, media, giant, timewar..."
1,"[dollar, hit, highest, level, euro, almost, th..."
2,"[owners, embattled, russian, oil, giant, yukos..."
3,"[british, airways, blamed, high, fuel, prices,..."
4,"[shares, uk, drinks, food, firm, allied, domec..."


## Step 6: Text Vectorization

In [7]:
# Vectorize Text Using BOW
# Both vectorizers expect one string per document, so the token lists are
# glued back together with single spaces first.
joined_documents = df['processed_content'].apply(' '.join)

# NOTE(review): both vectorizers are fit on every row of `df`, i.e. before
# any train/test split, so the vocabulary / IDF statistics also see rows that
# later end up in the test set. Kept as-is to preserve the reported results.

# Bag-of-words: raw term counts.
bow_vectorizer = CountVectorizer()
X_bow = bow_vectorizer.fit_transform(joined_documents)

# TF-IDF: term counts re-weighted by inverse document frequency.
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(joined_documents)

## Step 7: Training our Models

In [10]:
#Training BOW model
# Split both feature matrices and the labels in ONE call. train_test_split
# applies the same row partition to every array it is given, so the BoW
# rows, the TF-IDF rows and the labels are guaranteed to stay aligned.
# (Previously the TF-IDF matrix was split in a separate call and paired with
# y_train from the first call, which was only correct because both calls
# happened to share random_state and row count.)
(
    X_train_bow, X_test_bow,
    X_train_tfidf, X_test_tfidf,
    y_train, y_test,
) = train_test_split(
    X_bow, X_tfidf, df['category'],
    test_size=0.2, random_state=42,
)

# Training BOW model
nb_model1 = MultinomialNB()
nb_model1.fit(X_train_bow, y_train)

# Training TF-IDF model
nb_model2 = MultinomialNB()
nb_model2.fit(X_train_tfidf, y_train)

## Step 8: Comparing Both Models

In [11]:
# Predict the held-out labels with each model.
y_pred_bow = nb_model1.predict(X_test_bow)
y_pred_tfidf = nb_model2.predict(X_test_tfidf)

# Print one per-class precision/recall/F1 report per model, BoW first.
reports = (
    ("BoW Model Performance:\n", y_pred_bow),
    ("TF-IDF Model Performance:\n", y_pred_tfidf),
)
for heading, predictions in reports:
    print(heading, classification_report(y_test, predictions))

BoW Model Performance:
                precision    recall  f1-score   support

     business       0.97      0.96      0.96       115
entertainment       0.99      0.94      0.96        72
     politics       0.94      0.97      0.95        76
        sport       1.00      0.99      1.00       102
         tech       0.95      0.99      0.97        80

     accuracy                           0.97       445
    macro avg       0.97      0.97      0.97       445
 weighted avg       0.97      0.97      0.97       445

TF-IDF Model Performance:
                precision    recall  f1-score   support

     business       0.97      0.95      0.96       115
entertainment       0.98      0.89      0.93        72
     politics       0.90      0.97      0.94        76
        sport       0.98      0.99      0.99       102
         tech       0.95      0.99      0.97        80

     accuracy                           0.96       445
    macro avg       0.96      0.96      0.96       445
 weighted

## Step 9: Making Predictions

In [12]:
custom_text1 = "Artificial intelligence is revolutionizing the tech industry, with companies racing to develop the next big innovation."
print("Input text: ", custom_text1)

# Clean the sentence with the same function used on the dataset, then join
# the tokens back into one string because the vectorizers take raw strings.
processed_custom_text = ' '.join(preprocess_text(custom_text1))
single_document = [processed_custom_text]

# Bag-of-words path: reuse the already-fitted vectorizer, then classify.
custom_text_bow = bow_vectorizer.transform(single_document)
predicted_category_bow = nb_model1.predict(custom_text_bow)
print(f"Predicted Category (BoW): {predicted_category_bow[0]}")

# TF-IDF path: same steps with the TF-IDF vectorizer and its model.
custom_text_tfidf = tfidf_vectorizer.transform(single_document)
predicted_category_tfidf = nb_model2.predict(custom_text_tfidf)
print(f"Predicted Category (TF-IDF): {predicted_category_tfidf[0]}")

Input text:  Artificial intelligence is revolutionizing the tech industry, with companies racing to develop the next big innovation.
Predicted Category (BoW): tech
Predicted Category (TF-IDF): tech
