Import all necessary libraries and download the resources required by the NLTK (Natural Language Toolkit) library: tokenizers, stop words, sentiment lexicons, named-entity chunkers, and other data that NLTK uses for natural language processing (NLP) tasks.

In [None]:
import os
import pandas as pd
import nltk
import numpy as np
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from string import punctuation
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_selection import SelectKBest, chi2
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# Download the NLTK resources used by this notebook (if not already downloaded).
# The order matches the original sequence of download calls.
for nltk_resource in (
    'punkt',                        # tokenizer models
    'stopwords',                    # stopword corpus
    'vader_lexicon',                # lexicon for the sentiment analyzer
    'maxent_ne_chunker',            # named-entity chunker
    'words',                        # word corpus
    'wordnet',                      # WordNet corpus for the lemmatizer
    'averaged_perceptron_tagger',   # part-of-speech tagger
):
    nltk.download(nltk_resource)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Mounted at /content/drive


This part of the code defines helper functions that perform named entity recognition (NER), sentiment analysis, and text preprocessing.


In [None]:
# Function to perform NER on a given text
def perform_ner(text):
    """Extract named entities from ``text``.

    The text is split into sentences; each sentence is word-tokenized,
    POS-tagged and passed through ``nltk.ne_chunk``.

    Args:
        text: The string to analyse.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, in the order the
        entities are encountered.
    """
    found = []
    for sent in nltk.sent_tokenize(text):
        tagged_words = nltk.pos_tag(word_tokenize(sent))
        for node in nltk.ne_chunk(tagged_words):
            # Only nodes exposing label() are entity chunks; skip the rest.
            if not hasattr(node, 'label'):
                continue
            surface_form = ' '.join(leaf[0] for leaf in node.leaves())
            found.append((surface_form, node.label()))
    return found

# Initialize the VADER sentiment analyzer
sen = SentimentIntensityAnalyzer()


# Function to perform sentiment analysis on a given text
def analyze_sentiment(text):
    """Classify ``text`` as 'Positive', 'Negative' or 'Neutral'.

    The label is derived from the ``compound`` score returned by the
    module-level analyzer ``sen``: scores of 0.05 or above are 'Positive',
    scores of -0.05 or below are 'Negative', anything else is 'Neutral'.

    Args:
        text: The string to score.

    Returns:
        One of the strings 'Positive', 'Negative' or 'Neutral'.
    """
    compound = sen.polarity_scores(text)['compound']
    if compound >= 0.05:
        return 'Positive'
    if compound <= -0.05:
        return 'Negative'
    return 'Neutral'


# Initialize WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# English stopwords, loaded once. Calling stopwords.words('english') inside the
# per-token filter would re-read the list for every token and test membership
# against a list; a frozenset gives constant-time lookups.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


# Define a function for text preprocessing with lemmatization
def preprocess_text(text):
    """Tokenize, lemmatize and filter ``text`` into a normalized string.

    Steps, in order:
      1. Word-tokenize the text.
      2. Lemmatize every token with the module-level ``lemmatizer``.
      3. Keep only alphanumeric tokens whose lowercase form is not an
         English stopword, and lowercase the survivors.
      4. Join the remaining tokens with single spaces.

    Args:
        text: The raw string to preprocess.

    Returns:
        The preprocessed text as a single space-separated string (empty if no
        token survives the filter).
    """
    # Tokenize the text and perform lemmatization
    # NOTE(review): lemmatization runs before lowercasing, so tokens are
    # lemmatized in their original case - confirm this ordering is intended.
    token_data = word_tokenize(text)
    lemma_tokens = [lemmatizer.lemmatize(token) for token in token_data]

    # Remove punctuation and stopwords
    filteredtoken = [
        word.lower()
        for word in lemma_tokens
        if word.isalnum() and word.lower() not in _ENGLISH_STOPWORDS
    ]

    # Join the tokens back into a single string
    return ' '.join(filteredtoken)

In the step below, we define the path to the dataset and initialize an empty list to store the preprocessed data. We then read each file in each category and perform NER, sentiment analysis, and preprocessing on it. The preprocessed text, together with its category and sentiment label, is appended to the list.



In [None]:
# Define the path to the dataset

data_class_path = "C:/Users/c23084426/Downloads/bbc"
data = []
cat = ['tech', 'business', 'sport', 'politics', 'entertainment']

# Dictionary to store NER results for each category
ner_results = {category: [] for category in cat}

# Perform NER, sentiment analysis and preprocessing for each text file in each category
for category in cat:
    cat_path = os.path.join(data_class_path, category)
    # os.listdir() returns entries in arbitrary order; sorting makes the row
    # order of the resulting DataFrame reproducible across machines and runs.
    files = sorted(os.listdir(cat_path))
    for file in files:
        file_path = os.path.join(cat_path, file)
        # Skip sub-directories and other non-file entries that open() cannot read.
        if not os.path.isfile(file_path):
            continue

        # Read the whole document, then release the handle before the
        # (comparatively slow) NLP steps run.
        with open(file_path, 'r', encoding='latin-1') as f:
            text = f.read()

        # NER is run on the raw text
        ner_entities = perform_ner(text)
        ner_results[category].extend(ner_entities)

        # Perform sentiment analysis
        sentiment = analyze_sentiment(text)

        # Preprocess the text before adding to data
        preprocessed_text = preprocess_text(text)

        data.append({'text': preprocessed_text, 'category': category, 'sentiment': sentiment})

# Create a DataFrame from the preprocessed data
df = pd.DataFrame(data)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       2225 non-null   object
 1   category   2225 non-null   object
 2   sentiment  2225 non-null   object
dtypes: object(3)
memory usage: 52.3+ KB


We apply TF-IDF to weight word frequencies and then combine all features (TF-IDF values, sentiment, and NER counts) for feature selection. A chi-squared test is used to keep only the top 1000 features for further processing.

In [None]:
# Feature Extraction using TF-IDF
# max_features limits the vocabulary size (adjust as needed); norm='l2'
# normalizes each document vector.
tfidf_vectorizer = TfidfVectorizer(max_features=1000, norm='l2')
X_tfidf_Vector = tfidf_vectorizer.fit_transform(df['text'])

if X_tfidf_Vector.min() < 0:
    print("Negative values found in TF-IDF features!")

# NOTE(review): df['text'] holds the preprocessed text (lowercased, with
# punctuation and stopwords removed), so NER here does not see the original
# casing or sentence boundaries - confirm whether the entities found on the
# raw text during loading should be used instead.
df['ner_entities'] = df['text'].apply(perform_ner)

# Convert sentiment to numerical values
df['sentiment_numeric'] = df['sentiment'].apply(lambda x: 1 if x == 'Positive' else (-1 if x == 'Negative' else 0))

# chi2 only accepts non-negative features. Clipping the combined matrix at 0
# would turn Negative (-1) into 0 and make it indistinguishable from Neutral,
# so shift the sentiment code to {0, 1, 2} instead. The 'sentiment_numeric'
# column itself keeps its -1/0/1 values.
sentiment_features = df['sentiment_numeric'].values.reshape(-1, 1) + 1

# Get NER feature counts (number of entities found per document)
ner_features = np.array([len(ner) for ner in df['ner_entities']]).reshape(-1, 1)

# Combine features: TF-IDF columns, then sentiment, then NER count.
# Every column is non-negative by construction, so no clipping is applied;
# chi2 itself rejects negative input.
All_combined_features = np.concatenate((X_tfidf_Vector.toarray(), sentiment_features, ner_features), axis=1)

# Apply feature selection using Chi-squared test
# NOTE(review): the vectorizer and the selector are fitted on the full dataset
# before any train/dev/test split, so held-out scores may be optimistic.
k_best_features = 1000  # Specify the number of features to select
# Never request more features than exist (e.g. with a small vocabulary).
selector = SelectKBest(score_func=chi2, k=min(k_best_features, All_combined_features.shape[1]))
selected_features_data = selector.fit_transform(All_combined_features, df['category'])

# Split the data into features (X) and target variable (y)
X = selected_features_data
y = df['category']


The feature matrix produced by the chi-squared test (top 1000 features) is split into training, development, and test sets. We train an SVM classifier on the training set and evaluate it on the training, development, and test sets to calculate accuracy. The high accuracy implies that the classification model works properly in categorizing news articles into the five categories.

In [None]:
# Split the data into training, development, and test sets:
# 70% training, then the remaining 30% is halved into development and test.
# Both splits are stratified on the labels and use a fixed seed.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)
X_dev, X_test, y_dev, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

# Initialize the linear-kernel SVM classifier and train it on the training data
svm_classifier = SVC(kernel='linear', random_state=42)
svm_classifier.fit(X_train, y_train)

# Predictions on the training, development and test sets
train_predictions_class = svm_classifier.predict(X_train)
dev_predictions_class = svm_classifier.predict(X_dev)
test_predictions_class = svm_classifier.predict(X_test)

# Evaluate the performance of the model on each set (per-class report)
for report_heading, true_labels, predicted_labels in (
    ("Classification Report on Training Set:", y_train, train_predictions_class),
    ("Classification Report on Development Set:", y_dev, dev_predictions_class),
    ("Classification Report on Test Set:", y_test, test_predictions_class),
):
    print(report_heading)
    print(classification_report(true_labels, predicted_labels))

# Calculate accuracy on each set
trainaccuracy_ondata = accuracy_score(y_train, train_predictions_class)
devaccuracy_ondata = accuracy_score(y_dev, dev_predictions_class)
testaccuracy_ondata = accuracy_score(y_test, test_predictions_class)

for accuracy_heading, accuracy_value in (
    ("Accuracy on Training Set:", trainaccuracy_ondata),
    ("Accuracy on Development Set:", devaccuracy_ondata),
    ("Accuracy on Test Set:", testaccuracy_ondata),
):
    print(accuracy_heading, accuracy_value)


Classification Report on Training Set:
               precision    recall  f1-score   support

     business       1.00      0.99      1.00       357
entertainment       1.00      1.00      1.00       270
     politics       0.99      1.00      0.99       292
        sport       1.00      1.00      1.00       357
         tech       1.00      1.00      1.00       281

     accuracy                           1.00      1557
    macro avg       1.00      1.00      1.00      1557
 weighted avg       1.00      1.00      1.00      1557

Classification Report on Development Set:
               precision    recall  f1-score   support

     business       0.97      0.97      0.97        76
entertainment       0.96      0.93      0.95        58
     politics       0.95      0.97      0.96        63
        sport       1.00      1.00      1.00        77
         tech       0.98      1.00      0.99        60

     accuracy                           0.98       334
    macro avg       0.97      0.97