In [5]:
# %% [markdown]
# # Text Topic Classification
# 
# This Jupyter notebook code snippet demonstrates the setup for a text classification project using NLP and ML libraries. The code imports libraries, preprocesses data, and sets up classification models.

import pandas as pd
import numpy as np
import nltk
import seaborn as sns
import re
import gensim
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import cross_val_score
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from collections import Counter

# Optional: Setup for better visuals
sns.set_theme(style="whitegrid")

# Function to download necessary NLTK resources
def download_nltk_resources():
    resources = ['punkt', 'stopwords', 'averaged_perceptron_tagger', 'wordnet']  # For tokenization, stopwords, POS tagging, and lemmatization
    for resource in resources:
        try:
            nltk.data.find(resource)
        except LookupError:
            nltk.download(resource)
    print('NLTK resources have been downloaded and verified.')

# Call the function to download resources if not already present
download_nltk_resources()

# Load the dataset for Task 1
def load_t1_df(filename):
    df = pd.read_csv(filename, usecols=['par_id', 'paragraph', 'has_entity', 'lexicon_count', 'difficult_words', 'last_editor_gender', 'category'])
    size = df.shape
    if not df.empty:
        print(f"{size} rows and columns (without `text_clarity`) loaded successfully for taks 1.")
    else:
        print("The dataset is empty.")
    return df

# Load the dataset for Task 2
def load_t2_df(filename):
    df = pd.read_csv(filename, usecols=['par_id', 'paragraph', 'has_entity', 'lexicon_count', 'difficult_words', 'last_editor_gender', 'category', 'text_clarity'])
    size = df.shape
    if not df.empty:
        print(f"{size} rows and columns loaded successfully for task 2.")
    else:
        print("The dataset is empty.")
    return df

'''
	- Function usage: df = load_t1_df('filename_with_path')
	- Replace `filename_with_path` with your original value.
'''

df = load_t1_df('dataset.csv')

# Clean the DataFrame
df.dropna(inplace=True)
print(f"Data shape after removing missing values: {df.shape}")

# Process 'has_entity' column and remove rows with 'data missing'
df = df[df['has_entity'] != 'data missing']
print("Removed rows with 'data missing' in the 'has_entity' column")

# Splitting the "has_entity" column into three separate binary columns
df['ORG'] = df['has_entity'].apply(lambda x: 'ORG_YES' in x).astype(int)
df['PRODUCT'] = df['has_entity'].apply(lambda x: 'PRODUCT_YES' in x).astype(int)
df['PERSON'] = df['has_entity'].apply(lambda x: 'PERSON_YES' in x).astype(int)

# Load the Word2Vec model
file_path = './WordEmbeddings/GoogleNews-vectors-negative300.bin'  # Adjust the file path accordingly
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(file_path, binary=True)

# Initialize the stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Function to map NLTK's POS tags to WordNet POS tags
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # Default to noun if unmatched

# Define the function to clean, stem, lemmatize, and preprocess text
def process_text(text):
    # Clean text
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)
    text = re.sub(r'\s+', ' ', text).strip()

    # Tokenize and remove stopwords
    tokens = nltk.word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # Stemming
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]

    # POS tagging on stemmed tokens
    pos_tags = nltk.pos_tag(stemmed_tokens)

    # Lemmatization with POS tagging
    lemmatized_tokens = [lemmatizer.lemmatize(token, get_wordnet_pos(pos)) for token, pos in pos_tags]

    # Combine stemming and lemmatization effects by preferring lemmatized tokens
    final_tokens = lemmatized_tokens  # Optionally, could use stemmed_tokens or a combination based on experimentation

    # Vectorization using Word2Vec
    valid_tokens = [word for word in final_tokens if word in word2vec_model]
    if valid_tokens:
        vector = np.mean([word2vec_model[token] for token in valid_tokens], axis=0)
    else:
        vector = np.zeros(300)  # Assuming Word2Vec vectors are of size 300

    return vector

# Preprocess the paragraphs to vectors
df['vector'] = df['paragraph'].apply(process_text)

# Combine all features into X
X = np.hstack((np.array(df['vector'].tolist()), df[['ORG', 'PRODUCT', 'PERSON']].values))
y = df['category'].str.lower()

# Encode the labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Check if the features are non-negative for MultinomialNB suitability
features_non_negative = (X_train >= 0).all()

# Define the classifiers
classifiers = {
    "SVM": SVC(kernel='linear'),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "MLP": MLPClassifier(hidden_layer_sizes=(100, 100, 100), max_iter=500)
}

# Add MultinomialNB only if features are non-negative
if features_non_negative:
    classifiers["Multinomial Naive Bayes"] = MultinomialNB()
else:
    print("Skipped MultinomialNB due to negative values in the features.")

# Prepare the pipeline for SMOTE and RandomUnderSampler
pipeline = Pipeline([
    ('o', SMOTE(sampling_strategy='auto')),
    ('u', RandomUnderSampler(sampling_strategy='auto'))
])

# Apply SMOTE and Random Under-Sampling conditionally
class_distribution = Counter(y_train)
if min(class_distribution.values()) > 1:
    X_resampled, y_resampled = pipeline.fit_resample(X_train, y_train)
    print("Applied SMOTE and Random Under-Sampling.")
else:
    X_resampled, y_resampled = X_train, y_train
    print("SMOTE not applied due to insufficient samples in a class.")

# Training and evaluating classifiers
for name, clf in classifiers.items():
    clf.fit(X_resampled, y_resampled)
    y_pred = clf.predict(X_test)
    print(f"--- {name} ---")
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(classification_report(y_test, y_pred))

# Cross-validation for each classifier
print("\n--- Cross-Validation Scores ---")
for name, clf in classifiers.items():
    if name != "Multinomial Naive Bayes":  # Cross-validation directly on resampled data for other classifiers
        cv_scores = cross_val_score(clf, X_resampled, y_resampled, cv=5)
        print(f"{name}: CV average score = {np.mean(cv_scores)}")
    else:
        # For MultinomialNB, ensure cross-validation is done on non-negative features if included
        if features_non_negative:
            cv_scores = cross_val_score(clf, X_train, y_train, cv=5)  # Assuming original X_train has only non-negative features
            print(f"{name}: CV average score = {np.mean(cv_scores)}")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\muhammedazhar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\muhammedazhar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\muhammedazhar\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\muhammedazhar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


NLTK resources have been downloaded and verified.
(9347, 7) rows and columns (without `text_clarity`) loaded successfully for taks 1.
Data shape after removing missing values: (9268, 7)
Removed rows with 'data missing' in the 'has_entity' column
Skipped MultinomialNB due to negative values in the features.
Applied SMOTE and Random Under-Sampling.
--- SVM ---
Accuracy: 0.8555976203353164
              precision    recall  f1-score   support

           0       0.75      0.83      0.79       309
           1       0.91      0.85      0.88       596
           2       0.80      0.88      0.84        42
           3       0.83      0.86      0.84       511
           4       0.91      0.88      0.90       391

    accuracy                           0.86      1849
   macro avg       0.84      0.86      0.85      1849
weighted avg       0.86      0.86      0.86      1849

--- Random Forest ---
Accuracy: 0.8312601406165495
              precision    recall  f1-score   support

           0   