<a href="https://colab.research.google.com/github/lordlegacy/ML-API/blob/main/al-jazeera_news_article_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
import spacy
import string
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report


In [22]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [13]:
df1 = pd.read_csv('df.csv')
df2 = pd.read_csv('economy.csv')
df3 = pd.read_csv('tech.csv')


In [14]:
full_data = pd.concat([df1, df2, df3], ignore_index=True).sample(frac=1).reset_index(drop=True)


In [15]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
nlp = spacy.load('en_core_web_sm')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W", " ", text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    # Remove stop words
    tokens = [word for word in tokens if word not in stop_words]
    text = ' '.join(tokens)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text


In [20]:
def lemmatize_text(text):
    doc = nlp(text)
    lemmatized_tokens = [token.lemma_ for token in doc]
    lemmatized_text = ' '.join(lemmatized_tokens)
    return lemmatized_text


In [23]:
full_data['preprocessed_article'] = (full_data['article_content']
                                     .apply(preprocess_text)
                                     .apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
                                     .apply(lemmatize_text))

In [24]:
le = LabelEncoder()
full_data['encoded_category'] = le.fit_transform(full_data['category'])


In [25]:
X_train, X_test, y_train, y_test = train_test_split(full_data['preprocessed_article'], full_data['encoded_category'], test_size=0.2, random_state=42)


In [26]:
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)


In [27]:
naive_bayes_model = MultinomialNB()
naive_bayes_model.fit(X_train_bow, y_train)


In [28]:
y_pred = naive_bayes_model.predict(X_test_bow)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))


Accuracy: 0.8727477477477478
              precision    recall  f1-score   support

           0       0.82      0.82      0.82       305
           1       0.84      0.83      0.83       315
           2       0.97      0.98      0.98       268

    accuracy                           0.87       888
   macro avg       0.88      0.88      0.88       888
weighted avg       0.87      0.87      0.87       888

