<a href="https://colab.research.google.com/github/maitysuvo19/News-Articles-Classification/blob/main/Real_news_classification_with_tfidf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing necessary libraries

In [None]:
import os
import pandas as pd
import numpy as np
from types import SimpleNamespace

#import feature extraction methods from sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
try:
    from sklearn.feature_extraction import stop_words
except ImportError:
    # The public `stop_words` module was deprecated in scikit-learn 0.22 and removed
    # in 0.24. Keep the old name usable by exposing the public constant through it.
    stop_words = SimpleNamespace(ENGLISH_STOP_WORDS=ENGLISH_STOP_WORDS)

#pre-processing of text
import string
import re

#import classifiers from sklearn
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

#import different metrics to evaluate the classifiers
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report 
from sklearn import metrics

#import time function from time module to track the training duration
from time import time



# Preparing the data

In [None]:
!unzip news.zip -d news

Archive:  news.zip
   creating: news/news article/News Articles/
   creating: news/news article/News Articles/business/
  inflating: news/news article/News Articles/business/001.txt  
  inflating: news/news article/News Articles/business/002.txt  
  inflating: news/news article/News Articles/business/003.txt  
  inflating: news/news article/News Articles/business/004.txt  
  inflating: news/news article/News Articles/business/005.txt  
  inflating: news/news article/News Articles/business/006.txt  
  inflating: news/news article/News Articles/business/007.txt  
  inflating: news/news article/News Articles/business/008.txt  
  inflating: news/news article/News Articles/business/009.txt  
  inflating: news/news article/News Articles/business/010.txt  
  inflating: news/news article/News Articles/business/011.txt  
  inflating: news/news article/News Articles/business/012.txt  
  inflating: news/news article/News Articles/business/013.txt  
  inflating: news/news article/News Articles/bus

In [None]:
# Step 1 - Get the file details
# Parallel lists: entry i of every list describes the same article file.
directory = []
file = []
title = []
text = []
label = []
datapath = '/content/news/news article' 
for dirname, _ , filenames in os.walk(datapath):
    # README.TXT is not an article; it is only present in some of the walked
    # directories, so check for it instead of swallowing every exception.
    if 'README.TXT' in filenames:
        filenames.remove('README.TXT')
    for filename in filenames:
        fullpathfile = os.path.join(dirname,filename)
        # errors='ignore' drops bytes that are not valid UTF-8 instead of raising.
        with open(fullpathfile, 'r', encoding="utf8", errors='ignore') as infile:
            lines = [line.replace('\n','') for line in infile]
        # Append to all five lists together, only after the file was read, so the
        # lists can never get out of step with each other.
        directory.append(dirname)
        file.append(filename)
        # The category is the name of the folder that contains the file.
        label.append(os.path.basename(dirname))
        # First line is the headline. An empty file still gets an (empty) title;
        # previously it got a text entry but no title, shifting later rows.
        title.append(lines[0] if lines else '')
        # Remaining lines form the body, each one prefixed by a single space.
        text.append(''.join(' ' + line for line in lines[1:]))

#    

In [None]:
# One row per article file, assembled from the parallel lists filled while reading.
all_columns = ['directory', 'file', 'title', 'text', 'label']
article_rows = list(zip(directory, file, title, text, label))
fulldf = pd.DataFrame(article_rows, columns=all_columns)

# Working copy restricted to the columns that are used from here on.
df = fulldf[['title', 'text', 'label']].copy()

# Report (rows, columns) of both frames.
print("FullDf : ", fulldf.shape)
print("DF : ", df.shape)

FullDf :  (2225, 5)
DF :  (2225, 3)


In [None]:
# Show the first five rows (title, text, label) as a quick check of the loaded data.
df.head()

Unnamed: 0,title,text,label
0,IBM puts cash behind Linux push,IBM is spending $100m (£52m) over the next t...,tech
1,Musicians 'upbeat' about the net,Musicians are embracing the internet as a wa...,tech
2,Commodore finds new lease of life,The once-famous Commodore computer brand cou...,tech
3,Blogs take on the mainstream,"Web logs or blogs are everywhere, with at le...",tech
4,Players sought for $1m prize,UK gamers are getting a chance to take part ...,tech


In [None]:
# Integer class id assigned to each category (folder) name.
label_ids = {'sport':1, 'business':2,'politics':3,'tech':4,'entertainment':5}

# Derive df from fulldf instead of overwriting df['label'] in place: mapping the
# already-mapped integer column a second time would turn every label into NaN,
# so the in-place version could only be run once per kernel.
df = fulldf[['text']].assign(label=fulldf['label'].map(label_ids))

# A category missing from label_ids maps to NaN; fail here with a clear message
# rather than passing NaN targets on to the classifiers.
unmapped = sorted(set(fulldf.loc[df['label'].isna(), 'label']))
if unmapped:
    raise ValueError("No class id defined for label(s): {}".format(unmapped))
df.head()

Unnamed: 0,text,label
0,IBM is spending $100m (£52m) over the next t...,4
1,Musicians are embracing the internet as a wa...,4
2,The once-famous Commodore computer brand cou...,4
3,"Web logs or blogs are everywhere, with at le...",4
4,UK gamers are getting a chance to take part ...,4


# Text Pre-processing

Typical steps include tokenization, lowercasing, removal of stop words and punctuation, and vectorization. Other processes such as stemming/lemmatization can also be performed. Here, we perform the following steps: removing `<br>` tags, punctuation, numbers, and stop words. While we use sklearn's list of stop words, there are several other stop word lists (e.g., from NLTK), and sometimes a custom stop word list is needed depending on the task.

In [None]:
# sklearn's built-in English stop-word list, taken from its public location.
stopwords = ENGLISH_STOP_WORDS

def clean(doc): #doc is a string of text
    """Normalise one document before it is tokenised by the vectorizer.

    Steps, in order:
      1. replace HTML line-break tags (<br>, <br/>, <br />, </br>) with a space;
      2. lower-case the text;
      3. delete ASCII punctuation characters and digits;
      4. drop whitespace-separated tokens that are English stop words.

    Parameters
    ----------
    doc : str
        Raw text of a single document.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # Match every spelling of the line-break tag; the previous literal "</br>" did
    # not match the "<br/>" form at all.
    doc = re.sub(r"<\s*/?\s*br\s*/?\s*>", " ", doc, flags=re.IGNORECASE)
    # A callable passed as TfidfVectorizer(preprocessor=...) replaces the vectorizer's
    # own lower-casing step, so it has to happen here. Without it "The" is not
    # recognised as a stop word and "Market"/"market" become separate features.
    doc = doc.lower()
    # Remove punctuation and numbers. Characters are deleted, not replaced by a
    # space, so e.g. "once-famous" becomes "oncefamous".
    doc = "".join(char for char in doc
                  if char not in string.punctuation and not char.isdigit())
    # Remove stop words.
    return " ".join(token for token in doc.split() if token not in stopwords)

# Modeling

In [None]:
#Step 1: train-test split
# X holds the text to extract features from; y holds the label to predict.
X, y = df["text"], df["label"]
print(X.shape, y.shape)
# No size is given, so train_test_split uses its default of 75% train / 25% test.
# random_state=1 fixes the shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
for features, targets in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, targets.shape)

(2225,) (2225,)
(1668,) (1668,)
(557,) (557,)


In [None]:
#Step 2-3: Preprocess and Vectorize train and test data
# clean() is handed to the vectorizer as its preprocessor, so it is applied to
# every document before tokenization.
vect = TfidfVectorizer(preprocessor=clean)
# Learn the vocabulary and IDF weights from the training split and encode it.
X_train_dtm = vect.fit_transform(X_train)
# Encode the test split with the vocabulary already learned from the training data.
X_test_dtm = vect.transform(X_test)
print(*(matrix.shape for matrix in (X_train_dtm, X_test_dtm)))
# The second number of each shape is the dimension of the feature vector,
# i.e. the size of the vocabulary learned from the training split.

(1668, 31195) (557, 31195)


Naive Bayes Classifier

In [None]:
#Step 3: Train the classifier and predict for test data
nb = MultinomialNB() #instantiate a Multinomial Naive Bayes model
%time nb.fit(X_train_dtm, y_train)
y_pred_class = nb.predict(X_test_dtm)#make class predictions for X_test_dtm
#calculate evaluation measures:
print("Accuracy: ", accuracy_score(y_test, y_pred_class))
print(classification_report(y_test, y_pred_class))

CPU times: user 10.6 ms, sys: 3.58 ms, total: 14.2 ms
Wall time: 16 ms
Accuracy:  0.9676840215439856
              precision    recall  f1-score   support

           1       0.98      0.99      0.99       118
           2       0.95      0.98      0.96       133
           3       0.95      0.99      0.97       108
           4       0.99      0.95      0.97        93
           5       0.98      0.92      0.95       105

    accuracy                           0.97       557
   macro avg       0.97      0.97      0.97       557
weighted avg       0.97      0.97      0.97       557



Logistic Regression Classifier

In [None]:
# Logistic regression; class_weight="balanced" weights each class inversely to its
# frequency in y_train. fit() returns the estimator, so build and train in one step.
logreg = LogisticRegression(class_weight="balanced").fit(X_train_dtm, y_train)

# Predict a class for every test document.
y_pred_class = logreg.predict(X_test_dtm)

# Overall accuracy followed by per-class precision / recall / F1.
logreg_accuracy = accuracy_score(y_test, y_pred_class)
print("Accuracy: ", logreg_accuracy)
logreg_report = classification_report(y_test, y_pred_class)
print(logreg_report)

Accuracy:  0.9784560143626571
              precision    recall  f1-score   support

           1       1.00      0.99      1.00       118
           2       0.98      0.97      0.97       133
           3       0.99      0.96      0.98       108
           4       0.97      0.98      0.97        93
           5       0.95      0.99      0.97       105

    accuracy                           0.98       557
   macro avg       0.98      0.98      0.98       557
weighted avg       0.98      0.98      0.98       557



Support Vector Machine

In [None]:
# Linear support vector classifier with class weights balanced by class frequency.
# random_state pins the data shuffling done by the dual coordinate-descent solver,
# so repeated runs produce the same model (same seed as the train/test split).
svm = LinearSVC(class_weight='balanced', random_state=1)
svm.fit(X_train_dtm, y_train) #fit the model with training data

#Make predictions on test data
y_pred_class = svm.predict(X_test_dtm)

#calculate evaluation measures: overall accuracy, then per-class precision/recall/F1
print("Accuracy: ", accuracy_score(y_test, y_pred_class))
print(classification_report(y_test, y_pred_class))

Accuracy:  0.9856373429084381
              precision    recall  f1-score   support

           1       1.00      0.99      1.00       118
           2       0.98      0.97      0.97       133
           3       0.99      0.99      0.99       108
           4       0.99      0.99      0.99        93
           5       0.97      0.99      0.98       105

    accuracy                           0.99       557
   macro avg       0.99      0.99      0.99       557
weighted avg       0.99      0.99      0.99       557



Our large feature vector could be introducing a lot of noise in the form of very rarely occurring features that are not useful for learning. Let us configure the TF-IDF vectorizer to keep at most a fixed number of features (`max_features`).

In [None]:
#Step 2-3: Preprocess and Vectorize train and test data
# Same preprocessing as before, but the vocabulary is capped at 5000 terms
# via max_features.
vect = TfidfVectorizer(preprocessor=clean, max_features=5000)
# Learn the capped vocabulary and IDF weights from the training split and encode it.
X_train_dtm = vect.fit_transform(X_train)
# Encode the test split with the vocabulary already learned from the training data.
X_test_dtm = vect.transform(X_test)
print(*(matrix.shape for matrix in (X_train_dtm, X_test_dtm)))
# The second number of each shape is the dimension of the feature vector,
# which max_features limits to at most 5000.

(1668, 5000) (557, 5000)


Naive Bayes Classifier

In [None]:
#Step 3: Train the classifier and predict for test data
nb = MultinomialNB() #instantiate a Multinomial Naive Bayes model
%time nb.fit(X_train_dtm, y_train)
y_pred_class = nb.predict(X_test_dtm)#make class predictions for X_test_dtm
#calculate evaluation measures:
print("Accuracy: ", accuracy_score(y_test, y_pred_class))
print(classification_report(y_test, y_pred_class))

CPU times: user 7.66 ms, sys: 812 µs, total: 8.47 ms
Wall time: 9.87 ms
Accuracy:  0.9694793536804309
              precision    recall  f1-score   support

           1       0.99      0.99      0.99       118
           2       0.96      0.96      0.96       133
           3       0.95      0.98      0.97       108
           4       0.98      0.95      0.96        93
           5       0.97      0.96      0.97       105

    accuracy                           0.97       557
   macro avg       0.97      0.97      0.97       557
weighted avg       0.97      0.97      0.97       557



Logistic Regression Classifier

In [None]:
# Logistic regression on the reduced feature set; class_weight="balanced" weights
# each class inversely to its frequency in y_train. fit() returns the estimator.
logreg = LogisticRegression(class_weight="balanced").fit(X_train_dtm, y_train)

# Predict a class for every test document.
y_pred_class = logreg.predict(X_test_dtm)

# Overall accuracy followed by per-class precision / recall / F1.
logreg_accuracy = accuracy_score(y_test, y_pred_class)
print("Accuracy: ", logreg_accuracy)
logreg_report = classification_report(y_test, y_pred_class)
print(logreg_report)

Accuracy:  0.9748653500897666
              precision    recall  f1-score   support

           1       1.00      0.99      1.00       118
           2       0.97      0.97      0.97       133
           3       0.97      0.97      0.97       108
           4       0.97      0.97      0.97        93
           5       0.96      0.97      0.97       105

    accuracy                           0.97       557
   macro avg       0.97      0.97      0.97       557
weighted avg       0.97      0.97      0.97       557



Support Vector Machine

In [None]:
# Linear support vector classifier on the reduced feature set, with class weights
# balanced by class frequency. random_state pins the data shuffling done by the dual
# coordinate-descent solver, so repeated runs produce the same model.
svm = LinearSVC(class_weight='balanced', random_state=1)
svm.fit(X_train_dtm, y_train) #fit the model with training data

#Make predictions on test data
y_pred_class = svm.predict(X_test_dtm)

#calculate evaluation measures: overall accuracy, then per-class precision/recall/F1
print("Accuracy: ", accuracy_score(y_test, y_pred_class))
print(classification_report(y_test, y_pred_class))

Accuracy:  0.9838420107719928
              precision    recall  f1-score   support

           1       1.00      0.99      1.00       118
           2       0.98      0.97      0.97       133
           3       0.98      1.00      0.99       108
           4       0.99      0.99      0.99        93
           5       0.97      0.97      0.97       105

    accuracy                           0.98       557
   macro avg       0.98      0.98      0.98       557
weighted avg       0.98      0.98      0.98       557

