## Read Dataset

In [2]:
import pandas as pd
df = pd.read_csv("uci-news-aggregator.csv")

In [3]:
df.head()

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
1,2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
2,3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550
3,4,"Fed risks falling 'behind the curve', Charles ...",http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793
4,5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027


In [4]:
# remove unwanted column and keep only title and category columns
df = df[['TITLE','CATEGORY']]

In [5]:
df.head()

Unnamed: 0,TITLE,CATEGORY
0,"Fed official says weak data caused by weather,...",b
1,Fed's Charles Plosser sees high bar for change...,b
2,US open: Stocks fall after Fed official hints ...,b
3,"Fed risks falling 'behind the curve', Charles ...",b
4,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,b


In [6]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

"""
since our target labels in text , we need to convert it to numeric values for that 
we can use sklearn label encoder <<<  class1,class2,class3 --> 1,2,3 >>
"""
label_encoder = preprocessing.LabelEncoder()
df['CATEGORY'] =  label_encoder.fit_transform( df['CATEGORY'] )

In [7]:
class_labels = label_encoder.classes_

# devide the dataset into train and test sets
# extract feature array and target labels into two place holders
X , Y =  df['TITLE'] , df['CATEGORY'].values
#splitting the data into 80 and 20 split
train_X, test_X, y_train, y_test = train_test_split( X , Y ,  test_size=0.2, 
                                                    random_state=42, shuffle=True , stratify = Y )

In [8]:
# train test dataset stats
print("FULL Dataset: {}".format( df.shape))
print("TRAIN Dataset: {}".format(len(train_X)))
print("TEST Dataset: {}".format(len(test_X)))

FULL Dataset: (422419, 2)
TRAIN Dataset: 337935
TEST Dataset: 84484


# 1. Model Traning

## a) normalize text

> remove english stop words from sentence

In [9]:
"""
define set of english stopwords , which will not be contain any valueable information for any text analysis task
in case we need to remove them
"""
stopwordlist = ['a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an',
             'and','any','are', 'as', 'at', 'be', 'because', 'been', 'before',
             'being', 'below', 'between','both', 'by', 'can', 'd', 'did', 'do',
             'does', 'doing', 'down', 'during', 'each','few', 'for', 'from',
             'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here',
             'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in',
             'into','is', 'it', 'its', 'itself', 'just', 'll', 'm', 'ma',
             'me', 'more', 'most','my', 'myself', 'now', 'o', 'of', 'on', 'once',
             'only', 'or', 'other', 'our', 'ours','ourselves', 'out', 'own', 're','s', 'same', 'she', "shes", 'should', "shouldve",'so', 'some', 'such',
             't', 'than', 'that', "thatll", 'the', 'their', 'theirs', 'them',
             'themselves', 'then', 'there', 'these', 'they', 'this', 'those',
             'through', 'to', 'too','under', 'until', 'up', 've', 'very', 'was',
             'we', 'were', 'what', 'when', 'where','which','while', 'who', 'whom',
             'why', 'will', 'with', 'won', 'y', 'you', "youd","youll", "youre",
             "youve", 'your', 'yours', 'yourself', 'yourselves']

STOPWORDS = set(stopwordlist)

# function to remove stop words
def cleaning_stopwords(text):
    """
    first split the sentence into words after that check if word inside the stopword list above , if so remove that word
    """
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

> remove punctuations from sentence 

In [10]:
import string
import re
# get most frequent punctuations from english language
english_punctuations = string.punctuation

# function to remove punctuations
def cleaning_punctuations(text):
    chars_clean = []
    for char in text :
      # iterate over every character and check if that belongs to punctuation , is so remove that
      if char not in english_punctuations  :
        chars_clean.append( char )
    # join the chars and restate the sentence
    text_nopunct = "".join( chars_clean ) 
    return text_nopunct

> stemming the words using nltk library

In [11]:
import nltk
from nltk.tokenize import TweetTokenizer

# initialize the poter stemmer 
st = nltk.PorterStemmer()
def stemming_on_text(data):
    # split the sentence into words and then apply stemmer to stem words into it's base 
    text = [st.stem(word) for word in data.split() ]
    return " ".join( text )

> clean numbers from senetnce

In [12]:
# function to remove numbers
def cleaning_numbers(data):
    # replace the number with blank
    return re.sub('[0-9]+', '', data)

In [13]:
def normalize_data( df ):
  """
  using apply method with pandas dataframe we can apply above functions to each column value inside 
  dataframe
  """
  # remove stop wrods
  df= df.apply(lambda text: cleaning_stopwords(text))

  # remove numbers
  df= df.apply(lambda x: cleaning_numbers(x))

  # apply stemmer
  df = df.apply(lambda x: stemming_on_text(x))

  # remove punchuation
  df= df.apply(lambda x: cleaning_punctuations(x) )

  return df

In [17]:
# apply dataset normalization
df_normalized_train = normalize_data( train_X )

In [18]:
# train dataset after normalize
df_normalized_train.head()

311397    appl supplier start make inch inch iphon next ...
277764    astronaut reid wiseman take twitter share life...
68461     possibl to use offic for ipad without offic su...
333723    the dow inch higher strength from disney micro...
385527    ​i think marvel is ly to everybodi about the n...
Name: TITLE, dtype: object

## b) Extract features and build feature set and feature vectorizer

In [19]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

> define the feature vectorization

In [20]:
"""
define the sklearn pipeline 

step 1. user counter vectorizer to vectorize each sentence based on vocabulary
step 2. use TFIDF feature extraction method (https://kavita-ganesan.com/tfidftransformer-tfidfvectorizer-usage-differences/#.YmL99nZByUk)
"""
# define the complete pipeline
feature_extractor = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
               ])

In [21]:
# extract features from train dataset , apply the above sklearn pipeline

X_features = feature_extractor.fit_transform( df_normalized_train )

## c) Use supervised learning algorithm Naïve Bayes & Logistic Regression to build a predictive model

> Naive Bayes Classification Model

In [22]:
# model 1. naive bayers classifier
nb_classifier = MultinomialNB()

# train the model using extracted features and target label set
nb_classifier.fit( X_features , y_train )

# model performance evaluation on train set
y_train_pred_nb = nb_classifier.predict( X_features )

# analyze the results
print('accuracy %s' % accuracy_score( y_train_pred_nb , y_train ))
# print the classification report on train dataset performance
print(classification_report( y_train , y_train_pred_nb  ,target_names = list(class_labels)  , zero_division = 0  ))

accuracy 0.9287022652285203
              precision    recall  f1-score   support

           b       0.90      0.92      0.91     92774
           e       0.95      0.97      0.96    121975
           m       0.97      0.86      0.91     36511
           t       0.91      0.91      0.91     86675

    accuracy                           0.93    337935
   macro avg       0.93      0.91      0.92    337935
weighted avg       0.93      0.93      0.93    337935



> Logistic Regression Model

In [23]:
# model 2. logistic regression classifier
LR_classifier = LogisticRegression( solver = 'lbfgs' , max_iter = 500  )

# train the model using extracted features and target label set
LR_classifier.fit( X_features , y_train )

# model performance evaluation on train set
y_train_pred_lr = LR_classifier.predict( X_features )

# analyze the results
print('accuracy %s' % accuracy_score( y_train_pred_lr , y_train ))
print(classification_report( y_train , y_train_pred_lr  ,target_names = list(class_labels)  , zero_division = 0  ))

accuracy 0.9534259546954296
              precision    recall  f1-score   support

           b       0.93      0.94      0.94     92774
           e       0.97      0.98      0.98    121975
           m       0.97      0.93      0.95     36511
           t       0.94      0.94      0.94     86675

    accuracy                           0.95    337935
   macro avg       0.95      0.95      0.95    337935
weighted avg       0.95      0.95      0.95    337935



# 2. Model Testing

## a) Normalize testing data

In [24]:
# apply normalization on test dataset
df_normalized_test = normalize_data( test_X )

In [25]:
# sample visualization on test dataset after normalize
df_normalized_test.head()

139452    astronom have found first earthsized habit zon...
196479                            video barclay cut job new
91525     us juri hit takeda eli lilli whop us billion p...
228371                     ciara  futur have a new babi boy
79276                      fruit veggi aplenti optim health
Name: TITLE, dtype: object

## b) Extract features using training feature vectorizer

In [26]:
# extract features from test dataset

X_features_test = feature_extractor.transform( df_normalized_test )

## c) Predict the news article class using training model naive bayes and logistic regression

In [27]:
# take predictions from trained model naive bayes classifier
y_test_pred_nb = nb_classifier.predict( X_features_test )

# take predictions from trained model logistic regression
y_test_pred_lr = LR_classifier.predict( X_features_test )

## d) Evaluate model performance

> performance analysis on naive bayes model

In [28]:
# analyze the results
print("Naive Bayes Model Performance Analysis\n")
print('accuracy %s' % accuracy_score( y_test_pred_nb , y_test ))
print(classification_report( y_test , y_test_pred_nb  ,target_names = list(class_labels)  , zero_division = 0  ))

Naive Bayes Model Performance Analysis

accuracy 0.9208844278206524
              precision    recall  f1-score   support

           b       0.89      0.91      0.90     23193
           e       0.94      0.97      0.96     30494
           m       0.97      0.84      0.90      9128
           t       0.90      0.90      0.90     21669

    accuracy                           0.92     84484
   macro avg       0.93      0.90      0.91     84484
weighted avg       0.92      0.92      0.92     84484



> performance analysis on logistic regression model

In [29]:
# analyze the results
print("Logistic Regression Model Performance Analysis\n")
print('accuracy %s' % accuracy_score( y_test_pred_lr , y_test ))
print(classification_report( y_test , y_test_pred_lr  ,target_names = list(class_labels)  , zero_division = 0  ))

Logistic Regression Model Performance Analysis

accuracy 0.9407935230339473
              precision    recall  f1-score   support

           b       0.92      0.92      0.92     23193
           e       0.96      0.98      0.97     30494
           m       0.96      0.91      0.93      9128
           t       0.93      0.92      0.92     21669

    accuracy                           0.94     84484
   macro avg       0.94      0.93      0.94     84484
weighted avg       0.94      0.94      0.94     84484

