# 0. Import library

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import string
#nltk.download('stopwords')
import pickle
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import seaborn as sb

# 1. Dataset preparation

In [2]:
def clean_text(text):
    
    #remove punctuation
    text = [char for char in text if char not in string.punctuation] 
    text_join = ''.join(text)
    
    #remove stopwords
    text_join_clean = [word for word in text_join.split() if word.lower() not in stopwords.words('english')] 
    
    #shorten word to their stem
    stemmer = SnowballStemmer('english')
    stemmed_words = [stemmer.stem(word) for word in text_join_clean]
    text_join_clean = " ".join(stemmed_words)

    
    #return
    return text_join_clean

## Import the Data

In [3]:
news_df = pd.read_csv('Articles.csv',encoding='latin-1')
X = news_df.Article.apply(clean_text)
y = news_df.NewsType

## Split data

In [4]:
import sklearn.model_selection as model_selection
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.33,random_state=42)

In [5]:
import pickle

pickle.dump(X_train, open('X_train.pkl', 'wb'))
pickle.dump(y_train, open('y_train.pkl', 'wb'))

In [6]:
pickle.dump(X_test, open('X_test.pkl', 'wb'))
pickle.dump(y_test, open('y_test.pkl', 'wb'))

In [7]:
import pickle

X_train = pickle.load(open('X_train.pkl', 'rb'))
y_train = pickle.load(open('y_train.pkl', 'rb'))

X_test = pickle.load(open('X_test.pkl', 'rb'))
y_test = pickle.load(open('y_test.pkl', 'rb'))

# 2. Feature Engineering

## Count Vector as features 

In [8]:
#Count Vector as features
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(X_train)

# transform the training and validation data using count vectorizer object
X_train_count = count_vect.transform(X_train)
X_test_count = count_vect.transform(X_test)

## Tf-Idf Vectors as Features

In [9]:
#Tf-Idf Vectors as Features

# word level - we choose max number of words equal to 1000 except all words (100k+ words)
tfidf_vect = TfidfVectorizer(analyzer='word', max_features=1000)
tfidf_vect.fit(X_train) # learn vocabulary and idf from training set

X_train_tfidf =  tfidf_vect.transform(X_train)

# assume that we don't have test set before
X_test_tfidf =  tfidf_vect.transform(X_test)

In [10]:
#Label Encoder
from sklearn import preprocessing
encoder = preprocessing.LabelEncoder()
y_train_n = encoder.fit_transform(y_train)
y_test_n = encoder.fit_transform(y_test)
encoder.classes_

array(['business', 'sports'], dtype=object)

# 3. Build Model

## Naive-Bayes and SVM with Count Vector

In [11]:
from sklearn import naive_bayes,metrics,svm

### Naive-Bayes

In [12]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train_count,y_train)
y_predict_train = clf.predict(X_train_count)
y_predict_test = clf.predict(X_test_count)
#y_predict_test

In [13]:
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score

In [14]:
print(accuracy_score( y_test,y_predict_test))

0.9932508436445444


In [15]:
print(classification_report(y_test,y_predict_test))

              precision    recall  f1-score   support

    business       0.99      1.00      0.99       424
      sports       1.00      0.99      0.99       465

    accuracy                           0.99       889
   macro avg       0.99      0.99      0.99       889
weighted avg       0.99      0.99      0.99       889



In [16]:
cfs = confusion_matrix(y_test,y_predict_test)
# True Positives
TP = cfs[1, 1]
print('True_Positive:' ,TP)
# True Negatives
TN = cfs[0, 0]
print('True_Negative:' ,TN)
# False Positives
FP = cfs[0, 1]
print('False_Positive:' ,FP)
# False Negatives
FN = cfs[1, 0]
print('False_Negative:' ,FN)

True_Positive: 460
True_Negative: 423
False_Positive: 1
False_Negative: 5


### SMV

In [17]:
from sklearn.svm import LinearSVC
model = LinearSVC()
model.fit(X_train_count,y_train)
y_pred = model.predict(X_test_count)
#y_pred



In [18]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

    business       0.99      0.99      0.99       424
      sports       0.99      0.99      0.99       465

    accuracy                           0.99       889
   macro avg       0.99      0.99      0.99       889
weighted avg       0.99      0.99      0.99       889



In [19]:
print(accuracy_score( y_test, y_pred))

0.9898762654668166


In [20]:
cfs = confusion_matrix(y_test,y_pred)
# True Positives
TP = cfs[1, 1]
print('True_Positive:' ,TP)
# True Negatives
TN = cfs[0, 0]
print('True_Negative:' ,TN)
# False Positives
FP = cfs[0, 1]
print('False_Positive:' ,FP)
# False Negatives
FN = cfs[1, 0]
print('False_Negative:' ,FN)

True_Positive: 461
True_Negative: 419
False_Positive: 5
False_Negative: 4


## Naive-Bayes and SMV with tf-idf vectors

### Naive-Bayes

In [21]:
#Naive-Bayes
clf.fit(X_train_tfidf,y_train)
y_predict_train_tfidf = clf.predict(X_train_tfidf)
y_predict_test_tfidf = clf.predict(X_test_tfidf)

In [22]:
print (classification_report(y_test,y_predict_test_tfidf))

              precision    recall  f1-score   support

    business       0.98      0.99      0.99       424
      sports       0.99      0.98      0.99       465

    accuracy                           0.99       889
   macro avg       0.99      0.99      0.99       889
weighted avg       0.99      0.99      0.99       889



In [23]:
print(accuracy_score( y_test, y_predict_test_tfidf))

0.9865016872890888


In [24]:
cfs = confusion_matrix(y_test,y_predict_test_tfidf)
# True Positives
TP = cfs[1, 1]
print('True_Positive:' ,TP)
# True Negatives
TN = cfs[0, 0]
print('True_Negative:' ,TN)
# False Positives
FP = cfs[0, 1]
print('False_Positive:' ,FP)
# False Negatives
FN = cfs[1, 0]
print('False_Negative:' ,FN)

True_Positive: 458
True_Negative: 419
False_Positive: 5
False_Negative: 7


### SMV

In [25]:
#SMV
model.fit(X_train_tfidf,y_train)
y_pred_train_tfidf = model.predict(X_train_tfidf)
y_pred_test_tfidf = model.predict(X_test_tfidf)

In [26]:
print (classification_report(y_test,y_pred_test_tfidf))

              precision    recall  f1-score   support

    business       0.98      1.00      0.99       424
      sports       1.00      0.98      0.99       465

    accuracy                           0.99       889
   macro avg       0.99      0.99      0.99       889
weighted avg       0.99      0.99      0.99       889



In [27]:
print(accuracy_score( y_test, y_pred_test_tfidf))

0.9898762654668166


In [28]:
cfs = confusion_matrix(y_test,y_pred_test_tfidf)
# True Positives
TP = cfs[1, 1]
print('True_Positive:' ,TP)
# True Negatives
TN = cfs[0, 0]
print('True_Negative:' ,TN)
# False Positives
FP = cfs[0, 1]
print('False_Positive:' ,FP)
# False Negatives
FN = cfs[1, 0]
print('False_Negative:' ,FN)

True_Positive: 458
True_Negative: 422
False_Positive: 2
False_Negative: 7



# 4. Evaluate the model with Cross-validation

## Cross-validation with CountVectorizer

In [29]:
 from sklearn.model_selection import KFold, cross_val_score
 kf = KFold(n_splits=5, shuffle=False)
# print(kf)
 count_vect.fit(X)
 X_count = count_vect.transform(X)
 scores1 = cross_val_score(clf, X_count, y, cv=10, scoring='accuracy')
 scores2 = cross_val_score(model, X_count, y, cv=10, scoring='accuracy')
 print('cross_val_score for naive-bayes :',scores1)
 print('cross_val_score for SVM:',scores2)



cross_val_score for naive-bayes : [0.9962963  1.         0.9962963  1.         0.99256506 0.98513011
 0.99628253 0.99256506 0.99626866 0.98880597]
cross_val_score for SVM: [0.9962963  1.         0.9962963  1.         0.99256506 0.99628253
 0.98141264 0.98513011 0.99626866 0.99626866]


## Cross-validation with TfidfVectorizer

In [30]:
tfidf_vect.fit(X) 
X_tfidf =  tfidf_vect.transform(X)
scores1 = cross_val_score(clf, X_tfidf, y, cv=10, scoring='accuracy')
scores2 = cross_val_score(model, X_tfidf, y, cv=10, scoring='accuracy')
print('cross_val_score for naive-bayes :',scores1)
print('cross_val_score for SVM:',scores2)

cross_val_score for naive-bayes : [0.9962963  1.         0.99259259 0.99259259 0.98513011 0.98513011
 0.98141264 0.98884758 0.99253731 0.98134328]
cross_val_score for SVM: [0.9962963  1.         0.9962963  0.99259259 0.98513011 0.99256506
 0.98884758 0.99256506 0.99626866 0.98880597]
