In [18]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,plot_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline 
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Load the prepared dataset from the CSV file into a DataFrame.
dataset = pd.read_csv('dataset_modified.csv')

# Keep only the two columns used for modelling:
# `text` is the model input and `label` is the target.
text, label = dataset['text'], dataset['label']

In [3]:
# Preview the first rows to check the available columns and sample values.
dataset.head()

Unnamed: 0.1,Unnamed: 0,title,text,subject,date,label
0,23698,House intel panel chief says did not meet Trum...,WASHINGTON (Reuters) - U.S. House Intelligence...,politicsNews,"March 27, 2017",1.0
1,28188,JUDGE NAPOLITANO: Samsung Allowed British Inte...,.@Judgenap: Samsung allowed British intelligen...,left-news,"Mar 9, 2017",0.0
2,41061,ATLANTA: PANDEMONIUM As World’s Largest Airpor...,There has to something more to this story than...,politics,"Dec 17, 2017",0.0
3,36180,Brazil prosecutors seek to extend Batista dete...,SAO PAULO (Reuters) - The Office of Brazil s P...,worldnews,"September 14, 2017",1.0
4,40515,CHARLES BARKLEY Says Anyone Who Criticizes Oba...,"On March 20, 2016 Barkley had this to say abou...",left-news,"Apr 8, 2016",0.0


In [4]:
# (number of rows, number of columns) of the loaded frame.
dataset.shape


(53458, 6)

In [5]:
# Divide the data into two sets:
#   1. training set (80% of the rows)
#   2. test set     (20% of the rows)
# random_state fixes the shuffle so the same rows land in each set on every
# run; without it every metric computed from this split changes between runs.
x_train, x_test, y_train, y_test = train_test_split(
    text, label, test_size=0.2, random_state=42
)


In [6]:
# TF-IDF (term frequency - inverse document frequency) weights each term by how
# informative it is for a document relative to the whole corpus.
# stop_words='english' removes common English words; max_df=0.7 ignores terms
# that occur in more than 70% of the documents.
tfidf_vectorizer = TfidfVectorizer(max_df=0.7, stop_words='english')

# Learn the vocabulary and IDF weights from the training texts only, then apply
# the same fitted vectorizer to the test texts.
tfidf_train = tfidf_vectorizer.fit_transform(x_train)
tfidf_test = tfidf_vectorizer.transform(x_test)

In [7]:
# Model: Passive-Aggressive classifier, an online linear learner. It leaves its
# weights unchanged when an example is classified with sufficient margin
# ("passive") and updates them otherwise ("aggressive").
# random_state pins the classifier's own shuffling of the training data between
# epochs, so repeated fits on the same data give the same model.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=42)
pac.fit(tfidf_train, y_train)
y_pred = pac.predict(tfidf_test)

# Fraction of correct predictions on the test set, reported as a percentage.
score = accuracy_score(y_test, y_pred)
accuracy = score*100
print(accuracy)

98.16685372240927


In [8]:
# Pipeline: chains the TF-IDF vectorizer and a Multinomial Naive Bayes
# classifier into a single estimator, so raw text can be passed straight to
# fit/predict/score without vectorizing by hand each time.
# English stop words are removed here as well.
# Note: this vectorizer does not set max_df, unlike the standalone one above.

pline = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('nbmodel', MultinomialNB()),
    ]
)

In [9]:
# Fit the whole pipeline (TF-IDF step, then Naive Bayes step) on the raw training texts.

pline.fit(x_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer(stop_words='english')),
                ('nbmodel', MultinomialNB())])

In [10]:
# Score the fitted pipeline on the held-out test set and show it as a percentage.
acc_score = pline.score(x_test, y_test)
accuracy = 100 * acc_score
print('accuracy', accuracy)

accuracy 92.47100635989524


In [11]:
# Report: per-class precision, recall, F1-score and support on the test set.
pred = pline.predict(x_test)
print(classification_report(y_true=y_test, y_pred=pred))

              precision    recall  f1-score   support

         0.0       0.93      0.93      0.93      5518
         1.0       0.92      0.92      0.92      5174

    accuracy                           0.92     10692
   macro avg       0.92      0.92      0.92     10692
weighted avg       0.92      0.92      0.92     10692



In [26]:
# Confusion matrix of the pipeline's test-set predictions
# (scikit-learn convention: rows are true labels, columns are predicted labels).

print(confusion_matrix(y_test, pred))

TypeError: plot_confusion_matrix() got an unexpected keyword argument 'label'

In [None]:
# Serialize the fitted pipeline to disk so the web app can load and use it.
import pickle

with open('model_modified.pkl', 'wb') as model_file:
    pickle.dump(pline, model_file, protocol=pickle.HIGHEST_PROTOCOL)


In [None]:
# Round-trip check: reload the pickled pipeline and score it on the test set.
# The file is opened in a `with` block so the handle is closed once the model
# has been read (a bare open() call would leave it open).
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('model_modified.pkl', 'rb') as handle:
    loaded_model = pickle.load(handle)
result = loaded_model.score(x_test, y_test)
print(result)