# <center>Check My Carrots</center>

### Importing the Libraries

In [64]:
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier, SGDClassifier, LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB, GaussianNB
import pickle

### Loading The Data Set

In [2]:
df = pd.read_csv("news_dataset.csv")

In [3]:
df.shape

(28711, 5)

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,content,publication,label
0,0,Muslims BUSTED: They Stole Millions In Gov’t B...,Print They should pay all the back all the mon...,100percentfedup,fake
1,1,Re: Why Did Attorney General Loretta Lynch Ple...,Why Did Attorney General Loretta Lynch Plead T...,100percentfedup,fake
2,2,BREAKING: Weiner Cooperating With FBI On Hilla...,Red State : \nFox News Sunday reported this mo...,100percentfedup,fake
3,3,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,Email Kayla Mueller was a prisoner and torture...,100percentfedup,fake
4,4,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...,100percentfedup,fake


### Data Cleaning

In [5]:
df.dropna(inplace=True)

In [6]:
df.isnull().any()

Unnamed: 0     False
title          False
content        False
publication    False
label          False
dtype: bool

In [7]:
labels=df.label
labels.head()

0    fake
1    fake
2    fake
3    fake
4    fake
Name: label, dtype: object

### Vectorising the Articles

In [10]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)

In [11]:
tfidf_train = tfidf_vectorizer.fit_transform(x_train)

In [12]:
tfidf_test  = tfidf_vectorizer.transform(x_test)

### Building the Models

#### 1. Passive Aggressive Classifier

In [27]:
pac = PassiveAggressiveClassifier(max_iter=50)

In [28]:
pac.fit(tfidf_train,y_train)

PassiveAggressiveClassifier(max_iter=50)

In [29]:
y_pred=pac.predict(tfidf_test)

In [30]:
score_pac = accuracy_score(y_test,y_pred)

In [38]:
print('Accuracy: ' + str(round(score_pac*100,2)) + "%")

Accuracy: 93.96%


In [32]:
confusion_matrix(y_test,y_pred, labels=['fake','real'])

array([[2231,  195],
       [ 143, 3028]])

#### 2. Multinomial Bayes

In [33]:
Bayes_Model = MultinomialNB()

In [34]:
Bayes_Model.fit(tfidf_train,y_train)

MultinomialNB()

In [35]:
score_MNB = Bayes_Model.score(tfidf_test, y_test)

In [39]:
print('Accuracy: ' + str(round(score_MNB*100,2))+ "%")

Accuracy: 70.25%


#### 3. Stochastic Gradient Descent Classifier

In [43]:
SGD_Model = SGDClassifier(max_iter=50)

In [44]:
SGD_Model.fit(tfidf_train,y_train)

SGDClassifier(max_iter=50)

In [46]:
score_SGD = SGD_Model.score(tfidf_test, y_test)

In [47]:
print('Accuracy: ' + str(round(score_SGD*100,2))+ "%")

Accuracy: 93.73%


#### 4. Logistic Regression

In [52]:
Regression_Model = LogisticRegression()

In [53]:
Regression_Model.fit(tfidf_train,y_train)

LogisticRegression()

In [54]:
score_LR = Regression_Model.score(tfidf_test, y_test)

In [55]:
print('Accuracy: ' + str(round(score_LR*100,2))+ "%")

Accuracy: 92.53%


#### 5. Support Vector Machine

In [66]:
SVM_Model = LinearSVC()

In [67]:
SVM_Model.fit(tfidf_train,y_train)

LinearSVC()

In [68]:
score_SVM = SVM_Model.score(tfidf_test, y_test)

In [69]:
print('Accuracy: ' + str(round(score_SVM*100,2))+ "%")

Accuracy: 94.05%


### Score Comparision

In [70]:
data = {'Machine Learning Algorithm': ['Passive Aggressive Classifier', 
                                       'Multinomial Bayes',
                                       'Stochastic Gradient Descent Classifier',
                                       'Logistic Regression',
                                       'Support Vector Machine'],
        'Accuracy': [score_pac,score_MNB,score_SGD,score_LR,score_SVM]
        }

scores = pd.DataFrame(data, columns = ['Machine Learning Algorithm', 'Accuracy'])

In [74]:
scores

Unnamed: 0,Machine Learning Algorithm,Accuracy
0,Passive Aggressive Classifier,0.939611
1,Multinomial Bayes,0.702519
2,Stochastic Gradient Descent Classifier,0.937288
3,Logistic Regression,0.925317
4,Support Vector Machine,0.940504


### Building a Machine Learning Pipeline

In [76]:
pipeline = Pipeline([('tfidf', TfidfVectorizer(stop_words='english')),
                     ('pac', PassiveAggressiveClassifier())])

In [77]:
x_train,x_test,y_train,y_test=train_test_split(df['content'], labels, test_size=0.2, random_state=7)

In [78]:
x_train = x_train.apply(lambda x: np.str_(x))
x_test = x_test.apply(lambda x: np.str_(x))

In [79]:
pipeline.fit(x_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer(stop_words='english')),
                ('pac', PassiveAggressiveClassifier())])

In [80]:
pipeline.score(x_test,y_test)

0.9394318384849026

In [81]:
pred = pipeline.predict(x_test)

In [82]:
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

        fake       0.94      0.92      0.93      2426
        real       0.94      0.96      0.95      3171

    accuracy                           0.94      5597
   macro avg       0.94      0.94      0.94      5597
weighted avg       0.94      0.94      0.94      5597



In [83]:
print(confusion_matrix(y_test,pred))

[[2228  198]
 [ 141 3030]]


### Pickling the Data 

In [84]:
with open('model.pk1','wb') as handle:
    pickle.dump(pipeline, handle, protocol = pickle.HIGHEST_PROTOCOL)

## The End :)