# NLP Project

### GitHub Link -> https://github.com/mohitgandhi2910

## Import libraries and check out data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import nltk
nltk.download("stopwords")
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mxg172130\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df_newsdata = pd.read_csv("fake_or_real_news.csv")

In [3]:
df_newsdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6335 entries, 0 to 6334
Data columns (total 4 columns):
Unnamed: 0    6335 non-null int64
title         6335 non-null object
text          6335 non-null object
label         6335 non-null object
dtypes: int64(1), object(3)
memory usage: 198.0+ KB


In [4]:
df_newsdata.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [5]:
df_newsdata.drop('Unnamed: 0',axis=1,inplace=True)

In [6]:
df_newsdata['length'] = df_newsdata['text'].apply(len)

In [7]:
df_newsdata.head()

Unnamed: 0,title,text,label,length
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,7518
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,2646
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,2543
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,2660
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL,1840


In [8]:
# Checking how balanced the observation groups are.
df_newsdata.label.value_counts()

REAL    3171
FAKE    3164
Name: label, dtype: int64

In [9]:
# Detailed Description grouped by real news and fake news
df_newsdata.groupby('label').describe()

Unnamed: 0_level_0,length,length,length,length,length,length,length,length
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
FAKE,3164.0,4121.04646,5680.232733,1.0,1283.5,2558.0,5027.0,115372.0
REAL,3171.0,5292.160202,4348.288284,43.0,2729.5,4683.0,6829.5,44039.0


## Train Test Split

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Make training and test sets 
X_train, X_test, y_train, y_test = train_test_split(df_newsdata['text'], df_newsdata['label'], test_size=0.3, random_state=101)

In [12]:
import string
from nltk.corpus import stopwords

In [13]:
stop_words = stopwords.words("english")

In [14]:
def message(mess):
    nopunc = [char for char in mess if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    return [word for word in nopunc.split() if word.lower() not in stop_words]
sample = df_newsdata.head()
sample['text'].apply(message)

0    [Daniel, Greenfield, Shillman, Journalism, Fel...
1    [Google, Pinterest, Digg, Linkedin, Reddit, St...
2    [US, Secretary, State, John, F, Kerry, said, M...
3    [—, Kaydee, King, KaydeeKing, November, 9, 201...
4    [primary, day, New, York, frontrunners, Hillar...
Name: text, dtype: object

In [15]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [16]:
from sklearn.naive_bayes import MultinomialNB

In [17]:
from sklearn.pipeline import Pipeline

In [18]:
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=message)),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB())
])

In [19]:
pipeline.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('bow', CountVectorizer(analyzer=<function message at 0x000000000C268840>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=Non...f=False, use_idf=True)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [20]:
predictions = pipeline.predict(X_test)

## Evaluation of the Model

** Check precision, recall, f1-score using classification report **

In [21]:
from sklearn.metrics import classification_report

In [22]:
print(classification_report(predictions,y_test))

             precision    recall  f1-score   support

       FAKE       0.64      0.99      0.77       607
       REAL       0.99      0.73      0.84      1294

avg / total       0.88      0.81      0.82      1901



# End of Project