## import libraries

In [23]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
## NLP libraries to clean the text data
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
## vectorization technique TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer 
## For Splitting the dataset
from sklearn.model_selection import train_test_split
## model libraries 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

#Accuracy measuring library
from sklearn.metrics import accuracy_score


## Loading the data 

In [24]:
df = pd.read_csv("C:\\Users\\sai mohan reddy\\OneDrive\\Desktop\\data.csv")


In [25]:
df.shape

(4009, 4)

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4009 entries, 0 to 4008
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   URLs      4009 non-null   object
 1   Headline  4009 non-null   object
 2   Body      3988 non-null   object
 3   Label     4009 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 125.4+ KB


In [27]:
df.describe()

Unnamed: 0,Label
count,4009.0
mean,0.466949
std,0.498969
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [28]:
df.head()

Unnamed: 0,URLs,Headline,Body,Label
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1


In [29]:


df.tail()

Unnamed: 0,URLs,Headline,Body,Label
4004,http://beforeitsnews.com/sports/2017/09/trends...,Trends to Watch,Trends to Watch\n% of readers think this story...,0
4005,http://beforeitsnews.com/u-s-politics/2017/10/...,Trump Jr. Is Soon To Give A 30-Minute Speech F...,Trump Jr. Is Soon To Give A 30-Minute Speech F...,0
4006,https://www.activistpost.com/2017/09/ron-paul-...,"Ron Paul on Trump, Anarchism & the AltRight",,0
4007,https://www.reuters.com/article/us-china-pharm...,China to accept overseas trial data in bid to ...,SHANGHAI (Reuters) - China said it plans to ac...,1
4008,http://beforeitsnews.com/u-s-politics/2017/10/...,Vice President Mike Pence Leaves NFL Game Beca...,Vice President Mike Pence Leaves NFL Game Beca...,0


In [30]:
df.columns

Index(['URLs', 'Headline', 'Body', 'Label'], dtype='object')

In [31]:

df.isnull().sum()

URLs         0
Headline     0
Body        21
Label        0
dtype: int64

## Data-PreProcessing 

#### 1.Removing the Null Values 
#### 2.Adding a new field 
#### 3.Drop features that are not needed 
#### 4.Text processing


In [32]:
df = df.copy()

### Removing the Null Values

#### As Body field has some empty fields, it can be handled in two ways:

    #### 1)Drop the 21 rows
    #### 2)Replace the null value with a dummy string

#### Here, I will be going with the 2nd option, because although dropping 21 rows would not affect the accuracy, as it is just a minute portion of our large dataset, it is never recommended.

#### I will be replacing the Null(Nan) values in 'Body' field with an empty string ('')

In [33]:
df['Body'] = df['Body'].fillna('')

In [34]:
df.isnull().sum()

URLs        0
Headline    0
Body        0
Label       0
dtype: int64

#### Adding a new column 

In [35]:
df['News'] = df['Headline']+df['Body']

In [36]:
df.head()

Unnamed: 0,URLs,Headline,Body,Label,News
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1,Four ways Bob Corker skewered Donald TrumpImag...
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1,Linklater's war veteran comedy speaks to moder...
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1,Trump’s Fight With Corker Jeopardizes His Legi...
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1,Egypt's Cheiron wins tie-up with Pemex for Mex...
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1,Jason Aldean opens 'SNL' with Vegas tributeCou...


In [37]:
df.columns

Index(['URLs', 'Headline', 'Body', 'Label', 'News'], dtype='object')

### Drop features that are not needed 

In [38]:
features_drop = ['URLs','Headline','Body']
df = df.drop(features_drop,axis=1)

In [39]:
df.columns

Index(['Label', 'News'], dtype='object')

### Text Processing 

#### 1)Remove symbols
#### 2)Remove stopwords
#### 3)Stemming

In [40]:
ps = PorterStemmer()
def wordopt(text):
    text = re.sub('[^a-zA-Z]',' ',text)
    text = text.lower()
    text = text.split()
    text = [ps.stem(word) for word in text if not word in stopwords.words('english')]
    text = ' '.join(text)
    return text 

In [41]:
df['News'] = df['News'].apply(wordopt)

In [42]:
df.head()

Unnamed: 0,Label,News
0,1,four way bob corker skewer donald trumpimag co...
1,1,linklat war veteran comedi speak modern americ...
2,1,trump fight corker jeopard legisl agendath feu...
3,1,egypt cheiron win tie pemex mexican onshor oil...
4,1,jason aldean open snl vega tributecountri sing...


#### splitting Dataset 

In [43]:
X = df['News']
Y = df['Label']

x_train , x_test , y_train , y_test = train_test_split(X,Y,test_size=0.30)

### Vectorization 
#### This is used to handle our text data , by converting it into vectors.

In [44]:
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

## Model Fitting
#### I will be fitting my data onto 3 classifications models 

#### 1]Logistic Regression 
#### 2]SVM 
#### 3]RandomForestClassifier 

In [45]:
LR_model = LogisticRegression()

LR_model.fit(xv_train,y_train)

lr_y_pred = LR_model.predict(xv_test)

score = accuracy_score(y_test,lr_y_pred)
print('Accuracy of LR model is', score)

Accuracy of LR model is 0.9742310889443059
