In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import re
import string

In [3]:
fake_df = pd.read_csv('Fake.csv')
real_df = pd.read_csv('True.csv')

In [4]:
fake_df['class'] = 0 
real_df['class'] = 1

In [5]:
df = pd.concat([fake_df, real_df], ignore_index=True, sort=False)

In [6]:
df = df.drop(["title", "subject", "date"], axis=1)
df.head(5)

Unnamed: 0,text,class
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,"On Friday, it was revealed that former Milwauk...",0
3,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis used his annual Christmas Day mes...,0


#### Randomly shuffling the dataframe 

In [7]:
df = df.sample(frac = 1)
df.head(5)

Unnamed: 0,text,class
10429,"This afternoon, The Washington Post wrote an o...",0
40500,"SOCHI, Russia (Reuters) - Russian President Vl...",1
37116,WASHINGTON (Reuters) - U.S. Secretary of State...,1
6663,"As you probably know by now, Republican presid...",0
32791,WASHINGTON (Reuters) - The United States on We...,1


In [8]:
# Reset index
df.reset_index(inplace = True)
df.drop(["index"], axis = 1, inplace = True)

In [9]:
# Cleaning
def wordopt(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) 
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)    
    return text

df["text"] = df["text"].apply(wordopt)

In [10]:
df.head(5)

Unnamed: 0,text,class
0,this afternoon the washington post wrote an o...,0
1,sochi russia reuters russian president vl...,1
2,washington reuters u s secretary of state...,1
3,as you probably know by now republican presid...,0
4,washington reuters the united states on we...,1


#### Defining dependent and independent variable as x and y

In [11]:
x = df["text"]
y = df["class"]

In [12]:
x.head(5)

0    this afternoon  the washington post wrote an o...
1    sochi  russia  reuters    russian president vl...
2    washington  reuters    u s  secretary of state...
3    as you probably know by now  republican presid...
4    washington  reuters    the united states on we...
Name: text, dtype: object

#### Splitting the dataset into training set and testing set. 

In [13]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

#### Convert text to vectors

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
vectorization = TfidfVectorizer()
x_train = vectorization.fit_transform(x_train)
x_test = vectorization.transform(x_test)

# Logistic Regression

In [17]:
from sklearn.linear_model import LogisticRegression
LR_model = LogisticRegression()
LR_model.fit(x_train,y_train)
LR_model.score(x_test, y_test)

0.9865256124721603

In [21]:
def output_lable(n):
    if n == 0:
        return "Fake News"
    elif n == 1:
        return "Real News"

def manual_testing(news):
    testing_news = {"text":[news]}
    new_def_test = pd.DataFrame(testing_news)
    new_def_test["text"] = new_def_test["text"].apply(wordopt) 
    new_x_test = new_def_test["text"]
    new_x_test = vectorization.transform(new_x_test)

    pred_LR = LR_model.predict(new_x_test)
    
    return LR_model.predict_proba(new_x_test),output_lable(pred_LR[0])

In [22]:
news = str("The head of a conservative Republican faction in the U.S. Congress, who voted this month for")
manual_testing(news)

(array([[0.51485875, 0.48514125]]), 'Fake News')

In [23]:
news = str("Donald Trump Sends Out Embarrassing New Year’s Eve Message; This is Disturbing")
manual_testing(news)

(array([[0.96437253, 0.03562747]]), 'Fake News')

In [18]:
X_new = x_test[3]

prediction = LR_model.predict(x_test)
print(prediction)

if (prediction[0]==0):
  print('The news is Real')
else:
  print('The news is Fake')

[1 0 0 ... 0 0 0]
The news is Fake


In [19]:
predictions_test = pd.DataFrame(LR_model.predict(x_test))
predictions_test_predict_proba = pd.DataFrame(LR_model.predict_proba(x_test))
test_id = pd.DataFrame(x_test["id"])

submission = pd.concat([test_id, predictions_test, predictions_test_predict_proba],axis=1)
submission.columns = ["id","Logistic","proba_0","proba_1"]
submission.to_csv("Logistic.csv",index=False)

IndexError: Index dimension must be <= 2