### IMPORT ALL MODULES

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import time
import pandas as pd
import numpy as np

### CREATE DATASET

In [5]:
# Read the two source CSV files into separate frames (one per news class;
# the class label itself is attached in a later cell).
fakedataobj = pd.read_csv("newsdataset/Fake.csv")
realdataobj = pd.read_csv("newsdataset/True.csv")

### VISUALISE DATASET

In [6]:
# Show a banner line followed by the first four rows of the real-news frame.
print("DATASET LOOKS LIKE : ", realdataobj.head(4), sep="\n")

DATASET LOOKS LIKE : 
                                               title  \
0  As U.S. budget fight looms, Republicans flip t...   
1  U.S. military to accept transgender recruits o...   
2  Senior U.S. Republican senator: 'Let Mr. Muell...   
3  FBI Russia probe helped by Australian diplomat...   

                                                text       subject  \
0  WASHINGTON (Reuters) - The head of a conservat...  politicsNews   
1  WASHINGTON (Reuters) - Transgender people will...  politicsNews   
2  WASHINGTON (Reuters) - The special counsel inv...  politicsNews   
3  WASHINGTON (Reuters) - Trump campaign adviser ...  politicsNews   

                 date  
0  December 31, 2017   
1  December 29, 2017   
2  December 31, 2017   
3  December 30, 2017   


### APPEND NEW COLUMN TO HOUSE TARGET REAL VS FAKE

In [7]:
# Tag every row with its class so the label survives the merge below.
fakedataobj["Status"] = "Fake"
realdataobj["Status"] = "Real"

# Stack the real rows on top of the fake rows and drop down to a raw 2-D array.
joineddf = pd.concat([realdataobj, fakedataobj])
joinarray = joineddf.values

# Column 0 (`title` in the preview output) is the model input; the last
# column is the "Status" label appended above.
joindata, jointarget = joinarray[:, 0], joinarray[:, -1]

# A fixed random_state keeps the train/test partition repeatable between runs.
xtrain, xtest, ytrain, ytest = train_test_split(
    joindata,
    jointarget,
    random_state=8,
)

### MEASURE OUR RAW DATASET

In [8]:
# Report how many rows each source frame contributes to the full dataset.
print("\nFull Dataset contains the following distrubutions : ")
for frame, label in ((realdataobj, " Real News"), (fakedataobj, " Fake News")):
    print("- ", frame.shape[0], label)


Full Dataset contains the following distrubutions : 
-  21417  Real News
-  23481  Fake News


### VECTORISE COUNTS OF EACH WORD IN DATA

In [9]:
# Learn the vocabulary from the training text only, then encode the training
# split as a sparse word-count matrix using that vocabulary.
countvec = CountVectorizer().fit(xtrain)
xtrainvec = countvec.transform(xtrain)
print("\n",repr(xtrainvec))

# Encode the test split with the same fitted vocabulary. This is done before
# the vocabulary listing so that xtestvec exists even if that step fails.
xtestvec = countvec.transform(xtest)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installs.
if hasattr(countvec, "get_feature_names_out"):
    # .tolist() gives plain Python strings, so the printed slice keeps the
    # same list-of-str appearance as the old API produced.
    words = countvec.get_feature_names_out().tolist()
else:
    words = countvec.get_feature_names()

# Peek at a small alphabetical window of the learned vocabulary.
print(words[2000:2020])


 <33673x19196 sparse matrix of type '<class 'numpy.int64'>'
	with 409456 stored elements in Compressed Sparse Row format>
['bharara', 'bhumibol', 'bi', 'biafra', 'bias', 'biased', 'bibi', 'bible', 'biblical', 'bicker', 'bicycle', 'bid', 'biden', 'bids', 'big', 'bigger', 'biggest', 'biggie', 'bigly', 'bigot']


### LIST ALL MODELS TO BE USED

In [11]:
# Three parallel tables: position i in each list describes the same model.
# The grid-search cells below select a model by index (0 = KNN ... 5 = MLP).
modelnames = [
    "K Nearest Neighbour",
    "Logistic Regression",
    "Decision Tree",
    "Random Forest",
    "Kernel SVC",
    "Neural Network MLP",
]

# random_state is pinned on the estimators that draw random numbers while
# fitting, so repeated runs give the same scores. The value 8 matches the
# seed already used for train_test_split.
modellist = [
    KNeighborsClassifier(n_jobs=4),
    LogisticRegression(max_iter=10000, n_jobs=4),
    DecisionTreeClassifier(random_state=8),
    RandomForestClassifier(n_jobs=4, random_state=8),
    SVC(max_iter=10000),
    MLPClassifier(random_state=8),
]

# Hyper-parameter grid searched for each model, in the same order as above.
param_gridlist = [
    {'n_neighbors': [3, 4, 5, 6, 7]},
    {'C': [0.001, 0.01, 0.1, 1, 10]},
    {'max_depth': [1, 3, 5, 7, 9]},
    {'n_estimators': [2, 4, 6, 8, 10]},
    {'C': [0.001, 0.01, 0.1, 1, 10]},
    {'hidden_layer_sizes': [50]},
]

### BEST OF K NEAREST NEIGHBOURS
#### -takes 6.7 minutes to run!

In [9]:
# Grid-search K Nearest Neighbour (slot 0 of the model tables) with 5-fold CV.
# `start`/`end` bracket the whole cell so the elapsed time can be read later.
start = time.time()
grid = GridSearchCV(
    estimator=modellist[0],
    param_grid=param_gridlist[0],
    cv=5,
    n_jobs=4,
)
grid.fit(xtrainvec, ytrain)

# Report the winning configuration and its cross-validation score.
print("\nBest", modelnames[0], "ML algorithm best parameters : ")
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)

# Score the tuned search object on the data it was fitted on.
print("Train Set Accuracy : ", grid.score(xtrainvec, ytrain), sep="\n")

# Re-encode the held-out split with the fitted vectoriser, then score on it.
xtestvec = countvec.transform(xtest)
print("Test Set Accuracy : ", grid.score(xtestvec, ytest), sep="\n")
end = time.time()


Best K Nearest Neighbour ML algorithm best parameters : 
Best cross-validation score: 0.79
Best parameters:  {'n_neighbors': 4}
Train Set Accuracy : 
0.8990585929379622
Test Set Accuracy : 
0.8038307349665924


### BEST OF LOGISTIC REGRESSIONS
#### -takes 13 secs to run!

In [12]:
# Grid-search Logistic Regression (slot 1 of the model tables) with 5-fold CV.
# `start`/`end` bracket the whole cell so the elapsed time can be read later.
start = time.time()
grid = GridSearchCV(
    estimator=modellist[1],
    param_grid=param_gridlist[1],
    cv=5,
    n_jobs=4,
)
grid.fit(xtrainvec, ytrain)

# Report the winning configuration and its cross-validation score.
print("\nBest", modelnames[1], "ML algorithm best parameters : ")
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)

# Score the tuned search object on the data it was fitted on.
print("Train Set Accuracy : ", grid.score(xtrainvec, ytrain), sep="\n")

# Re-encode the held-out split with the fitted vectoriser, then score on it.
xtestvec = countvec.transform(xtest)
print("Test Set Accuracy : ", grid.score(xtestvec, ytest), sep="\n")
end = time.time()


Best Logistic Regression ML algorithm best parameters : 
Best cross-validation score: 0.96
Best parameters:  {'C': 10}
Train Set Accuracy : 
0.9993169601758085
Test Set Accuracy : 
0.9614253897550111


### BEST OF DECISION TREES
#### -takes 7 secs to run!

In [14]:
# Grid-search the Decision Tree (slot 2 of the model tables) with 5-fold CV.
# `start`/`end` bracket the whole cell so the elapsed time can be read later.
start = time.time()
grid = GridSearchCV(
    estimator=modellist[2],
    param_grid=param_gridlist[2],
    cv=5,
    n_jobs=4,
)
grid.fit(xtrainvec, ytrain)

# Report the winning configuration and its cross-validation score.
print("\nBest", modelnames[2], "ML algorithm best parameters : ")
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)

# Score the tuned search object on the data it was fitted on.
print("Train Set Accuracy : ", grid.score(xtrainvec, ytrain), sep="\n")

# Re-encode the held-out split with the fitted vectoriser, then score on it.
xtestvec = countvec.transform(xtest)
print("Test Set Accuracy : ", grid.score(xtestvec, ytest), sep="\n")
end = time.time()


Best Decision Tree ML algorithm best parameters : 
Best cross-validation score: 0.83
Best parameters:  {'max_depth': 9}
Train Set Accuracy : 
0.8320612953998753
Test Set Accuracy : 
0.8303786191536748


### BEST OF RANDOM FORESTS
#### -takes 47 secs to run!

In [19]:
# Grid-search the Random Forest (slot 3 of the model tables) with 5-fold CV.
# `start`/`end` bracket the whole cell so the elapsed time can be read later.
start = time.time()
grid = GridSearchCV(
    estimator=modellist[3],
    param_grid=param_gridlist[3],
    cv=5,
    n_jobs=4,
)
grid.fit(xtrainvec, ytrain)

# Report the winning configuration and its cross-validation score.
print("\nBest", modelnames[3], "ML algorithm best parameters : ")
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)

# Score the tuned search object on the data it was fitted on.
print("Train Set Accuracy : ", grid.score(xtrainvec, ytrain), sep="\n")

# Re-encode the held-out split with the fitted vectoriser, then score on it.
xtestvec = countvec.transform(xtest)
print("Test Set Accuracy : ", grid.score(xtestvec, ytest), sep="\n")
end = time.time()


Best Random Forest ML algorithm best parameters : 
Best cross-validation score: 0.93
Best parameters:  {'n_estimators': 10}
Train Set Accuracy : 
0.9964363139607401
Test Set Accuracy : 
0.932293986636971


### BEST OF KERNEL SVM CLASSIFIER
#### - takes 13 mins to run on non-scaled data!
#### - run time on scaled data: not yet recorded

In [21]:
# Grid-search the kernel SVC (slot 4 of the model tables) with 5-fold CV.
# `start`/`end` bracket the whole cell so the elapsed time can be read later.
start = time.time()
grid = GridSearchCV(
    estimator=modellist[4],
    param_grid=param_gridlist[4],
    cv=5,
    n_jobs=4,
)
grid.fit(xtrainvec, ytrain)

# Report the winning configuration and its cross-validation score.
print("\nBest", modelnames[4], "ML algorithm best parameters : ")
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)

# Score the tuned search object on the data it was fitted on.
print("Train Set Accuracy : ", grid.score(xtrainvec, ytrain), sep="\n")

# Re-encode the held-out split with the fitted vectoriser, then score on it.
xtestvec = countvec.transform(xtest)
print("Test Set Accuracy : ", grid.score(xtestvec, ytest), sep="\n")
end = time.time()




Best Kernel SVC ML algorithm best parameters : 
Best cross-validation score: 0.97
Best parameters:  {'C': 10}
Train Set Accuracy : 
1.0
Test Set Accuracy : 
0.9691759465478842


### BEST OF NEURAL NETWORK MLP CLASSIFIER
#### - run time: not yet recorded

In [None]:
# Grid-search the MLP classifier (slot 5 of the model tables) with 5-fold CV.
# n_jobs=-1 asks for all available cores, unlike the other searches which use 4.
# `start`/`end` bracket the whole cell so the elapsed time can be read later.
start = time.time()
grid = GridSearchCV(modellist[5], param_gridlist[5], cv=5, n_jobs = -1)
grid.fit(xtrainvec, ytrain)

# Report the winning configuration and its cross-validation score.
print("\nBest", modelnames[5], "ML algorithm best parameters : ")
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)

# Score the tuned search object on the data it was fitted on.
print("Train Set Accuracy : ")
print(grid.score(xtrainvec, ytrain))

# Encode the held-out split here, as every other model cell does, instead of
# relying on an xtestvec left behind by whichever cell happened to run before.
xtestvec = countvec.transform(xtest)
print("Test Set Accuracy : ")
print(grid.score(xtestvec, ytest))
end = time.time()

In [None]:
# Wall-clock seconds taken by whichever timed cell set `start` and `end` last.
elapsed = end - start
print(elapsed)