In [34]:
import numpy as np
import pandas as pd
import re
import pandas_profiling
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import precision_recall_fscore_support

In [2]:
train =pd.read_csv('./Data/train.csv' ,  encoding='ISO-8859-1')
train.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didnât Even See Comeyâs...,Darrell Lucus,House Dem Aide: We Didnât Even See Comeyâs...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [3]:
pandas_profiling.ProfileReport(train)

Tab(children=(HTML(value='<div id="overview-content" class="row variable spacing">\n    <div class="row">\n   …



In [4]:
test =pd.read_csv('./Data/test.csv' ,  encoding='ISO-8859-1')
test.head()

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. â After years of scorni..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you donât succeed, try a differe..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...


## Data Exploration

In [5]:
train['label'].value_counts()

1    10413
0    10387
Name: label, dtype: int64

In [6]:
train.head(2)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didnât Even See Comeyâs...,Darrell Lucus,House Dem Aide: We Didnât Even See Comeyâs...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0


In [7]:
train['author'].shape

(20800,)

In [8]:
train['author'].value_counts()

Pam Key                                              243
admin                                                193
Jerome Hudson                                        166
Charlie Spiering                                     141
John Hayward                                         140
                                                    ... 
Deplorable Revboâ¢                                    1
Bashar al-Assad                                        1
Frank Litsky                                           1
Energy Healing-Reiki                                   1
Eric Schmitt, Rukmini Callimachi and Anne Barnard      1
Name: author, Length: 4201, dtype: int64

**Checking whether same author is writing both unreliable and reliable news or not.**

In [9]:
train[train['author'] == "Pam Key"]['label'].value_counts()

0    242
1      1
Name: label, dtype: int64

In [10]:
train[train['author'] == "admin"]['label'].value_counts()

1    193
Name: label, dtype: int64

In [11]:
train[train['author'] == "Jerome Hudson"]['label'].value_counts()

0    166
Name: label, dtype: int64

In [12]:
train[train['author'] == "Charlie Spiering"]['label'].value_counts()

0    141
Name: label, dtype: int64

In [13]:
train[train['author'] == "John Hayward"]['label'].value_counts()

0    140
Name: label, dtype: int64

**By looking at these trends Author name is a very important parameter to identify the target.**
**Let's find out the list of authors who wrote in training data as well as in testing data.**

In [15]:
unique_authors_train = train['author'].unique()
unique_authors_test = test['author'].unique()

In [16]:
len(unique_authors_train)

4202

In [17]:
len(unique_authors_test)

1733

In [18]:
common_authors = set(unique_authors_train).intersection(set(unique_authors_test))

In [45]:
#common_authors

* **Creating a combined text column with the combination of Author , Title and text columns.**

In [20]:
# removing punctuation and non alpha-numeric characters
def text_cleaning(text):
    text = str(text)
    text = re.sub("[^a-zA-Z]", " ", text) 
    return text

train['text'] = train['text'].apply(text_cleaning)
train['title'] = train['title'].apply(text_cleaning)
train['author'] = train['author'].apply(text_cleaning)

test['text'] = test['text'].apply(text_cleaning)
test['title'] = test['title'].apply(text_cleaning)
test['author'] = test['author'].apply(text_cleaning)

In [21]:
test=test.fillna(' ')
train=train.fillna(' ')
test['Combined_text']=test['title']+' '+test['author']+test['text']
train['Combined_text']=train['title']+' '+train['author']+train['text']

In [22]:
train.head(2)

Unnamed: 0,id,title,author,text,label,Combined_text
0,0,House Dem Aide We Didn t Even See Comey s...,Darrell Lucus,House Dem Aide We Didn t Even See Comey s...,1,House Dem Aide We Didn t Even See Comey s...
1,1,FLYNN Hillary Clinton Big Woman on Campus ...,Daniel J Flynn,Ever get the feeling your life circles the rou...,0,FLYNN Hillary Clinton Big Woman on Campus ...


## Model 
1. Creating vector from TFidf transformer using unigrams and Bi-grams.
    * As author names would be an important factor to identify whether article is realistic or fake. So Auther names would easily captured using Bi-grams. And IDF score for these bigrams would be higher.
    
2. Using these vectors, I will apply below models and compare the results.
      * Adaboost Classifier
      * RandomForestClassifier
      * MultinomialNB
      * Logictic Regression 
    

In [23]:
transformer = TfidfTransformer(smooth_idf=False)
count_vectorizer = CountVectorizer(ngram_range=(1, 2))
counts = count_vectorizer.fit_transform(train['Combined_text'].values)
tfidf = transformer.fit_transform(counts)
test_counts = count_vectorizer.transform(test['Combined_text'].values)
test_tfidf = transformer.fit_transform(test_counts)

  idf = np.log(n_samples / df) + 1


In [24]:
#split in samples
targets = train['label'].values
X_train, X_test, y_train, y_test = train_test_split(tfidf, targets, random_state=0)

#### Applying Multiple machine learning algorithm and comparing the results

In [39]:
out = []
Adab= AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),n_estimators=5)
Adab.fit(X_train, y_train)
y_pred = Adab.predict(X_test)
pre , rec , f1 , _ = precision_recall_fscore_support(y_test, y_pred, average='macro')
out.append(["Adaboost classifier",Adab.score(X_train, y_train),Adab.score(X_test, y_test),pre , rec , f1])

In [40]:
## Random Forest classifier
Random= RandomForestClassifier(n_estimators=5)
Random.fit(X_train, y_train)
y_pred = Random.predict(X_test)
pre , rec , f1 , _ = precision_recall_fscore_support(y_test, y_pred, average='macro')
out.append(["RandomForest classifier",Random.score(X_train, y_train),Random.score(X_test, y_test),pre , rec , f1])

In [41]:
## Naive Bayes
NB = MultinomialNB()
NB.fit(X_train, y_train)
y_pred = NB.predict(X_test)
pre , rec , f1 , _ = precision_recall_fscore_support(y_test, y_pred, average='macro')
out.append(["NaiveBayes classifier",NB.score(X_train, y_train),NB.score(X_test, y_test),pre , rec , f1])

In [42]:
## Logistic Regression
logreg = LogisticRegression(C = 100,max_iter = 200)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
pre , rec , f1 , _ = precision_recall_fscore_support(y_test, y_pred, average='macro')
out.append(["LogisticRegression",logreg.score(X_train, y_train),logreg.score(X_test, y_test),pre , rec , f1])

In [43]:
results_df = pd.DataFrame(out,columns = ['Model', 'Training Accuracy','Test Accuracy',"Precision","Recall","F1-Score"])
results_df.head()

Unnamed: 0,Model,Training Accuracy,Test Accuracy,Precision,Recall,F1-Score
0,Adaboost classifier,0.962949,0.954615,0.95465,0.954569,0.954603
1,RandomForest classifier,0.981859,0.808077,0.808256,0.808243,0.808077
2,NaiveBayes classifier,0.875064,0.786346,0.848025,0.789248,0.777568
3,LogisticRegression,1.0,0.973462,0.973578,0.973382,0.973452


Logistic Regression shows the best accuracy score compared to other models. 
Now I am applying Logistic Regression over testing data.

In [31]:
test_vector = count_vectorizer.transform(test['Combined_text'].values)
predictions = logreg.predict(test_vector)
pred=pd.DataFrame(predictions,columns=['label'])
pred['id']=test['id']

In [None]:
## Saving the results in the expected format.

In [33]:
pred[['id','label']].to_csv('submit.csv')