In [30]:
import pandas as pd
import numpy as np
import nltk
from nltk import sent_tokenize, word_tokenize

### Loading datasets

In [31]:
training = pd.read_csv("fake_or_real_news_training_prepared_prepared2.csv")
test = pd.read_csv("fake_or_real_news_test_prepared2.csv")

#### Merging Title and Text Columns

In [32]:
training['all_text'] = training.title.str.cat(training.text, sep=' ')

# Keeping only useful columns
train = training[['all_text', 'label']]

train.head()

Unnamed: 0,all_text,label
0,You Can Smell Hillary’s Fear Daniel Greenfield...,FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,FAKE
2,Kerry to go to Paris in gesture of sympathy U....,REAL
3,Bernie supporters on Twitter erupt in anger ag...,FAKE
4,The Battle of New York: Why This Primary Matte...,REAL


In [33]:
test['all_text'] = test.title.str.cat(test.text, sep=' ')

# Dropping ID, title and text columns
columns_to_drop3 = ["title", "text"]
test = test.drop(columns_to_drop3, axis = 1)

### DEALING WITH COLUMN "ALL_TEXT"

### Lower Case

In [34]:
train['all_text'] = train['all_text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
test['all_text'] = test['all_text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
train['all_text'].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


0    you can smell hillary’s fear daniel greenfield...
1    watch the exact moment paul ryan committed pol...
2    kerry to go to paris in gesture of sympathy u....
3    bernie supporters on twitter erupt in anger ag...
4    the battle of new york: why this primary matte...
Name: all_text, dtype: object

### Removing Punctuation

In [35]:
train['all_text'] = train['all_text'].str.replace('[^\w\s]','')
test['all_text'] = test['all_text'].str.replace('[^\w\s]','')
train['all_text'].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


0    you can smell hillarys fear daniel greenfield ...
1    watch the exact moment paul ryan committed pol...
2    kerry to go to paris in gesture of sympathy us...
3    bernie supporters on twitter erupt in anger ag...
4    the battle of new york why this primary matter...
Name: all_text, dtype: object

### Removing StopWords

In [36]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
train['all_text'] = train['all_text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
test['all_text'] = test['all_text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
train['all_text'].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


0    smell hillarys fear daniel greenfield shillman...
1    watch exact moment paul ryan committed politic...
2    kerry go paris gesture sympathy us secretary s...
3    bernie supporters twitter erupt anger dnc trie...
4    battle new york primary matters primary day ne...
Name: all_text, dtype: object

### Lemmatization

In [37]:
# nltk.download('wordnet')
# to install it: pip install -U textblob

from textblob import Word

train['all_text'] = train['all_text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
test['all_text'] = test['all_text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
train['all_text'].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


0    smell hillary fear daniel greenfield shillman ...
1    watch exact moment paul ryan committed politic...
2    kerry go paris gesture sympathy u secretary st...
3    bernie supporter twitter erupt anger dnc tried...
4    battle new york primary matter primary day new...
Name: all_text, dtype: object

In [38]:
y = train.label 
train.drop("label", axis=1)

Unnamed: 0,all_text
0,smell hillary fear daniel greenfield shillman ...
1,watch exact moment paul ryan committed politic...
2,kerry go paris gesture sympathy u secretary st...
3,bernie supporter twitter erupt anger dnc tried...
4,battle new york primary matter primary day new...
5,tehran usa im immigrant grandparent 50 year ag...
6,girl horrified watch boyfriend left facetime s...
7,britain schindler dy 106 czech stockbroker sav...
8,fact check trump clinton commanderinchief foru...
9,iran reportedly make new push uranium concessi...


### Make training and test sets

In [39]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train['all_text'], y, test_size=0.2, random_state=53)

In [40]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

## Naive Bayes

In [41]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),])
text_clf = text_clf.fit(X_train, y_train)

In [42]:
from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
               'tfidf__use_idf': (True, False),
               'clf__alpha': (1e-2, 1e-3),
               }

In [43]:
gridsearch_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gridsearch_clf = gridsearch_clf.fit(X_train, y_train)

In [44]:
gridsearch_clf.best_params_

{'clf__alpha': 0.001, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 1)}

### NaiveBayes Predictions

In [45]:
pred = gridsearch_clf.predict(X_test)
score = accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)

accuracy:   0.897


## SVM

In [46]:
from sklearn.linear_model import SGDClassifier
text_clf_svm = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=5, random_state=42)),
])

_ = text_clf_svm.fit(X_train, y_train)
predicted_svm = text_clf_svm.predict(X_test)
np.mean(predicted_svm == y_test)

0.906801007556675

In [47]:
from sklearn.model_selection import GridSearchCV
parameters_svm = {'vect__ngram_range': [(1, 1), (1, 2)],
                  'tfidf__use_idf': (True, False),
                  'clf-svm__alpha': (1e-2, 1e-3),}
gs_clf_svm = GridSearchCV(text_clf_svm, parameters_svm, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X_train, y_train)

In [48]:
gs_clf_svm.best_params_

{'clf-svm__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

### SVM Predictions

In [49]:
pred2 = gs_clf_svm.predict(X_test)
score2 = accuracy_score(y_test, pred2)
print("accuracy:   %0.3f" % score2)

accuracy:   0.901


## PassiveAggresiveClassifier

In [50]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import PassiveAggressiveClassifier

linear_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('linear_clf', PassiveAggressiveClassifier(max_iter=50)),])
linear_clf = linear_clf.fit(X_train, y_train)

In [51]:
from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
               'tfidf__use_idf': (True, False),
               }

In [52]:
gridsearch_clf2 = GridSearchCV(linear_clf, parameters, n_jobs=-1)
gridsearch_clf2 = gridsearch_clf2.fit(X_train, y_train)

In [53]:
gridsearch_clf2.best_params_

{'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

### PassiveAggresiveClassifier Predictions

In [54]:
pred3 = gridsearch_clf2.predict(X_test)
score3 = accuracy_score(y_test, pred3)
print("accuracy:   %0.3f" % score3)

accuracy:   0.933


# GridSearchCV Best Scores Results

In [55]:
print("Best Score NaiveBayes                 :   %0.4f" % gridsearch_clf.best_score_)
print("Best Score SupportVectorMachine       :   %0.4f" % gs_clf_svm.best_score_)
print("Best Score PassiveAggresiveClassifier :   %0.4f" % gridsearch_clf2.best_score_)

Best Score NaiveBayes                 :   0.9026
Best Score SupportVectorMachine       :   0.9105
Best Score PassiveAggresiveClassifier :   0.9275


# Predictions Summary

In [56]:
print("Accuracy NaiveBayes                 :   %0.4f" % score)
print("Accuracy SupportVectorMachine       :   %0.4f" % score2)
print("Accuracy PassiveAggresiveClassifier :   %0.4f" % score3)

Accuracy NaiveBayes                 :   0.8967
Accuracy SupportVectorMachine       :   0.9005
Accuracy PassiveAggresiveClassifier :   0.9332


### Generating the submission file

In [57]:
prediction_for_submission = gridsearch_clf2.predict(test['all_text'])
submission = pd.DataFrame()
submission['ID'] = test.ID
submission['label'] = prediction_for_submission

submission.to_csv("Assignment_1_submission_Max_and_Louis.csv",index = False)