## Required libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
import joblib
import pickle
import snscrape.modules.twitter as sntwitter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from nltk.stem.porter import PorterStemmer
from sklearn.metrics import accuracy_score, confusion_matrix
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [10]:
# making lists to append all tweets/data to
real = []
satire = []

In [11]:
# using sntwitter to scrape tweets and append data to lists
for i,tweet in enumerate(sntwitter.TwitterSearchScraper('from:cnnbrk').get_items()):
    if i>5000:
        break
    real.append([tweet.content])

In [12]:
for i,tweet in enumerate(sntwitter.TwitterSearchScraper('from:enews').get_items()):
    if i>5000:
        break
    real.append([tweet.content])

In [13]:
real = pd.DataFrame(real)

In [14]:
real.to_csv('/Users/lucashawranke/Documents/data-science/sarcasm-scanner/data/real_tweets.csv') #mac
# real.to_csv('/Users/lucas/data-science/sarcasm-scanner/data/real_tweets.csv') #pc

In [15]:
real

Unnamed: 0,0
0,Nick Cannon is getting some rest after coming ...
1,Team USA eliminated from the World Cup after a...
2,A man has been arrested and charged with murde...
3,Right-wing conspiracy theorist Alex Jones file...
4,"America added a robust 263,000 jobs last month..."
...,...
9997,Someone sedate E! ❤️ Sandra Oh had a mini Grey...
9998,"Tori Spelling's Best Friend and ""Guncle"" Scout..."
9999,Zendaya and More Best Dressed Stars at the 202...
10000,Pete Davidson Makes Surprise Appearance at 202...


In [None]:
# while on my mac
# cnn_tweets = pd.read_csv('/Users/lucashawranke/Documents/data-science/sarcasm-scanner/data/cnn_tweets.csv')
# onion_tweets = pd.read_csv('/Users/lucashawranke/Documents/data-science/sarcasm-scanner/data/onion_tweets.csv')

# while on my pc
real_tweets = pd.read_csv('/Users/lucas/data-science/sarcasm-scanner/data/cnn_tweets.csv')
satire_tweets = pd.read_csv('/Users/lucas/data-science/sarcasm-scanner/data/onion_tweets.csv')

In [22]:
for i,tweet in enumerate(sntwitter.TwitterSearchScraper('from:theonion').get_items()):
    if i>5000:
        break
    satire.append([tweet.content])

In [23]:
for i,tweet in enumerate(sntwitter.TwitterSearchScraper('from:clickhole').get_items()):
    if i>5000:
        break
    satire.append([tweet.content])

In [24]:
satire = pd.DataFrame(satire)

In [25]:
satire.head()

Unnamed: 0,0
0,Zoo Assures Public Escaped Leopard Will Kill T...
1,Chobani Recalls Thousands Of Yogurts That Gave...
2,https://t.co/yDexD4IuWw
3,Determined Ant Requires Second Flicking https:...
4,It Instantly Clear Girlfriend Used To Fuck Guy...


In [27]:
satire.to_csv('/Users/lucashawranke/documents/data-science/sarcasm-scanner/data/satire_tweets.csv') #pc

In [28]:
satire

Unnamed: 0,0
0,Zoo Assures Public Escaped Leopard Will Kill T...
1,Chobani Recalls Thousands Of Yogurts That Gave...
2,https://t.co/yDexD4IuWw
3,Determined Ant Requires Second Flicking https:...
4,It Instantly Clear Girlfriend Used To Fuck Guy...
...,...
9997,Are You A Widow? https://t.co/UWZLPewsKD https...
9998,Michael B. Jordan Said What?! https://t.co/y4F...
9999,Did ‘Sesame Street’ Go Too Far With Its Episod...
10000,Close Fucking Call: This Product That Got A Ne...


In [29]:
train_x = real[:8000]
train_y = satire[:8000]

test_x = real[-2000:]
test_y = satire[-2000:]

In [46]:
real.columns = ['Tweets']
satire.columns = ['Tweets']

In [54]:
real['Authenticity'] = 'real'
satire['Authenticity'] = 'satire'
df = pd.concat([real, satire])

In [56]:
df

Unnamed: 0,Tweets,x,y,Authenticity
0,Nick Cannon is getting some rest after coming ...,-0.978836,-0.143123,real
1,Team USA eliminated from the World Cup after a...,1.521023,0.056310,real
2,A man has been arrested and charged with murde...,1.675625,-0.050077,real
3,Right-wing conspiracy theorist Alex Jones file...,-0.458398,0.702156,real
4,"America added a robust 263,000 jobs last month...",1.077831,0.476888,real
...,...,...,...,...
9997,Are You A Widow? https://t.co/UWZLPewsKD https...,-0.850005,0.144320,satire
9998,Michael B. Jordan Said What?! https://t.co/y4F...,-0.486319,-1.135245,satire
9999,Did ‘Sesame Street’ Go Too Far With Its Episod...,-0.046387,0.584173,satire
10000,Close Fucking Call: This Product That Got A Ne...,0.008049,0.539663,satire


In [59]:
train, test = train_test_split(df, test_size = 0.2, random_state = 42)

### Creating/Running MultinomialNB Model

In [65]:
mnb = MultinomialNB()
vectorizer = TfidfVectorizer(stop_words = 'english')

pipe = Pipeline([('vectorizer', vectorizer), ('nb', mnb)])
pipe.fit(train.Tweets, train.Authenticity)

Pipeline(steps=[('vectorizer', TfidfVectorizer(stop_words='english')),
                ('nb', MultinomialNB())])

In [66]:
predictions = pipe.predict(test.Tweets)
print('accuracy: ', accuracy_score(test['Authenticity'], predictions))
print('confusion matrix: ', confusion_matrix(test['Authenticity'], predictions))

accuracy:  0.9087728067983004
confusion matrix:  [[1925   91]
 [ 274 1711]]


### As you can see, this model has an accuracy of 90.8%. Although we might be able to do better, let's test it out and see what happens:

In [67]:
headline = np.array([["He Must Really Suck: This Kid Just Got Cut From The JV Basketball Team To Make A Spot For Skyler Gisondo"]])
predictions = pipe.predict(headline[0])

print(predictions)

['satire']


### True! This article is from [another](https://clickhole.com/he-must-really-suck-this-kid-just-got-cut-from-the-jv-basketball-team-to-make-a-spot-for-skyler-gisondo/) satire news site.

In [68]:
joblib.dump(pipe, '/Users/lucashawranke/Documents/data-science/sarcasm-scanner/model.pkl') #mac
# joblib.dump(pipe, '/Users/lucas/data-science/sarcasm-scanner/model.pkl') #pc

['/Users/lucashawranke/Documents/data-science/sarcasm-scanner/model.pkl']

In [None]:
#mac
# with open('/Users/lucashawranke/Documents/data-science/sarcasm-scanner/model.pkl', 'wb') as files:
#     pickle.dump(pipe, files)
#pc
# with open('/Users/lucas/data-science/sarcasm-scanner/model.pkl', 'wb') as files:
#     pickle.dump(pipe, files)

In [70]:
svc = LinearSVC()
vectorizer = TfidfVectorizer(stop_words = 'english')

pipe = Pipeline([('vectorizer', vectorizer), ('svc', svc)])
pipe.fit(train.Tweets, train.Authenticity)
predictions = pipe.predict(test.Tweets)
print('accuracy: ', accuracy_score(test['Authenticity'], predictions))
print('confusion matrix:', confusion_matrix(test['Authenticity'], predictions))

accuracy:  0.9492626843289178
confusion matrix: [[1890  126]
 [  77 1908]]


### Saving LinearSVC Model

In [71]:
joblib.dump(pipe, '/Users/lucashawranke/documents/data-science/sarcasm-scanner/model2.pkl') #mac
# joblib.dump(pipe, '/Users/lucas/data-science/sarcasm-scanner/model2.pkl') #pc

['/Users/lucashawranke/documents/data-science/sarcasm-scanner/model2.pkl']