In [1]:
import re

import numpy as np
import pandas as pd
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC

Our dataset has three columns:
article_link (type: object): contains links to the news articles.
headline (type: object): contains the headlines of the news articles.
is_sarcastic (type: int64): contains 0 (for non-sarcastic text) and 1 (for sarcastic text).

In [26]:
# Load the headline dataset; lines=True reads the file as one JSON object per line.
# NOTE(review): this is an absolute, machine-specific path — adjust it before
# running the notebook anywhere else.
SARCASM_JSON = "/home/mona/Downloads/Sarcasm.json"
data = pd.read_json(SARCASM_JSON, lines=True)
# Preview the first five rows.
print(data.head(5))

                                        article_link  \
0  https://www.huffingtonpost.com/entry/versace-b...   
1  https://www.huffingtonpost.com/entry/roseanne-...   
2  https://local.theonion.com/mom-starting-to-fea...   
3  https://politics.theonion.com/boehner-just-wan...   
4  https://www.huffingtonpost.com/entry/jk-rowlin...   

                                            headline  is_sarcastic  
0  former versace store clerk sues over secret 'b...             0  
1  the 'roseanne' revival catches up to our thorn...             0  
2  mom starting to fear son's web series closest ...             1  
3  boehner just wants wife to listen, not come up...             1  
4  j.k. rowling wishes snape happy birthday in th...             0  


In [3]:
# Per-column check: True means the column holds at least one missing value.
print(data.isna().any())

article_link    False
headline        False
is_sarcastic    False
dtype: bool


In [4]:
# Disabled: converting the 0/1 labels into readable strings at this point.
# The same mapping is applied further down, after the first round of models.
#data["is_sarcastic"] = data["is_sarcastic"].map({0: "Not Sarcasm", 1: "Sarcasm"})
#print(data.head())

In [52]:
# Replace special symbols and digits in the headline column with spaces.
# `re` is Python's regular-expression module; the pattern matches every
# character that is not an ASCII letter.
data['headline'] = [
    re.sub('[^a-zA-Z]', ' ', headline) for headline in data['headline']
]

In [67]:
# Keep only the two columns the models need. `.copy()` makes `data` an
# independent frame, so later column assignments on it do not trigger pandas'
# SettingWithCopyWarning.
data = data[["headline", "is_sarcastic"]].copy()
# Model inputs (headline text) and labels (0 / 1), both as pandas Series.
features = data["headline"]
target = data["is_sarcastic"]

In [68]:
# Stem every word of every headline with NLTK's Porter stemmer:
# split on whitespace, stem each token, then re-join with single spaces.
ps = PorterStemmer()
features = features.apply(
    lambda headline: ' '.join(ps.stem(token) for token in headline.split())
)

In [69]:
# Vectorize the headlines as TF-IDF weights, keeping at most 5000 vocabulary
# terms. TfidfVectorizer is imported in the notebook's top import cell.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split, so its vocabulary and IDF weights also see the test
# headlines; fit on the training split only for an unbiased test score.
tv = TfidfVectorizer(max_features=5000)
# fit_transform accepts the Series directly, so no list() conversion is needed.
# The sparse result is converted to a dense array — presumably because
# GaussianNB, used further down, needs dense input.
features = tv.fit_transform(features).toarray()

In [61]:
# (number of headlines, number of TF-IDF terms)
print(np.shape(features))

(26709, 5000)


In [9]:
# Hold out 5% of the rows for testing; the fixed seed keeps the split reproducible.
features_train, features_test, labels_train, labels_test = train_test_split(
    features,
    target,
    test_size=0.05,
    random_state=0,
)

In [10]:
# Model 1: linear support vector classifier.
# fit() returns the estimator itself, so construction and training are chained.
lsvc = LinearSVC().fit(features_train, labels_train)
# Mean accuracy on the training split, then on the held-out split.
print(lsvc.score(X=features_train, y=labels_train))  # 90.93
print(lsvc.score(X=features_test, y=labels_test))    # 83.75

0.9093524612777362
0.8375748502994012


In [11]:
# Model 2: Gaussian Naive Bayes
gnb = GaussianNB()
gnb.fit(features_train, labels_train)
# Accuracy on the training split (78.86) followed by the test split (73.80).
for split_features, split_labels in (
    (features_train, labels_train),
    (features_test, labels_test),
):
    print(gnb.score(split_features, split_labels))

0.7886335868836952
0.7380239520958084


In [12]:
# Model 3: logistic regression
lr = LogisticRegression().fit(features_train, labels_train)
# Score both splits first, then report them in train / test order.
lr_train_accuracy = lr.score(features_train, labels_train)
lr_test_accuracy = lr.score(features_test, labels_test)
print(lr_train_accuracy)   # 88.16
print(lr_test_accuracy)    # 83.08

0.8816458440074094
0.8308383233532934


In [13]:
# model 4:-
# Using Bernoulli Naive Bayes
bnb = BernoulliNB()
bnb.fit(features_train, labels_train)
# Accuracy on the training split, then on the held-out test split.
print(bnb.score(features_train, labels_train))  # 87.28
print(bnb.score(features_test, labels_test))    # 83.90

0.8728175619753281
0.8390718562874252


In [71]:
# Swap the numeric labels for readable class names
# (0 -> "Not Sarcasm", 1 -> "Sarcasm").
# * `assign` builds a new frame instead of writing a column into `data`, which
#   was produced by a column selection and can trigger pandas'
#   SettingWithCopyWarning on in-place assignment.
# * `replace` leaves values it does not recognise untouched, so running this
#   cell a second time keeps the string labels; `map` would turn them into NaN.
data = data.assign(
    is_sarcastic=data["is_sarcastic"].replace({0: "Not Sarcasm", 1: "Sarcasm"})
)
print(data.head())

                                            headline is_sarcastic
0  former versace store clerk sues over secret  b...  Not Sarcasm
1  the  roseanne  revival catches up to our thorn...  Not Sarcasm
2  mom starting to fear son s web series closest ...      Sarcasm
3  boehner just wants wife to listen  not come up...      Sarcasm
4  j k  rowling wishes snape happy birthday in th...  Not Sarcasm


In [72]:
# Second pipeline: bag-of-words counts on the headline text with string labels.
data = data[["headline", "is_sarcastic"]]
# x holds the headline strings, y the class labels, both as NumPy arrays.
x, y = (np.array(data[column]) for column in ("headline", "is_sarcastic"))

# Learn the vocabulary and build the sparse document-term count matrix.
cv = CountVectorizer()
X = cv.fit_transform(x)
# 80/20 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

In [73]:
# Train a linear SVM on the count features and report held-out accuracy.
model = LinearSVC().fit(X_train, y_train)
print(model.score(X=X_test, y=y_test))

0.8193560464245601


In [77]:
# Read one sentence from the user and classify it with the count-based model.
user = input("Enter a Text: ")
# Apply the same cleaning that was applied to the training headlines
# (every non-letter becomes a space) so the query is tokenised the same way
# the vocabulary was built.
user_clean = re.sub('[^a-zA-Z]', ' ', user)
# Use a dedicated name for the vectorized query: rebinding `data` here would
# replace the headline DataFrame and break any re-run of the earlier cells.
# The sparse matrix is passed to predict() as-is; no dense conversion is needed.
user_vector = cv.transform([user_clean])
output = model.predict(user_vector)
print(output)

Enter a Text: I work 40 hours a week for me to be this poor.
['Sarcasm']
