In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk

In [2]:
df = pd.read_csv('dataset.csv')

In [3]:
df

Unnamed: 0,text,humor
0,"Joe biden rules out 2020 bid: 'guys, i'm not r...",False
1,Watch: darvish gave hitter whiplash with slow ...,False
2,What do you call a turtle without its shell? d...,True
3,5 reasons the 2016 election feels so personal,False
4,"Pasco police shot mexican migrant from behind,...",False
...,...,...
199995,Conor maynard seamlessly fits old-school r&b h...,False
199996,How to you make holy water? you boil the hell ...,True
199997,How many optometrists does it take to screw in...,True
199998,Mcdonald's will officially kick off all-day br...,False


In [4]:
sample = df.sample(1000)

In [5]:
sample

Unnamed: 0,text,humor
141593,The changing face of china's military: changes...,False
77543,Probably too soon news is robin williams didn'...,True
39358,"Oil embargoes, sherlock holmes, and the russia...",False
167976,How do you make 3 pounds of fat attractive? pu...,True
180677,How do you make a lemon orgasm? you rub its ci...,True
...,...,...
58100,Cher & kathy griffin: 'don't let mitt turn bac...,False
29031,The terrible thought i had during my postpartu...,False
52114,"Gps guide: katie barberi, actress, finds her c...",False
171664,Pope francis arrives in myanmar to navigate di...,False


In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    200000 non-null  object
 1   humor   200000 non-null  bool  
dtypes: bool(1), object(1)
memory usage: 1.7+ MB


In [54]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [55]:
def preprocessing(data):
    data = re.sub('[^A-Za-z]',' ', data)   #remove non-alphabetic
    data = data.lower()        #make it lower-case
    wt = word_tokenize(data)
    wt = [word for word in wt if word not in stopwords.words('english')]   #remove stopwords
    return ' '.join(wt)

In [56]:
preprocessing('this is my email MRUNMAYEE123@gmail.com')

'email mrunmayee gmail com'

In [57]:
sample['text'] = sample['text'].apply(preprocessing)

In [58]:
sample['text']

141593    changing face china military changes pla milit...
77543     probably soon news robin williams commit suici...
39358          oil embargoes sherlock holmes russian butler
167976                make pounds fat attractive put nipple
180677                        make lemon orgasm rub cituris
                                ...                        
58100     cher kathy griffin let mitt turn back time wom...
29031                terrible thought postpartum depression
52114     gps guide katie barberi actress finds center m...
171664    pope francis arrives myanmar navigate diplomat...
104384    knock knock bill bill bill bill worst fucking ...
Name: text, Length: 1000, dtype: object

In [59]:
corpus = []

In [60]:
for sentence in sample['text']:
    corpus.append(sentence)

In [61]:
corpus

['changing face china military changes pla military diplomacy xi jinping',
 'probably soon news robin williams commit suicide apparently ru fi',
 'oil embargoes sherlock holmes russian butler',
 'make pounds fat attractive put nipple',
 'make lemon orgasm rub cituris',
 'chicken cross wife ran street',
 'call sliding home plate green field mexico muchas grassy ass sorry',
 'hillary clinton hits trump administration approach lgbtq issues',
 'surprising ways add color home',
 'beat sister scoping place weeks',
 'difference snow man snow woman snowballs',
 'tonight gon na smoke herb nice guys work crematorium',
 'paul krugman debunked reinhart rogoff paper lot damage video',
 'chinese space station adrift years plummets earth',
 'know rinat akhmetshin lobbyist met donald trump jr',
 'take virtual ride world highest water slide',
 'affordable cities living budget according apartment guide infographic',
 'ken jeong says abc paving way asian visibility tv',
 'michelle obama proudest achievem

In [62]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [63]:
tv_model = TfidfVectorizer()

In [64]:
vectors = tv_model.fit_transform(corpus).toarray()

In [65]:
vectors

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [66]:
X = vectors

In [67]:
sample['humor'] = sample['humor'].astype(int)

In [68]:
y = sample['humor'].replace({'True' : 1, 'False':0})

In [69]:
y

141593    0
77543     1
39358     0
167976    1
180677    1
         ..
58100     0
29031     0
52114     0
171664    0
104384    1
Name: humor, Length: 1000, dtype: int32

In [70]:
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [71]:
from sklearn.model_selection import train_test_split

In [72]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=2)

In [73]:
print(f'X_train:{X_train.shape}, X_test:{X_test.shape}, y_train:{y_train.shape}, y_test:{y_test.shape}')

X_train:(800, 3763), X_test:(200, 3763), y_train:(800,), y_test:(200,)


In [74]:
from sklearn.naive_bayes import MultinomialNB

In [75]:
MNB_model = MultinomialNB()

In [76]:
MNB_model.fit(X_train, y_train)

MultinomialNB()

In [77]:
y_pred = MNB_model.predict(X_test)

In [78]:
y_pred

array([0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1,
       0, 0])

In [79]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [80]:
accuracy_score(y_test, y_pred)

0.795

In [81]:
string = "Why don't scientists trust atoms? Because they make up everything!"

In [82]:
string = preprocessing(string)

In [83]:
string

'scientists trust atoms make everything'

In [84]:
vect = tv_model.transform([string]).toarray()

In [85]:
pred = MNB_model.predict(vect)

In [86]:
pred

array([1])

In [93]:
import pickle

In [94]:
with open('model.pkl', 'wb') as file:
    pickle.dump(MNB_model,file)

In [95]:
with open('model.pkl','rb') as file:
    MNB = pickle.load(file)

In [98]:
MNB

MultinomialNB()