<a href="https://colab.research.google.com/github/modhudeb/SarcasmPredictionAndPipeline/blob/main/sarcasmPrediction_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
from sklearn.metrics import classification_report

### downloads

In [None]:
# Fetch the NLTK resources this notebook relies on: the 'punkt' tokenizer
# models, the stop-word lists, and the WordNet / omw-1.4 corpora
# (presumably required by WordNetLemmatizer -- confirm for your NLTK version).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

# EDA

In [None]:
# Read the headline dataset, using the CSV's 'ID' column as the row index.
data = pd.read_csv(
    'sarcasm_short.csv',
    index_col='ID',
)

In [None]:
# Work on an independent (deep) copy so the frame loaded from disk stays untouched.
df = data.copy(deep=True)

In [None]:
# Preview the first three rows.
df.head(3)

Unnamed: 0_level_0,is_sarcastic,headline,article_link
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,freshman dorm kept cool by 870 fans,https://local.theonion.com/freshman-dorm-kept-...
1,1,father spends joyful afternoon throwing son ar...,https://local.theonion.com/father-spends-joyfu...
2,1,area woman fulfills dream of becoming writer b...,https://local.theonion.com/area-woman-fulfills...


In [None]:
# Drop the source URL; only the headline text is vectorized further down.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after the column is already gone no longer
# raises a KeyError.
df = df.drop(columns='article_link', errors='ignore')

In [None]:
# Class balance: number of headlines per label.
df['is_sarcastic'].value_counts()

1    8634
0    8634
Name: is_sarcastic, dtype: int64

In [None]:
# Show the working frame (label + headline); pandas elides the middle rows.
df

Unnamed: 0_level_0,is_sarcastic,headline
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1,freshman dorm kept cool by 870 fans
1,1,father spends joyful afternoon throwing son ar...
2,1,area woman fulfills dream of becoming writer b...
3,1,"report: only 47,000 social justice milestones ..."
4,1,last living tamagotchi dies in captivity
...,...,...
17263,1,jews to celebrate rosh hashasha or something
17264,1,internal affairs investigator disappointed con...
17265,0,the most beautiful acceptance speech this week...
17266,1,mars probe destroyed by orbiting spielberg-gat...


## preprocessing

In [None]:
# Tokenization: split every headline into a list of word tokens.

tokens = [word_tokenize(headline) for headline in df['headline']]

In [None]:
# Stop-word filtering

# A set gives O(1) membership tests; the list returned by NLTK would be
# scanned linearly for every single token.
stop_words = set(stopwords.words('english'))
filt_tokens = [[word for word in arr if word not in stop_words] for arr in tokens]

In [None]:
# Lemmatization: map each token to its WordNet lemma (lemmatize() treats words
# as nouns by default). This is lemmatization, not stemming.
# The tokens reaching this cell were already stop-word filtered, so the
# filter is not repeated here.

lemm = WordNetLemmatizer()
filt_tokens = [[lemm.lemmatize(word) for word in arr] for arr in filt_tokens]

In [None]:
# Re-join each token list into one space-separated string per headline.
lines = list(map(" ".join, filt_tokens))

In [None]:
# Spot-check a few cleaned headlines.
lines[1:4]

['father spends joyful afternoon throwing son around backyard',
 'area woman fulfills dream becoming writer getting job bookstore',
 'report : 47,000 social justice milestone go u.s. achieves full equality']

In [None]:
# TF-IDF vectorization of the cleaned headlines.
# NOTE(review): the vectorizer is fitted on every headline before the
# train/test split, so the vocabulary and IDF weights also see rows that
# later end up in the test split.

vectorizer = TfidfVectorizer()
vects = vectorizer.fit_transform(lines).toarray()  # sparse matrix -> dense ndarray
vects

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# (number of headlines, number of TF-IDF features)
vects.shape

(17268, 18650)

In [None]:
# One feature name per TF-IDF column.
vectorizer.get_feature_names_out().shape

(18650,)

We now have a TF-IDF vector representation of the `df.headline` column, which will be used as the input features for prediction.

## Splitting

In [None]:
# Model inputs (dense TF-IDF matrix) and the labels to predict.
features, target = vects, df['is_sarcastic']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 40% of the rows for testing; the fixed seed makes the split reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    features,
    target,
    test_size=0.4,
    random_state=42,
)

# Algorithm

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Fit Gaussian Naive Bayes on the training split (fit() returns the estimator
# itself), then predict labels for the held-out rows.
gnb = GaussianNB().fit(xtrain, ytrain)
ypred = gnb.predict(xtest)

In [None]:
# Per-class precision / recall / F1 of the Naive Bayes predictions on the test split.
rep = classification_report(y_true=ytest, y_pred=ypred)

In [None]:
# print() renders the report's line breaks; a bare `rep` would show the escaped string.
print(rep)

              precision    recall  f1-score   support

           0       0.62      0.80      0.70      3437
           1       0.72      0.52      0.60      3471

    accuracy                           0.66      6908
   macro avg       0.67      0.66      0.65      6908
weighted avg       0.67      0.66      0.65      6908



## RandomForest

In [None]:
from sklearn.ensemble import RandomForestClassifier

The following RandomForestClassifier hyperparameters were found through a small amount of experimentation.

In [None]:
# Random forest configured with the hyperparameters chosen above.
rfs = RandomForestClassifier(
    n_estimators=200,
    min_samples_split=9,
    min_samples_leaf=6,
    max_depth=60,
    verbose=1,        # emit joblib progress lines while fitting / predicting
    random_state=42,  # reproducible forest
)

In [None]:
rfs.fit(xtrain,ytrain)
# Score on the held-out split rather than on the rows the forest was trained
# on: training-set metrics are optimistically biased, and the Naive Bayes
# report is computed on xtest / ytest, so this keeps the two comparable.
ypred = rfs.predict(xtest)
rep = classification_report(ytest, ypred)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:  2.4min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    1.8s finished


In [None]:
# print() renders the report's line breaks; a bare `rep` would show the escaped string.
print(rep)

              precision    recall  f1-score   support

           0       0.77      0.86      0.81      5197
           1       0.84      0.74      0.78      5163

    accuracy                           0.80     10360
   macro avg       0.80      0.80      0.80     10360
weighted avg       0.80      0.80      0.80     10360



So we can see that the random forest classifier is working best.

# Pipeline

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

Creating transformers

In [None]:
class PrePros(BaseEstimator, TransformerMixin):
  """Turn raw headlines into dense TF-IDF feature vectors.

  Every text is tokenized with ``word_tokenize``, English stop words are
  removed, the remaining tokens are lemmatized with ``WordNetLemmatizer`` and
  the cleaned text is vectorized with a ``TfidfVectorizer``.

  The vectorizer is fitted once (by ``fit`` / ``fit_transform``) and reused by
  ``transform``, so vectors produced at prediction time share the vocabulary,
  column order and IDF weights learned from the training texts. Words that
  were not seen during fitting are ignored.

  Parameters
  ----------
  columns : optional
      Stored on the instance; not used by the transformation itself.

  Attributes
  ----------
  vectorizer_ : TfidfVectorizer
      The fitted vectorizer (available after fitting).
  arr : array of str
      Vocabulary learned during fitting. Until then it holds the legacy
      placeholder ``['dhuru']``.
  """

  arr = ['dhuru']  # legacy placeholder; rebound to the vocabulary on fit

  def __init__(self, columns = None):
    self.columns = columns


  def _clean(self, X):
    """Return one cleaned, space-joined string per input text.

    A bare ``str`` is treated as a single document so that one sentence can
    be passed directly for prediction.
    """
    texts = [X] if isinstance(X, str) else X
    stop_wordz = set(stopwords.words('english'))  # set => O(1) membership test
    lem = WordNetLemmatizer()
    return [
        " ".join(lem.lemmatize(word)
                 for word in word_tokenize(text)
                 if word not in stop_wordz)
        for text in texts
    ]


  def _fit_vectorizer(self, linez):
    """Fit a fresh TF-IDF vectorizer on cleaned texts; return the sparse matrix."""
    self.vectorizer_ = TfidfVectorizer()
    vect = self.vectorizer_.fit_transform(linez)
    self.arr = self.vectorizer_.get_feature_names_out()
    return vect


  def fit(self, X, y=None):
    """Learn the TF-IDF vocabulary and IDF weights from ``X``.

    ``y`` is ignored. Returns ``self``, as the scikit-learn estimator API
    requires.
    """
    self._fit_vectorizer(self._clean(X))
    return self


  def transform(self,X):
    """Vectorize unfiltered text with the already-fitted vectorizer.

    ``X`` is an iterable of raw strings or a single string. Returns a dense
    2-D array with one row per text and one column per learned vocabulary
    term.

    For backward compatibility, calling this on an instance that has not
    been fitted yet fits it on ``X`` first.
    """
    if not hasattr(self, 'vectorizer_'):
      return self.fit_transform(X)
    return self.vectorizer_.transform(self._clean(X)).toarray()


  def fit_transform(self,X,y=None):
    """Fit on ``X`` and return its dense TF-IDF matrix (cleans ``X`` only once)."""
    return self._fit_vectorizer(self._clean(X)).toarray()

Custom random forest estimator

In [None]:
class Esti_mator(PrePros,BaseEstimator,TransformerMixin):
  """Final pipeline step wrapping a ``RandomForestClassifier``.

  Attributes
  ----------
  RFS : RandomForestClassifier
      The wrapped forest. ``__init__`` creates an unfitted default instance;
      ``fit`` replaces it with the configured, fitted forest.
  rfs : RandomForestClassifier
      Set by ``fit_transform`` to the fitted forest.
  est : object
      Whatever was last passed to ``transform``.
  """

  def __init__(self):
    self.RFS = RandomForestClassifier()

  def fit(self,X,y=None):
    """Train the forest on feature matrix ``X`` and labels ``y``.

    Returns ``self`` (scikit-learn estimator convention); the fitted forest
    is available as ``self.RFS``.
    """
    self.RFS = RandomForestClassifier(n_estimators=200, min_samples_split=6, min_samples_leaf=4,
                             max_depth=40,verbose=1,random_state=42)
    self.RFS.fit(X,y)
    return self

  def transform(self,X):
    """Store ``X`` on ``self.est`` and return it unchanged."""
    self.est = X
    return self.est

  def fit_transform(self,X,y=None):
    """Fit the forest, then pass the fitted forest through ``transform``.

    Uses this instance's own forest; it no longer depends on a notebook-level
    ``rfs`` variable happening to exist.
    """
    self.rfs = self.fit(X,y).RFS
    return self.transform(self.rfs)

  def predict(self,X):
    """Return the forest's predicted labels for feature matrix ``X``."""
    return self.RFS.predict(X)

### importing pipeline

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Chain the text preprocessing / vectorizing step with the random forest step.
ppl = Pipeline(
    steps=[
        ('PreProcessing', PrePros()),
        ('Best_Estimator', Esti_mator()),
    ]
)

In [None]:
# Fit the whole pipeline on every headline.
# (For a quick smoke test, fit on a slice instead, e.g. the first 1000 rows.)
ppl.fit(X=df['headline'], y=df['is_sarcastic'])

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:  2.9min finished


Pipeline(steps=[('PreProcessing', PrePros()), ('Best_Estimator', Esti_mator())])

In [None]:
# Predict for a single raw headline; PrePros.transform accepts a bare string
# and treats it as one document.
ypred = ppl.predict("zuckerberg is an alien")

['000' '000th' '03' ... 'zuckerberg' 'zz' 'éclairs']
['alien' 'zuckerberg']
pred--- [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.0s finished


In [None]:
# Label 1 means sarcastic. Only one headline was predicted, so look at the
# first (and only) element of the prediction array.
print('Sarcastic' if ypred[0] == 1 else "Sarcastic naa")

Saracstic


In [None]:
# Testing with text no. 2475 of the given data.
ypred = ppl.predict("rubio launches new lines of attack against christie")

['000' '000th' '03' ... 'zuckerberg' 'zz' 'éclairs']
['attack' 'christie' 'launch' 'line' 'new' 'rubio']
pred--- [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.0s finished


In [None]:
# Label 1 means sarcastic. Only one headline was predicted, so look at the
# first (and only) element of the prediction array.
print('Sarcastic' if ypred[0] == 1 else "Sarcastic naa")

Sarcastic naa
