In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import re
import nltk
from nltk import word_tokenize 
from nltk.util import ngrams
from nltk import bigrams
from tqdm import tqdm
import string  
import unicodedata
from nltk.corpus import stopwords
# add appropriate words that will be ignored in the analysis
ADDITIONAL_STOPWORDS = ['covfefe']
import spacy
nlp = spacy.load('en_core_web_sm')
stopword = nltk.corpus.stopwords.words('english')
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import sent_tokenize
from scipy.sparse import csr_matrix
from nltk import pos_tag


In [2]:
# Load the train and test frames from the intermediate pickle files.
# NOTE: pickle.load can execute arbitrary code stored in the file, so these
# files must come from a trusted source.
# NOTE(review): the cells below read the 'review' and 'label' columns of both
# frames; the rest of the schema is not visible here.
with open('intermediate_files/train_data_for_amazon_sarcasm_detection.pkl', "rb") as fh:
    dft = pickle.load(fh)
with open('intermediate_files/test_data_for_amazon_sarcasm_detection.pkl', "rb") as fh:
    dftest = pickle.load(fh)

# Training Part

## Bigram and Trigram Sets and Weights

In [3]:
def extract_pos(dft):
    """Attach a space-joined POS-tag string to every review.

    Each value of ``dft['review']`` is converted with ``str`` and split into
    sentences. Every sentence is lower-cased, stripped of everything except
    ASCII letters and spaces, and tokenized. Each token is then tagged on its
    own, and the tags of the whole review are joined with single spaces.

    The result is written to ``dft['pos_tags']`` on the frame that was passed
    in (no copy is made) and that same frame is returned.

    NOTE(review): tokens are tagged one at a time, so the tagger never sees
    the neighbouring words. Tagging a whole sentence per call would give it
    context, but it would change the generated features.
    """
    tag_strings = []
    for review in tqdm(dft['review']):
        review_tags = [
            pos_tag(word_tokenize(token))[0][1]
            for sentence in sent_tokenize(str(review))
            for token in word_tokenize(re.sub(r"[^A-Za-z ]", '', sentence.lower()))
        ]
        tag_strings.append(' '.join(review_tags))

    dft['pos_tags'] = tag_strings
    return dft

In [4]:
def frequency_counter(df,word_gram,pos):
    """Count how often every n-gram of size ``word_gram`` occurs in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``'pos_tags'`` column when ``pos`` is truthy, otherwise a
        ``'review'`` column. The frame is only read, never modified.
    word_gram : int
        n-gram size; used for both ends of ``CountVectorizer``'s
        ``ngram_range``.
    pos : bool
        If truthy, every ``'pos_tags'`` string is one document. Otherwise
        every review is converted with ``str`` and split by
        ``sent_tokenize``, and each sentence is one document, so n-grams
        never cross a sentence boundary.

    Returns
    -------
    dict
        n-gram string -> total count (Python ``int``) over all documents,
        inserted in descending ``(count, n-gram)`` order.

    Raises
    ------
    ValueError
        From ``CountVectorizer.fit_transform`` when no n-gram can be
        extracted at all (empty vocabulary).

    NOTE(review): raw sentences are vectorized here, whereas
    ``calc_word_gram_sets_and_scores`` strips every non-letter character
    before extracting a row's n-grams. Tokens containing digits or
    apostrophes can therefore produce row n-grams that are absent from this
    table.
    """
    c_vec = CountVectorizer(ngram_range=(word_gram,word_gram))

    if pos:
        documents = list(df['pos_tags'])
    else:
        documents = [
            sent
            for doc in df['review']
            for sent in sent_tokenize(str(doc))
        ]

    doc_term = c_vec.fit_transform(documents)

    # Column sums = corpus-wide count per n-gram. ravel() keeps the result
    # 1-D even when the vocabulary holds a single n-gram (squeeze() would
    # collapse it to a 0-d array that cannot be indexed); tolist() yields
    # plain Python ints.
    totals = np.asarray(doc_term.sum(axis=0)).ravel().tolist()

    ranked = sorted(
        ((totals[idx], gram) for gram, idx in c_vec.vocabulary_.items()),
        reverse=True,
    )
    return {gram: count for count, gram in ranked}

def calc_word_gram_sets_and_scores(df,word_gram,pos=False):
    """Return every row's distinct n-grams and a label-conditioned weight.

    For each row the distinct n-grams of size ``word_gram`` are extracted in
    order of first appearance. With ``pos`` truthy the row's ``'pos_tags'``
    string is used as is; otherwise the ``'review'`` text is split into
    sentences, lower-cased and stripped of everything except ASCII letters
    and spaces, and n-grams are taken per sentence.

    The weight of a row is::

        sum(class_count[g]) / sum(all_count[g])

    over the row's n-grams ``g`` that appear in the class table, where the
    class table is built from the ``label == 1`` rows when the row's label
    is 1 and from the ``label == 0`` rows otherwise. n-grams missing from
    the class table are skipped. A row with no matching n-gram (for
    example, text shorter than ``word_gram`` tokens) gets weight ``0.0``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``'label'`` column plus ``'pos_tags'`` (``pos`` truthy) or
        ``'review'``.
    word_gram : int
        n-gram size.
    pos : bool, default False
        Work on POS-tag strings instead of review text.

    Returns
    -------
    (list[list[str]], list[float])
        Per-row n-gram lists and per-row weights, in row order.

    Raises
    ------
    ValueError
        Propagated from ``frequency_counter`` when one of the three
        frequency tables cannot be built (e.g. a class has no n-grams).
    """
    required = 'pos_tags' if pos else 'review'

    all_freq = frequency_counter(df, word_gram, pos)
    sarc_freq = frequency_counter(df.loc[df['label']==1], word_gram, pos)
    non_sarc_freq = frequency_counter(df.loc[df['label']==0], word_gram, pos)

    # One shared analyzer (preprocess + tokenize + n-grams). It returns an
    # empty list for text with too few tokens, where fitting a fresh
    # vectorizer on the row would raise "empty vocabulary".
    analyzer = CountVectorizer(ngram_range=(word_gram,word_gram)).build_analyzer()

    word_gram_set_list=[]
    word_gram_score=[]

    for doc,label in tqdm(zip(df[required],df['label']), total=len(df)):
        if pos:
            doc_sentences=[doc]
        else:
            doc_sentences=[
                re.sub(r"[^A-Za-z ]",'',sent.lower())
                for sent in sent_tokenize(str(doc))
            ]

        # dict.fromkeys de-duplicates while keeping first-appearance order.
        doc_grams=list(dict.fromkeys(
            gram for sent in doc_sentences for gram in analyzer(sent)
        ))

        class_freq = sarc_freq if label==1 else non_sarc_freq
        tf_doc_sum=0
        tf_all_sum=0
        for gram in doc_grams:
            # The cleaned row text can yield n-grams the frequency tables
            # never saw; those contribute nothing to either sum.
            if gram not in class_freq:
                continue
            tf_doc_sum+=class_freq[gram]
            tf_all_sum+=all_freq.get(gram,0)

        word_gram_set_list.append(doc_grams)
        # Guard the ratio: with no matching n-gram both sums are zero.
        word_gram_score.append(tf_doc_sum/tf_all_sum if tf_all_sum else 0.0)
    return word_gram_set_list,word_gram_score

In [5]:
def bi_tri_df(dft):
    """Add bigram/trigram sets and weights (word and POS) to ``dft``.

    Columns are added to the frame that was passed in, in this order:
    ``bigram_set``, ``bigram_wt``, ``trigram_set``, ``trigram_wt``,
    ``bigram_posSet``, ``bigram_posWt``, ``trigram_posSet``,
    ``trigram_posWt``. The temporary ``pos_tags`` column created by
    ``extract_pos`` is dropped again before the same frame is returned.
    """
    word_targets = ((2, 'bigram_set', 'bigram_wt'), (3, 'trigram_set', 'trigram_wt'))
    pos_targets = ((2, 'bigram_posSet', 'bigram_posWt'), (3, 'trigram_posSet', 'trigram_posWt'))

    # Word-level n-grams.
    for size, set_col, wt_col in word_targets:
        gram_sets, gram_weights = calc_word_gram_sets_and_scores(dft, size)
        dft[set_col] = gram_sets
        dft[wt_col] = gram_weights

    # POS-level n-grams; extract_pos writes 'pos_tags' onto dft itself.
    tagged = extract_pos(dft)
    for size, set_col, wt_col in pos_targets:
        gram_sets, gram_weights = calc_word_gram_sets_and_scores(tagged, size, pos=True)
        dft[set_col] = gram_sets
        dft[wt_col] = gram_weights

    dft.drop(columns='pos_tags', inplace=True)
    return dft

In [6]:
# Build the training features. bi_tri_df adds its columns to dft itself and
# returns that same frame, so wt_bi_tri and dft refer to one object.
wt_bi_tri = bi_tri_df(dft)

652it [00:01, 467.10it/s]
652it [00:01, 497.66it/s]
100%|██████████| 652/652 [00:43<00:00, 14.89it/s]
100%|██████████| 652/652 [00:00<00:00, 115091.38it/s]
100%|██████████| 326/326 [00:00<00:00, 340135.10it/s]
100%|██████████| 326/326 [00:00<00:00, 123284.02it/s]
652it [00:00, 924.07it/s]
100%|██████████| 652/652 [00:00<00:00, 755229.55it/s]
100%|██████████| 326/326 [00:00<00:00, 641342.92it/s]
100%|██████████| 326/326 [00:00<00:00, 435876.03it/s]
652it [00:00, 767.15it/s] 


In [7]:
# Display the training frame with the n-gram set and weight columns.
wt_bi_tri

Unnamed: 0,review,rating,label,bigram_set,bigram_wt,trigram_set,trigram_wt,bigram_posSet,bigram_posWt,trigram_posSet,trigram_posWt
0,I simply had no idea how bad it is! I am a sta...,5.0,0,"[simply had, had no, no idea, idea how, how ba...",0.597990,"[simply had no, had no idea, no idea how, idea...",0.887160,"[nn rb, rb vbd, vbd dt, dt nn, nn wrb, wrb jj,...",0.530869,"[nn rb vbd, rb vbd dt, vbd dt nn, dt nn wrb, n...",0.534252
1,A searing indictment... This book is sure to b...,5.0,0,"[searing indictment, this book, book is, is su...",0.572722,"[this book is, book is sure, is sure to, sure ...",0.792049,"[dt vbg, vbg nn, nn dt, dt nn, nn vbz, vbz nn,...",0.530087,"[dt vbg nn, vbg nn dt, nn dt nn, dt nn vbz, nn...",0.535295
2,Another movie to ignore.... A perfect date mov...,1.0,1,"[another movie, movie to, to ignore, ignore pe...",0.483996,"[another movie to, movie to ignore, to ignore ...",0.914286,"[dt nn, nn to, to nn, nn dt, nn nn, nn rb, rb ...",0.470786,"[dt nn to, nn to nn, to nn dt, nn dt nn, dt nn...",0.476395
3,Buy this phone !! I got my Droid Incredible in...,5.0,0,"[buy this, this phone, got my, my droid, droid...",0.593645,"[buy this phone, got my droid, my droid incred...",0.881159,"[vb dt, dt nn, nn nn, nn vbd, vbd prp, prp nn,...",0.531483,"[vb dt nn, dt nn nn, nn nn vbd, nn vbd prp, vb...",0.537226
4,Mother & daughter So far this is pretty boring...,3.0,0,"[mother daughter, daughter so, so far, far thi...",0.591287,"[mother daughter so, daughter so far, so far t...",0.911765,"[nn nn, nn rb, rb rb, rb dt, dt vbz, vbz rb, r...",0.535552,"[nn nn rb, nn rb rb, rb rb dt, rb dt vbz, dt v...",0.536632
...,...,...,...,...,...,...,...,...,...,...,...
647,Plays tracks in alphanumeric order This is a g...,1.0,0,"[plays tracks, tracks in, in alphanumeric, alp...",0.581565,"[plays tracks in, tracks in alphanumeric, in a...",0.738532,"[nns nns, nns in, in nn, nn nn, nn dt, dt vbz,...",0.531140,"[nns nns in, nns in nn, in nn nn, nn nn dt, nn...",0.534347
648,be very CAREFUL!!!! This is good phone. It is ...,1.0,1,"[be very, very careful, this is, is good, good...",0.434718,"[be very careful, this is good, is good phone,...",0.815642,"[vb rb, rb nn, nn dt, dt vbz, vbz jj, jj nn, n...",0.468014,"[vb rb nn, rb nn dt, nn dt vbz, dt vbz jj, vbz...",0.465452
649,Really like this movie and its sequeal 10.5 ap...,5.0,0,"[really like, like this, this movie, movie and...",0.582956,"[really like this, like this movie, this movie...",0.684783,"[rb in, in dt, dt nn, nn cc, cc prp, prp nn, n...",0.532343,"[rb in dt, in dt nn, dt nn cc, nn cc prp, cc p...",0.539769
650,Early Album of 2010 Candidate I open this revi...,5.0,0,"[early album, album of, of candidate, candidat...",0.610280,"[early album of, album of candidate, of candid...",0.856511,"[rb nn, nn in, in nn, nn nn, nn jj, jj dt, dt ...",0.531486,"[rb nn in, nn in nn, in nn nn, nn nn jj, nn jj...",0.536329


In [8]:
# Persist the training features as a pickle in intermediate_files/.
wt_bi_tri.to_pickle('intermediate_files/Train_bi_tri_wt.pkl')

# Testing Part

## Bigram and Trigram Sets

In [9]:
def calc_word_gram_sets_test(df,word_gram,pos=False):
    """Return the distinct n-grams of every row, without any weighting.

    With ``pos`` truthy each ``'pos_tags'`` string is used as is. Otherwise
    the ``'review'`` text is converted with ``str``, split into sentences,
    lower-cased and stripped of everything except ASCII letters and spaces,
    and n-grams are taken per sentence. n-grams are listed once each, in
    order of first appearance.

    Only the text column is read, so the frame does not need a ``'label'``
    column. Rows with fewer than ``word_gram`` usable tokens yield an empty
    list.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``'pos_tags'`` (``pos`` truthy) or ``'review'``.
    word_gram : int
        n-gram size.
    pos : bool, default False
        Work on POS-tag strings instead of review text.

    Returns
    -------
    list[list[str]]
        One n-gram list per row, in row order.
    """
    required = 'pos_tags' if pos else 'review'

    # One shared analyzer (preprocess + tokenize + n-grams). It returns an
    # empty list for text with too few tokens, where fitting a fresh
    # vectorizer on the row would raise "empty vocabulary".
    analyzer = CountVectorizer(ngram_range=(word_gram,word_gram)).build_analyzer()

    word_gram_set_list=[]

    for doc in tqdm(df[required]):
        if pos:
            doc_sentences=[doc]
        else:
            doc_sentences=[
                re.sub(r"[^A-Za-z ]",'',sent.lower())
                for sent in sent_tokenize(str(doc))
            ]

        # dict.fromkeys de-duplicates while keeping first-appearance order.
        word_gram_set_list.append(list(dict.fromkeys(
            gram for sent in doc_sentences for gram in analyzer(sent)
        )))

    return word_gram_set_list

In [10]:
def test_bi_tri(dftest):
    dftest['bigram_set']=calc_word_gram_sets_test(dftest,2)
    dftest['trigram_set']=calc_word_gram_sets_test(dftest,3)

    df_pos = extract_pos(dftest)
    dftest['bigram_posSet']=calc_word_gram_sets_test(df_pos,2,pos=True)
    dftest['trigram_posSet']=calc_word_gram_sets_test(df_pos,3,pos=True)
    
    dftest.drop('pos_tags',inplace=True,axis=1)
    
    return dftest

In [11]:
# Build the test features. test_bi_tri adds its columns to dftest itself and
# returns that same frame, so df_test and dftest refer to one object.
df_test= test_bi_tri(dftest)

218it [00:00, 317.14it/s]
218it [00:00, 365.98it/s]
100%|██████████| 218/218 [00:20<00:00, 10.45it/s]
218it [00:00, 789.58it/s]
218it [00:00, 694.75it/s]


In [12]:
# Persist the test features. This pickles dftest rather than df_test, which
# only works because test_bi_tri modified dftest in place.
dftest.to_pickle('intermediate_files/Test_bi_tri.pkl')

In [13]:
# Display the test frame with the n-gram set columns.
dftest

Unnamed: 0,review,rating,label,bigram_set,trigram_set,bigram_posSet,trigram_posSet
0,"Great Alyssa, but rather boring movie This is ...",2.0,0,"[great alyssa, alyssa but, but rather, rather ...","[great alyssa but, alyssa but rather, but rath...","[jj nn, nn cc, cc rb, rb nn, nn nn, nn dt, dt ...","[jj nn cc, nn cc rb, cc rb nn, rb nn nn, nn nn..."
1,worst ever If this was written by Danielle Ste...,1.0,1,"[worst ever, ever if, if this, this was, was w...","[worst ever if, ever if this, if this was, thi...","[jjs rb, rb in, in dt, dt vbd, vbd vbn, vbn in...","[jjs rb in, rb in dt, in dt vbd, dt vbd vbn, v..."
2,Hasselhoff Me! Please! One was having a parti...,5.0,1,"[hasselhoff me, one was, was having, having pa...","[one was having, was having particularly, havi...","[nn prp, prp nn, nn cd, cd vbd, vbd vbg, vbg d...","[nn prp nn, prp nn cd, nn cd vbd, cd vbd vbg, ..."
3,"One Friday, Without the Milk He always brought...",3.0,1,"[one friday, friday without, without the, the ...","[one friday without, friday without the, witho...","[cd nn, nn in, in dt, dt nn, nn prp, prp rb, r...","[cd nn in, nn in dt, in dt nn, dt nn prp, nn p..."
4,"this. sucked. bad. one word: punctuation. ok, ...",1.0,1,"[one word, word punctuation, ok folks, folks h...","[one word punctuation, ok folks heres, folks h...","[dt vbn, vbn jj, jj cd, cd nn, nn nn, nn nns, ...","[dt vbn jj, vbn jj cd, jj cd nn, cd nn nn, nn ..."
...,...,...,...,...,...,...,...
213,"If dragons could wear t-shirts, this is the on...",5.0,1,"[if dragons, dragons could, could wear, wear t...","[if dragons could, dragons could wear, could w...","[in nns, nns md, md nn, nn nns, nns dt, dt vbz...","[in nns md, nns md nn, md nn nns, nn nns dt, n..."
214,Horrible Product this product only got one sta...,1.0,1,"[horrible product, product this, this product,...","[horrible product this, product this product, ...","[jj nn, nn dt, dt nn, nn rb, rb vbd, vbd cd, c...","[jj nn dt, nn dt nn, dt nn rb, nn rb vbd, rb v..."
215,BDP-S560 vs Panasonic DMP-BD80 There seems to ...,5.0,0,"[bdps vs, vs panasonic, panasonic dmpbd, dmpbd...","[bdps vs panasonic, vs panasonic dmpbd, panaso...","[nn nn, nn rb, rb vbz, vbz to, to vb, vb dt, d...","[nn nn nn, nn nn rb, nn rb vbz, rb vbz to, vbz..."
216,Great Gift... I know nothing about guitars so ...,4.0,0,"[great gift, know nothing, nothing about, abou...","[know nothing about, nothing about guitars, ab...","[jj nn, nn nn, nn vb, vb nn, nn in, in nns, nn...","[jj nn nn, nn nn vb, nn vb nn, vb nn in, nn in..."
