In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file below the Kaggle input directory, then echo them
# one per line (same order as os.walk yields them).
input_paths = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Libraries

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
###
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import BernoulliNB
####
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
#####
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# Creation of confusion matrix in using sklearn
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import plot_confusion_matrix


## **Data Extraction and Preprocessing**

In [None]:
# Load the two dataset files. lines=True makes pandas parse one JSON record per line.
# df_reviews: the reviews (columns used later include is_spoiler, review_text, movie_id).
# df_details: per-movie details, joined to the reviews on movie_id further down.
df_reviews = pd.read_json('../input/imdb-spoiler-dataset/IMDB_reviews.json', lines=True)
df_details = pd.read_json('../input/imdb-spoiler-dataset/IMDB_movie_details.json',lines =True)

In [None]:
# (rows, columns) of the reviews table first, then of the movie-details table.
for frame in (df_reviews, df_details):
    print(frame.shape)

In [None]:
# Preview the first rows of the reviews table.
df_reviews.head()

In [None]:
# Preview the first rows of the movie-details table.
df_details.head()

In [None]:
# Class balance of the target: number of reviews for each `is_spoiler` value.
# The explicit Axes plus title/labels let the figure stand on its own when skimmed.
fig, ax = plt.subplots()
df_reviews["is_spoiler"].value_counts().plot(kind="bar", ax=ax)
ax.set_title("Number of reviews per class")
ax.set_xlabel("is_spoiler")
ax.set_ylabel("Number of reviews")
plt.show()

In [None]:
##### Extract the information from the data
# Two views of the target, one pie chart each:
#   left  - spoiler / non-spoiler split over all reviews;
#   right - the same split restricted to reviews whose text contains "spoiler".
spoiler_df = pd.DataFrame()
spoiler_df["is_spoiler"] = df_reviews["is_spoiler"]
# Case-sensitive substring test: "Spoiler" / "SPOILER" do not match.
spoiler_df["has_a_word_spoiler"] = df_reviews["review_text"].apply(
    lambda text: "spoiler" in text
)

# Counts are indexed by the class value and re-indexed to [False, True] so the wedge
# order always matches the legend order below. This avoids relying on the column names
# produced by value_counts().reset_index(), which differ between pandas versions.
class_order = [False, True]
pie1 = spoiler_df["is_spoiler"].value_counts().reindex(class_order, fill_value=0)
# The right-hand chart is titled "reviews containing the word 'spoiler'" and shares the
# with/without-spoilers legend, so it must show the is_spoiler split of that subset
# (not how many reviews contain the word).
pie2 = (
    spoiler_df.loc[spoiler_df["has_a_word_spoiler"], "is_spoiler"]
    .value_counts()
    .reindex(class_order, fill_value=0)
)

# The seaborn style sheets were renamed ("seaborn-v0_8-*") in newer matplotlib releases;
# use whichever name this installation provides and fall back to the default style.
talk_style = next(
    (name for name in ("seaborn-v0_8-talk", "seaborn-talk") if name in plt.style.available),
    "default",
)

with plt.style.context(talk_style):
    fig = plt.figure(figsize=(16, 8))

    ax1 = fig.add_subplot(1, 2, 1)
    ax2 = fig.add_subplot(1, 2, 2)

    ax1.pie(pie1)
    ax1.set_title('All reviews')

    ax2.pie(pie2)
    ax2.set_title('Reviews containing the word \'spoiler\'')

    plt.suptitle('Spoiler distribution within the reviews', fontsize=20)
    fig.legend(labels=['Without spoilers(False)', 'With spoilers(True)'], loc='center')

    plt.show()

## **Pick a small, class-balanced batch of data to choose the best classification model**

In [None]:
def get_part_data(df, num_reviews):
    """Draw a class-balanced subsample of the reviews.

    Up to ``num_reviews`` spoiler rows and up to ``num_reviews`` non-spoiler rows are
    sampled without replacement using a fixed seed (``random_state=2``), stacked with
    the spoiler rows first, and re-indexed from 0. The rows are NOT shuffled.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``movie_id``, ``review_text``, ``review_summary`` and
        a boolean-like ``is_spoiler``.
    num_reviews : int
        Number of rows requested per class. If a class has fewer rows than requested,
        all of its rows are returned instead of raising.

    Returns
    -------
    pandas.DataFrame
        Columns ``movie_id``, ``review_text``, ``review_summary``, ``is_spoiler``, with
        ``is_spoiler`` encoded as integer 1 (spoiler) / 0 (non-spoiler).

    Raises
    ------
    ValueError
        If ``num_reviews`` is negative.
    """
    num_reviews = int(num_reviews)
    if num_reviews < 0:
        raise ValueError("num_reviews must be non-negative")

    spoilers = df[df["is_spoiler"] == True]       # noqa: E712 - element-wise comparison
    non_spoilers = df[df["is_spoiler"] == False]  # noqa: E712

    # Sampling by an absolute count (capped at the class size) avoids the rounding and
    # "fraction > 1" / division-by-zero problems of deriving a fraction from the request.
    df_pos = spoilers.sample(n=min(num_reviews, len(spoilers)), random_state=2)
    df_neg = non_spoilers.sample(n=min(num_reviews, len(non_spoilers)), random_state=2)

    # Stack both classes and replace the original row labels with 0..n-1.
    df_re = pd.concat([df_pos, df_neg]).reset_index(drop=True)

    # Encode the label as a real integer column (True -> 1, False -> 0).
    df_re["is_spoiler"] = df_re["is_spoiler"].astype(int)
    return df_re[["movie_id", "review_text", "review_summary", "is_spoiler"]]

In [None]:
# Balanced working set: 110000 reviews requested per class, label stored as an integer.
d = get_part_data(df_reviews, 110000).astype({"is_spoiler": "int"})

# Text/label pair consumed by the classifiers below.
df_r = d.loc[:, ["review_text", "is_spoiler"]]

In [None]:
# Preview the first rows of the balanced sample.
d.head()

## **Pre-process the reviews by tokenizing, removing stop words, and lemmatizing**

In [None]:
import spacy
import en_core_web_sm
# Load the spaCy "en_core_web_sm" pipeline; preprocess() below uses it for tokenization,
# stop-word flags and lemmas.
nlp = spacy.load("en_core_web_sm")

In [None]:
# One list of lower-cased, whitespace-separated words per review.
ltexts = [list(map(str.lower, review.split())) for review in df_r["review_text"]]

In [None]:
def preprocess(text):
    """Clean one tokenized review with the module-level spaCy pipeline ``nlp``.

    ``text`` is an iterable of word strings. The words are joined with single spaces,
    parsed by ``nlp``, stop-word tokens are dropped, the remaining tokens are replaced
    by their lemmas, and lemmas that are not purely alphabetic are discarded.

    Returns a one-element list holding the cleaned words joined by single spaces.
    """
    doc = nlp(' '.join(text))
    # Lemmas of every token that spaCy does not flag as a stop word.
    lemmas = (tok.lemma_ for tok in doc if not tok.is_stop)
    # Keep alphabetic lemmas only (drops punctuation, numbers, mixed tokens).
    cleaned = ' '.join(filter(str.isalpha, lemmas))
    return [cleaned]

In [None]:
# Show one review before and after preprocessing (review at position 5).
sample_tokens = ltexts[5]
print(sample_tokens)

# sample preprocessing for one text in the data
print(preprocess(sample_tokens))

In [None]:
# proc_text = []
# for i in range(len(ltexts)):
#     proc_text.append(preprocess(ltexts[i]))

In [None]:
#proc_text[5]

## **Baseline dummy classifier**

In [None]:
### Baseline dummy classifier
# Hold out 30% of the balanced sample for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    df_r["review_text"], df_r["is_spoiler"], test_size=0.30, random_state=9
)

# The 'stratified' strategy draws labels at random following the training class
# distribution and ignores the features, so it must be seeded for the reported
# baseline numbers to be reproducible across runs.
clf = DummyClassifier(strategy='stratified', random_state=9)
clf.fit(X_train, y_train)

# Baseline predictions for the train and the test split
dummy_train_pred = clf.predict(X_train)
dummy_test_pred = clf.predict(X_test)

print(classification_report(y_train, dummy_train_pred))
print(classification_report(y_test, dummy_test_pred))

## **Naive Bayes Classifier**

In [None]:
# Display names for the encoded labels. confusion_matrix orders its rows/columns by the
# sorted label values (0 then 1), so "False" has to come first.
is_spoiler = ["False", "True"]

### Naive Bayes model to predict the spoilers
def model_pipe(df):
    """Train and evaluate a bag-of-words Bernoulli Naive Bayes spoiler classifier.

    ``df`` must provide a ``review_text`` column (raw text) and an ``is_spoiler``
    column (0/1 labels). The data is split 70/30 with a fixed seed, a
    CountVectorizer -> TfidfTransformer -> BernoulliNB pipeline is fitted on the
    training part, and the function then:

    * shows the confusion matrix of the test predictions,
    * prints the classification report for the train and the test split,
    * prints the test ROC AUC.

    Nothing is returned.
    """
    X_train, X_test, y_train, y_test = train_test_split(df["review_text"], df["is_spoiler"],
                                                        test_size=0.30, random_state=9)
    # pipeline the process for text classification
    pipe = Pipeline([("count_vectorizer", CountVectorizer(stop_words="english")),
                     ("tfidf_transformer", TfidfTransformer()),
                     ("nb_classifier", BernoulliNB())])

    # fit the model to the train data
    pipe.fit(X_train, y_train)

    # hard label predictions for both splits
    train_pred = pipe.predict(X_train)
    test_pred = pipe.predict(X_test)

    # Probability of the positive class (second column = larger label, i.e. 1).
    # ROC AUC is a ranking metric and needs scores, not thresholded labels.
    test_score = pipe.predict_proba(X_test)[:, 1]

    cm = confusion_matrix(y_test, test_pred)
    fig, ax = plt.subplots(figsize=(10, 10))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=is_spoiler)
    disp.plot(ax=ax)
    plt.show()

    print(classification_report(y_train, train_pred))
    print(classification_report(y_test, test_pred))
    print("AUC Score")
    print(roc_auc_score(y_test, test_score))

In [None]:
# Train and evaluate the Naive Bayes pipeline on the balanced text/label sample.
model_pipe(df_r)

In [None]:
from sklearn.model_selection import GridSearchCV
# Same three-stage text pipeline as the Naive Bayes model, with step names that the
# parameter grid below refers to.
nb_steps = [
    ('count_vect', CountVectorizer(stop_words="english")),
    ('tfidf_transformer', TfidfTransformer()),
    ('nbClassifier', BernoulliNB()),
]
Gpipe = Pipeline(nb_steps)

# Grid keys follow scikit-learn's "<step name>__<parameter>" convention.
parameters = {
    'count_vect__binary': [True, False],
    'count_vect__ngram_range': [(1, 1), (1, 2)],
    'nbClassifier__alpha': (1, 0.1),
}

# 5-fold cross-validated search; nothing is fitted until .fit() is called.
grid_search = GridSearchCV(estimator=Gpipe, param_grid=parameters, cv=5)

In [None]:
#clf = grid_search.fit(X_train, y_train)
#print(grid_search.best_params_)

In [None]:
#naive_pred = grid_search(X_test)
#print(classification_report(y_test,naive_pred))

## **XG Boost Classifier**

In [None]:
import xgboost as xgb

In [None]:
def xgmodel_pipe(df):
    """Train and evaluate a bag-of-words XGBoost spoiler classifier.

    ``df`` must provide a ``review_text`` column (raw text) and an ``is_spoiler``
    column (0/1 labels). The data is split 70/30 with a fixed seed, a
    CountVectorizer -> TfidfTransformer -> XGBClassifier pipeline is fitted on the
    training part, and the function then:

    * shows the confusion matrix of the test predictions,
    * prints the classification report for the train and the test split,
    * prints the test ROC AUC.

    Nothing is returned.
    """
    X_train, X_test, y_train, y_test = train_test_split(df["review_text"], df["is_spoiler"],
                                                        test_size=0.30, random_state=9)
    # pipeline the process for text classification
    # "binary:logistic" makes the model output probabilities. With "binary:logitraw" the
    # scikit-learn wrapper would threshold raw margins at 0.5 in predict(), and
    # predict_proba() is not supported for that objective.
    pipe = Pipeline([("count_vectorizer", CountVectorizer(stop_words = "english")),
                     ("tfidf_transformer", TfidfTransformer()),
                     ("xg_classifier", xgb.XGBClassifier(eta = 0.75, objective="binary:logistic"))])

    # fit the model to the train data
    pipe.fit(X_train, y_train)

    # hard label predictions for both splits
    train_pred = pipe.predict(X_train)
    test_pred = pipe.predict(X_test)

    # Probability of the positive class (second column = larger label, i.e. 1).
    # ROC AUC is a ranking metric and needs scores, not thresholded labels.
    test_score = pipe.predict_proba(X_test)[:, 1]

    # Confusion Matrix
    # Rows/columns follow the sorted label values (0 then 1), hence "False" first.
    cm = confusion_matrix(y_test, test_pred)
    fig, ax = plt.subplots(figsize=(10, 10))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["False", "True"])
    disp.plot(ax=ax)
    plt.show()

    # Classification report
    print(classification_report(y_train, train_pred))
    print(classification_report(y_test, test_pred))
    print("AUC Score")
    print(roc_auc_score(y_test, test_score))

In [None]:
# Train and evaluate the XGBoost pipeline on the balanced text/label sample.
xgmodel_pipe(df_r)

## Word2Vec used for word embedding technique

In [None]:
import spacy
from functools import reduce
import gensim
from gensim.models import Word2Vec
# Rebind the module-level `nlp` to the "en_core_web_lg" pipeline (it replaces the
# "en_core_web_sm" one loaded earlier); tn() below reads word vectors from nlp.vocab.
nlp = spacy.load('en_core_web_lg')

In [None]:
# Smaller balanced sample for the embedding experiments: 20000 reviews requested per
# class, label stored as an integer.
data_e = get_part_data(df_reviews, 20000).astype({"is_spoiler": "int"})

To identify spoilers in the review text, I explored whether word embeddings (word2vec-style word vectors) could be used to measure the cosine similarity between the **review text** and the **plot summary** of the respective movie.

Therefore, I implemented a cosine-similarity function that compares each IMDB review with the plot summary of its movie.

In [None]:
# Attach the movie details (including plot_summary) to every sampled review.
# Left join on movie_id: each review is kept even if no details row matches.
reframed_data = pd.merge(data_e, df_details, on='movie_id', how='left')
reframed_data.head()

In [None]:
# Train/test split for the word-embedding experiments (70/30, fixed seed).
embedding_inputs = reframed_data[["review_text", "plot_summary"]]
embedding_labels = reframed_data["is_spoiler"]
x_train, x_test, Y_train, Y_test = train_test_split(
    embedding_inputs, embedding_labels, test_size=0.30, random_state=9
)

In [None]:
from functools import reduce
def tn(sentence):
    """Embed a sentence as the sum of the vectors of its whitespace-separated words.

    Each word is looked up in ``nlp.vocab`` (module-level spaCy pipeline) and the word
    vectors are added up in order.

    A sentence without any words (empty or whitespace-only string) or a non-string
    value (e.g. the NaN a left join produces for a missing plot summary) yields an
    all-zero vector of the vocabulary's vector length instead of raising.
    """
    words = sentence.split() if isinstance(sentence, str) else []

    total = None
    for word in words:
        vec = nlp.vocab[word].vector
        total = vec if total is None else total + vec

    if total is None:
        # No words at all: return a zero vector of the model's vector width.
        # assumes the model stores float32 vectors — TODO confirm
        total = np.zeros(nlp.vocab.vectors_length, dtype="float32")
    return total

def transform1(row):
    """Return the sentence vectors of one row as a 2-row array.

    ``row`` needs ``review_text`` and ``plot_summary`` attributes; row 0 of the result
    is the review embedding and row 1 the plot-summary embedding (both from ``tn``).
    """
    review_vec = tn(row.review_text)
    plot_vec = tn(row.plot_summary)
    return np.stack((review_vec, plot_vec))
    
def transform(X):
    """Build the feature matrix for a frame of (review_text, plot_summary) rows.

    Every row is embedded with ``transform1`` and flattened, so each output row holds
    the review vector followed by the plot-summary vector. Rows keep the frame's order.
    """
    flat_rows = [transform1(record).ravel() for record in X.itertuples()]
    return np.vstack(flat_rows)

def cosine_sim(arr1, arr2):
    """Cosine similarity of two 1-D vectors.

    Used to predict spoilers by comparing a review's embedding with the embedding of
    the original plot summary.

    Returns ``dot(arr1, arr2) / (||arr1|| * ||arr2||)``. If either vector has zero
    length (norm 0) the similarity is undefined; ``0.0`` is returned in that case
    instead of NaN plus a NumPy runtime warning.
    """
    denom = np.linalg.norm(arr1) * np.linalg.norm(arr2)
    if denom == 0:
        return 0.0
    return np.dot(arr1, arr2) / denom


def transform2(row):
    """Cosine similarity between a row's review embedding and plot-summary embedding.

    ``row`` needs ``review_text`` and ``plot_summary`` attributes; both are embedded
    with ``tn`` and compared with ``cosine_sim``.
    """
    review_vec = tn(row.review_text)
    plot_vec = tn(row.plot_summary)
    return cosine_sim(review_vec, plot_vec)
    
    
def similarity_predict(data, threshold):
    """Label each row as spoiler (1) or non-spoiler (0) by review/plot similarity.

    A row is labelled 1 when ``transform2(row)`` is greater than or equal to
    ``threshold`` and 0 otherwise. Returns a list of ints in the row order of ``data``.
    """
    return [int(transform2(record) >= threshold) for record in data.itertuples()]

In [None]:
# Use the cosine similarity between review and plot summary to flag spoiler reviews:
# a review counts as a spoiler when the similarity reaches the threshold.
SIMILARITY_THRESHOLD = 0.9
a = similarity_predict(x_train, SIMILARITY_THRESHOLD)
b = similarity_predict(x_test, SIMILARITY_THRESHOLD)

sim_train_pred = pd.DataFrame(a)
sim_test_pred  = pd.DataFrame(b)

# classification_report expects (y_true, y_pred); passing them the other way round
# swaps precision and recall in the printed report.
print(classification_report(Y_train, sim_train_pred))
print(classification_report(Y_test, sim_test_pred))

# print("AUC Score")
# from sklearn.metrics import roc_auc_score
# print(roc_auc_score(Y_test, sim_test_pred))

## Using an MLP classifier on word embeddings to classify spoilers

In [None]:
# Transform the training and test data: transform() turns every
# (review_text, plot_summary) row into one flat feature row holding the review
# embedding followed by the plot-summary embedding.
processed_train = transform(x_train)
processed_test  = transform(x_test)

In [None]:
# Multilayer perceptron classifier on the concatenated review / plot-summary embeddings.
from sklearn.neural_network import MLPClassifier
# Three hidden layers of 400 units, at most 80 training iterations. The weights are
# initialised randomly, so a fixed random_state keeps the reported scores reproducible.
clf = MLPClassifier(max_iter=80, hidden_layer_sizes=(400, 400, 400),
                    random_state=9).fit(processed_train, Y_train)

test_pred_processed = clf.predict(processed_test)
# classification_report expects (y_true, y_pred); the reversed order would swap
# precision and recall in the printed report.
print(classification_report(Y_test, test_pred_processed))

In [None]:
from sklearn.metrics import roc_auc_score
# ROC AUC is a ranking metric: score it with the predicted probability of the positive
# class (second column = larger label, i.e. 1) instead of the thresholded 0/1 labels.
print(roc_auc_score(Y_test, clf.predict_proba(processed_test)[:, 1]))