In [1]:
import sys
from nltk import data
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import re
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from IMDB.app_entity.artifacts_entity import DataIngestionArtifact, DataTransformationArtifact
from IMDB.app_entity.config_entity import DataTransformationConfig
from IMDB.app_logger import App_Logger
from IMDB.app_config.configuration import Configuration
from IMDB.app_database.mongoDB import MongoDB
from IMDB.app_exception.exception import App_Exception
from IMDB.app_util.util import save_object, save_numpy_array_data , load_data_from_mongodb
from IMDB.app_constants import *

In [2]:
logging = App_Logger(__name__)

In [3]:
RAW_FILE_PATH = '/home/pk/Desktop/MachineLearning/project/IMBD-Movie-Review/IMDB/app_artifact/stage00_data_ingestion/ingested_data/Train.csv'

In [4]:
train_df = pd.read_csv(RAW_FILE_PATH)

In [5]:
train_df.head()

Unnamed: 0,review,sentiment
0,I caught this little gem totally by accident b...,positive
1,I can't believe that I let myself into this mo...,negative
2,*spoiler alert!* it just gets to me the nerve ...,negative
3,If there's one thing I've learnt from watching...,negative
4,"I remember when this was in theaters, reviews ...",negative


In [6]:
class FeatureGenerator(BaseEstimator, TransformerMixin):

    def __init__(self):
        try:
            pass
        except Exception as e:
            raise App_Exception(e, sys) from e

    def fit(self, X, y=None):
        pass 
        return self

    def transform(self, X, y=None):
        try:
            logging.info("Transforming data")
            data = X.copy()
            review_column = 'review'
            data = data.apply(self.strip_html)
            data = data.apply(self.remove_between_square_brackets)
            data = data.apply(self.remove_special_characters)
            data= data.apply(self.simple_stemmer)
            data= data.apply(self.remove_stopwords)
            return data
        except Exception as e:
            raise App_Exception(e, sys) from e
    
    def strip_html(self , text):
        try:
            soup = BeautifulSoup(text, 'html.parser')
            return soup.get_text()
        except Exception as e:
            raise App_Exception(e, sys) from e
        
    def remove_between_square_brackets(self,text):
        try:
            return re.sub('\[[^]]*\]', '', text)
        except Exception as e:
            raise App_Exception(e, sys) from e
        
    def remove_special_characters(self ,text, remove_digits=True):
        try:
            pattern=r'[^a-zA-z0-9\s]'
            text=re.sub(pattern,'',text)
            return text
        except Exception as e:
            raise App_Exception(e, sys) from e
        
    def simple_stemmer(self,text):
        try : 
            ps=nltk.porter.PorterStemmer()
            text= ' '.join([ps.stem(word) for word in text.split()])
            return text
        except Exception as e:
            raise App_Exception(e, sys) from e
        
 
    def remove_stopwords(self,text, is_lower_case=False):
        tokenizer=ToktokTokenizer()
        stopword_list=set(nltk.corpus.stopwords.words('english'))
        tokens = tokenizer.tokenize(text)
        tokens = [token.strip() for token in tokens]
        if is_lower_case:
            filtered_tokens = [token for token in tokens if token not in stopword_list]
        else:
            filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
        filtered_text = ' '.join(filtered_tokens)    
        return filtered_text

In [17]:
X = train_df.drop(['sentiment'], axis=1)
y = train_df['sentiment']

In [18]:
label = LabelBinarizer()

In [19]:
label.fit(y)

In [20]:
review_column = train_df.columns[0]

In [21]:
review_column

'review'

In [22]:
feature_pipeline_cv = Pipeline( steps = [
        ('feature_generator' , FeatureGenerator()),
        ('CountVectorizer', CountVectorizer(min_df=0,max_df=1,binary=False,ngram_range=(1,3)))])
feature_pipeline_tf = Pipeline( steps = [
        ('feature_generator' , FeatureGenerator()),
        ('TfidfVectorizer', TfidfVectorizer(min_df=0,max_df=1,use_idf=True,ngram_range=(1,3)))])


In [23]:
preprocessing = ColumnTransformer([('feature_generator_cv', feature_pipeline_cv, review_column),
                    ('feature_generator_tv', feature_pipeline_tf, review_column) ])

In [24]:
target = label.transform(y)

In [25]:
processed_x = preprocessing.fit_transform(X)

Transforming data
Transforming data


In [26]:
X_train, X_test, y_train, y_test = train_test_split(processed_x, target, test_size=0.2, random_state=42)

In [27]:
xgboost = XGBClassifier(gpu_id=0, tree_method='gpu_hist' , predictor='cpu_predictor')

In [28]:
xgboost.fit(X_train, y_train)

In [29]:
xgboost.score(X_test, y_test)

0.498

In [30]:
xgboost.score(X_train, y_train)

0.5005

In [31]:
from sklearn.metrics import confusion_matrix

In [32]:
y_pred = xgboost.predict(X_test)

In [33]:
confusion_mt = confusion_matrix(y_test, y_pred)

In [34]:
confusion_mt

array([[3984,    0],
       [4016,    0]])

In [35]:
from sklearn.ensemble import RandomForestClassifier

In [36]:
rf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)

In [37]:
rf.fit(X_train, y_train)

  rf.fit(X_train, y_train)


In [38]:
rf.score(X_test, y_test)

0.498

In [39]:
rf.score(X_train, y_train)

0.5040625

In [40]:
confusion_mt_rf = confusion_matrix(y_test, rf.predict(X_test))

In [41]:
confusion_mt_rf

array([[3984,    0],
       [4016,    0]])