In [105]:
import re
import string
import nltk
import tensorflow_datasets as tfds
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from string import punctuation
from spellchecker import SpellChecker
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Fetch the NLTK resources the cleaning steps rely on: the 'punkt' tokenizer
# models (word_tokenize), the stop-word lists (stopwords.words) and the WordNet
# data (WordNetLemmatizer). The final call's return value is what the cell displays.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [117]:
# Fetch the IMDb reviews corpus, keeping the train and test splits under their own keys
data = tfds.load('imdb_reviews', split=dict(train='train', test='test'))

In [118]:
# Turn each TFDS split into a pandas DataFrame
train_df = tfds.as_dataframe(data['train'])
test_df = tfds.as_dataframe(data['test'])

# Training frame: review bytes -> UTF-8 str, numeric label -> readable name
train_df['text'] = train_df['text'].map(lambda raw: raw.decode('utf-8'))
train_df['label'] = train_df['label'].replace({0: 'negative', 1: 'positive'})

# Test frame: the same two conversions
test_df['text'] = test_df['text'].map(lambda raw: raw.decode('utf-8'))
test_df['label'] = test_df['label'].replace({0: 'negative', 1: 'positive'})

# Stratified 80/20 hold-out carved out of the training frame. The whole frame is
# passed as the feature side, so x_train / x_test still carry the 'label' column.
x_train, x_test, y_train, y_test = train_test_split(
    train_df,
    train_df['label'],
    test_size=0.2,
    stratify=train_df['label'],
    random_state=42,
)

In [119]:
# Fit the encoder on the training labels only, then map both label sets to integer codes
le = LabelEncoder().fit(y_train)
y_train = le.transform(y_train)
y_test = le.transform(y_test)

In [109]:
# Sanity check: feature/label shapes for the training split, then for the hold-out split
print(f"{x_train.shape} {y_train.shape}\n{x_test.shape} {y_test.shape}")

(20000, 2) (20000,)
(5000, 2) (5000,)


# Data Cleaning

In [122]:
# NOTE: superseded draft of transformations(), kept commented out for reference only.
# # Spellchecker
# spell = SpellChecker()
# #Lemmatizer
# lemmatizer = WordNetLemmatizer()


# def transformations(dataframe):
#     #HTML Tags removal
#     dataframe['text'] = dataframe['text'].apply(lambda words: re.sub('<.*?>','',words)) 
    
#     #Word Tokenization
#     dataframe['text'] = dataframe['text'].apply(word_tokenize)
    
#     #Lower case conversion
#     dataframe['text'] = dataframe['text'].apply(lambda words: [x.lower() for x in words])
    
#     #Punctuation removal
#     dataframe['text'] = dataframe['text'].apply(lambda words: [x for x in words if not x in punctuation])
    
#     #Number removal
#     dataframe['text'] = dataframe['text'].apply(lambda words: [x for x in words if not x.isdigit()])
    
#     # Spellchecker
#     dataframe['text'] = dataframe['text'].apply(lambda words: [spell.correction(x) for x in words])

#     #Stopword removal
#     dataframe['text'] = dataframe['text'].apply(lambda words: [x for x in words if x not in stopwords.words('english')])
    
#     # # Frequent word removal
#     #temp = dataframe['text'].apply(lambda words: " ".join(words))
#     #freq = pd.Series(temp).value_counts()[:10]
#     #dataframe['text'] = dataframe['text'].apply(lambda words: [x for x in words if x not in freq.keys()])
    
#     #Lemmatization
#     dataframe['text'] = dataframe['text'].apply(lambda words: [lemmatizer.lemmatize(x) for x in words])
    
#     # Join
#     dataframe['text'] = dataframe['text'].apply(lambda words: " ".join(words))
    
#     return dataframe

In [124]:
# Shared helpers: one spell checker and one WordNet lemmatizer reused by the cleaning functions
spell, lemmatizer = SpellChecker(), WordNetLemmatizer()

def spell_checker(text, checker=None):
    """Spell-correct a sequence of tokens, keeping any token that cannot be corrected.

    Args:
        text: Iterable of word tokens (hashable, normally ``str``).
        checker: Optional object exposing ``correction(word)``. When omitted,
            the notebook-level ``spell`` instance is used.

    Returns:
        A list with one entry per input token, in the original order.
    """
    if checker is None:
        checker = spell

    tokens = list(text)  # materialise once so generators can be walked twice

    # Reviews repeat words heavily, so look each distinct token up only once.
    corrections = {}
    for word in tokens:
        if word not in corrections:
            # correction() can return None when it has no candidate; fall back
            # to the original token so later steps never receive None.
            corrections[word] = checker.correction(word) or word
    return [corrections[word] for word in tokens]

def lemmatize_words(words, wnl=None):
    """Lemmatize each token, keeping the token itself if the lemma comes back empty.

    Args:
        words: Iterable of word tokens.
        wnl: Optional object exposing ``lemmatize(word)``. When omitted, the
            notebook-level ``lemmatizer`` instance is used.

    Returns:
        A list of lemmas, one per input token, in the original order.
    """
    if wnl is None:
        wnl = lemmatizer
    # Call lemmatize() once per token; `or` supplies the fallback for an empty result.
    return [wnl.lemmatize(word) or word for word in words]

def _as_text(value):
    """Return ``value`` as a single string.

    A ``str`` passes through unchanged; any other iterable of tokens is
    re-joined with spaces, skipping empty/None entries. Joining a ``str``
    directly would split it into single characters ("abc" -> "a b c").
    """
    if isinstance(value, str):
        return value
    return ' '.join(filter(None, value))


def _is_punctuation(token):
    """True for an empty token or one made up solely of punctuation characters."""
    return all(char in punctuation for char in token)


def transformations(dataframe):
    """Clean the ``text`` column of ``dataframe`` and return the frame.

    Each entry goes through: HTML tag removal, word tokenization, lower-casing,
    punctuation and number removal, spell correction, English stop-word removal,
    lemmatization, and finally re-joining into one space-separated string.

    Args:
        dataframe: DataFrame with a ``text`` column holding either raw strings
            or lists of tokens.

    Returns:
        The same ``dataframe`` object; its ``text`` column is overwritten in place.
    """
    # Load the stop-word list once; a set gives constant-time membership tests.
    stop_words = frozenset(stopwords.words('english'))

    def clean(value):
        # Swap tags for a space so "end.<br />Next" does not fuse into one token.
        text = re.sub('<.*?>', ' ', _as_text(value))
        tokens = [token.lower() for token in word_tokenize(text)]
        # Drop pure-punctuation tokens (including "...", "``", "''") and plain numbers.
        tokens = [
            token for token in tokens
            if not _is_punctuation(token) and not token.isdigit()
        ]
        # Keep the original token wherever the spell checker yields nothing.
        tokens = [fixed or raw for raw, fixed in zip(tokens, spell_checker(tokens))]
        tokens = [token for token in tokens if token not in stop_words]
        return ' '.join(lemmatize_words(tokens))

    dataframe['text'] = dataframe['text'].apply(clean)
    return dataframe

In [125]:
# Clean only the first 10 rows (presumably because the per-word spell check is slow).
# .copy() hands transformations() an independent frame to write into instead of a
# slice of the full split, which avoids SettingWithCopyWarning on the column write.
# NOTE(review): x_train is now 10 rows while y_train keeps its full length.
x_train = x_train[:10].copy()
clean_data_train_data = transformations(x_train)

In [126]:
# Display the cleaned sample as the cell's last expression
clean_data_train_data

Unnamed: 0,label,text
15072,positive,latest film spanish director gust study child ...
9987,negative,others mentioned movie similar fly version les...
24558,positive,idea describe movie also would love provide ot...
2571,positive,ah loved movie think made laugh loud dozen tim...
16059,positive,brilliant movie drawing amazing bad ended begu...
7301,positive,i've seen ton action one right top genre actio...
24251,negative,show clever basically boil original humor writ...
15600,positive,christopher lloyd funny really believable al h...
10668,positive,movie absolutely expecting highbrow intellectu...
14596,positive,movie jackie best still cant get enough watchi...
