In [3]:
import re
import contractions # expand contractions, possibly switch to pyconstractions
import pandas as pd
import matplotlib.pyplot as plt

from nltk.tag import pos_tag
from nltk.stem import PorterStemmer
from spellchecker import SpellChecker
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet

Install the additional libraries listed in the README file.

### Dataset Analysis

In [4]:
# Set folder path to the dataset (a relative path, resolved against the working directory)
path = '../data/'

# Read data file into a DataFrame
raw_df = pd.read_csv(path + 'imdb_dataset.csv')
# Preview the first three rows to confirm the file loaded as expected
raw_df.head(3)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive


In [5]:
# Split DataFrame into reviews and sentiment.
# The columns are selected by name rather than by position, so the split
# keeps working if the column order of the CSV changes and matches the
# name-based access (raw_df['review']) used further down the notebook.
x_raw_df = raw_df['review']
y_raw_df = raw_df['sentiment']

# Print the shape of the reviews Series (a one-element tuple: the row count)
print('Shape of x_raw_df:', x_raw_df.shape)

# Print the unique classes from sentiment, and check if classes are balanced
print('Unique classes in y_raw_df:', y_raw_df.unique())
print('Representation of each class:', y_raw_df.value_counts())

Shape of x_raw_df: (46500,)
Unique classes in y_raw_df: ['positive' 'negative']
Representation of each class: sentiment
positive    25000
negative    21500
Name: count, dtype: int64


### Define functions needed for cleaning

In [6]:
def initialize_objects(language = 'english', spell_distance = 1):
    """Create the spell checker used by the cleaning functions.

    Parameters
    ----------
    language : str, optional
        Not used by this function at present; it is accepted so that callers
        passing it keep working.
    spell_distance : int, optional
        Value forwarded to ``SpellChecker(distance=...)``. For longer words
        1 is recommended; try 2 for shorter ones.

    Returns
    -------
    SpellChecker
        The newly constructed spell checker.
    """
    # Construct the checker and hand it straight back to the caller
    return SpellChecker(distance = spell_distance)

In [7]:
def correct_spelling(df, column_name, spell_checker):
    """Spell-correct every whitespace-separated word of one text column.

    Each word is passed to ``spell_checker.correction``; when that returns a
    falsy value (for example ``None``) the original word is kept. The
    corrected words are re-joined with single spaces and written back into
    ``df[column_name]``, i.e. ``df`` itself is modified - pass a copy if the
    original text must be preserved.

    No helper columns are created, so columns that already exist in ``df``
    (including ones named ``temp`` or ``differences``) are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the text column.
    column_name : str
        Name of the column to correct.
    spell_checker : object
        Any object exposing a ``correction(word)`` method.

    Returns
    -------
    tuple
        ``(df, total_differences)`` where ``total_differences`` is the number
        of words whose corrected form differs from the original, ignoring
        letter case.
    """
    # Look every distinct word up only once: the same tokens recur in many
    # rows and the correction of a given word does not depend on its row.
    known_corrections = {}

    def _correct(word):
        if word not in known_corrections:
            known_corrections[word] = spell_checker.correction(word) or word
        return known_corrections[word]

    total_differences = 0
    corrected_texts = []

    for text in df[column_name]:
        # Missing or non-text cells (e.g. NaN from an empty CSV field) cannot
        # be split into words; keep them unchanged instead of raising.
        if not isinstance(text, str):
            corrected_texts.append(text)
            continue

        original_words = text.split()
        corrected_words = [_correct(word) for word in original_words]

        # Count the words that changed, compared case-insensitively
        total_differences += sum(
            new.lower() != old.lower()
            for new, old in zip(corrected_words, original_words)
        )
        corrected_texts.append(' '.join(corrected_words))

    df[column_name] = corrected_texts

    return df, total_differences

In [8]:
def preprocess_data(df, column_name):
    """Run the spelling-correction step on a copy of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied first, so the caller's frame is not changed.
    column_name : str
        Name of the text column to correct.

    Returns
    -------
    pandas.DataFrame
        The corrected copy. The number of corrected words is printed as a
        side effect.
    """
    # Spell checker built with the defaults of initialize_objects
    checker = initialize_objects()

    # Work on a duplicate so the frame passed in stays as it was
    working_df = df.copy()

    # Check every word and replace the misspelled ones
    working_df, corrected_count = correct_spelling(working_df, column_name, checker)
    print('\tNumber of spelling mistakes that got corrected:', corrected_count)

    return working_df

### Apply cleaning functions and export datasets

In [9]:
# Build the working frame directly from the raw data: the review text is stored
# under 'cleaned_review' and the labels under 'sentiment'. raw_df itself is not
# modified, so it stays available as a backup of the original reviews.
modified_df = pd.DataFrame({
    'cleaned_review': raw_df['review'],
    'sentiment': y_raw_df,
})

In [10]:
# Set folder path to the dataset
# NOTE(review): `path` is re-assigned here but not used by the statements below
path = '../data/'

# Process the data: spell-correct the 'cleaned_review' column (preprocess_data works on a copy)
processed_df = preprocess_data(modified_df, 'cleaned_review')

processed_df.head()


	Number of spelling mistakes that got corrected: 1113765


Unnamed: 0,cleaned_review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production <br /><br />The ...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,better matter's love in the Time of money is a...,positive
4,Probably my all-time favorite movie a story of...,positive


In [None]:
# Save the processed DataFrame to CSV, leaving out the index column
# NOTE(review): the target name has no '.csv' extension and is a bare relative
# name, so it is not written under `path` - confirm this is intended
processed_df.to_csv('imdb_dataset_spell_T', index = False)

In [None]:
### Dataset Analysis
# Set folder path to the dataset
path = '../data/'

# Read data file into a DataFrame
raw_df = pd.read_csv(path + 'imdb_dataset.csv')
# NOTE(review): this preview is not the last expression of the cell, so a
# notebook will not render it
raw_df.head(3)

# Split DataFrame into reviews and sentiment.
# The columns are selected by name rather than by position, so the split
# keeps working if the column order of the CSV changes and matches the
# name-based access (raw_df['review']) used later in this cell.
x_raw_df = raw_df['review']
y_raw_df = raw_df['sentiment']

# Print the shape of the reviews Series (a one-element tuple: the row count)
print('Shape of x_raw_df:', x_raw_df.shape)

# Print the unique classes from sentiment, and check if classes are balanced
print('Unique classes in y_raw_df:', y_raw_df.unique())
print('Representation of each class:', y_raw_df.value_counts())
### Define functions needed for cleaning


def clean_text(df, column_name):
    """Remove URLs, tags and non-word characters from one text column.

    The column is overwritten in ``df`` itself. Substitutions run in this
    order: URLs, ``<...>`` tags, apostrophes, runs of underscores, and finally
    every remaining character that is neither a word character nor
    whitespace. Apostrophes are deleted (don't -> dont); everything else is
    replaced by a single space.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the text column.
    column_name : str
        Name of the column to clean.

    Returns
    -------
    tuple
        ``(df, count_url, count_tags, count_other)``. Each count is whatever
        ``count_expression`` returns for the pattern(s) just before they are
        removed - presumably the number of matches; ``count_expression`` is
        defined outside this block, so verify against its definition.
    """
    # Regular expressions used for cleaning
    url_expression = r'https?://\S+'
    tags_expression = r'<.*?>'
    others_expression = r'[^\w\s\d]'
    apostrophe_expression = r'\''
    underscore_expression = r'_+'

    def _substitute(pattern, replacement):
        # Apply one regex substitution to every value of the column
        df[column_name] = df[column_name].apply(lambda text: re.sub(pattern, replacement, text))

    # Count occurrences before cleaning, then remove URLs
    count_url = count_expression(df, column_name, url_expression)
    _substitute(url_expression, ' ')

    # Count occurrences before cleaning, then remove tags
    count_tags = count_expression(df, column_name, tags_expression)
    _substitute(tags_expression, ' ')

    # Count occurrences before cleaning the remaining non-word characters.
    # An apostrophe is neither a word character nor whitespace, so
    # others_expression already matches it; counting apostrophe_expression as
    # well would count every apostrophe twice. Underscores are word
    # characters, so they are not matched by others_expression and still
    # need their own count.
    count_other = (
        count_expression(df, column_name, others_expression)
        + count_expression(df, column_name, underscore_expression)
    )
    _substitute(apostrophe_expression, '')  # don't -> dont
    _substitute(underscore_expression, ' ')
    _substitute(others_expression, ' ')

    return df, count_url, count_tags, count_other



def preprocess_data(df, column_name):
    """Spell-correct ``column_name`` on a copy of ``df`` and return the copy.

    The number of corrected words is printed as a side effect.

    NOTE(review): a function with this same name is also defined earlier in
    the notebook; whichever definition runs last is the one in effect.
    ``clean_text`` is not called from here.
    """
    # Spell checker built with the defaults of initialize_objects
    spell_checker = initialize_objects()

    # Operate on a duplicate: correct_spelling overwrites the column it is given
    result_df, n_corrected = correct_spelling(df.copy(), column_name, spell_checker)
    print('\tNumber of spelling mistakes that got corrected:', n_corrected)

    return result_df
### Apply cleaning functions and export datasets
# Build the working frame directly from the raw data: the review text is stored
# under 'cleaned_review' and the labels under 'sentiment'. raw_df itself is not
# modified, so it stays available as a backup of the original reviews.
modified_df = pd.DataFrame({
    'cleaned_review': raw_df['review'],
    'sentiment': y_raw_df,
})

# Set folder path to the dataset
path = '../data/'

# Process the data (spell-correct the 'cleaned_review' column)
processed_df = preprocess_data(modified_df, 'cleaned_review')

processed_df.head()

# Save the processed DataFrame to CSV, leaving out the index column
processed_df.to_csv('imdb_dataset_spell_T', index = False)