##### 1. Identify the issues with the “Review” column in the UNITENReview.csv file

In [175]:
import pandas as pd
import re
from bs4 import BeautifulSoup

# Read the raw CSV and keep only the "Review" column, as a one-column DataFrame.
file_path = "UNITENReview.csv"
df = pd.read_csv(file_path)["Review"].to_frame()

# None disables pandas' column-width truncation so each review prints in full.
pd.set_option("display.max_colwidth", None)
print(df)

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

##### 2. Perform the necessary text pre-processing steps based on the identified issues

###### i. Convert text to lowercase

In [178]:
#Lowercase conversion
def convert_to_lowercase(text):
    """Return ``text`` converted to lowercase.

    Parameters
    ----------
    text : Any
        A single review. Normally a ``str``; missing CSV cells arrive as a
        float ``NaN`` (or ``None``), which has no ``.lower()`` method.

    Returns
    -------
    str
        The lowercased review, or ``""`` when ``text`` is not a string. The
        empty-string fallback mirrors the guard used by the stemming and
        lemmatization helpers in this notebook.
    """
    # Guard against NaN/None so one empty review does not abort the whole .apply().
    if not isinstance(text, str):
        return ""
    return text.lower()

# Lowercase every review and keep the result in its own column.
df["lowercased"] = df["Review"].map(convert_to_lowercase)

# Display column content without truncation (None means unlimited width).
pd.set_option("display.max_colwidth", None)
print(df["lowercased"])

0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

###### ii. Replace internet slang/chat words

In [180]:
# Replace internet slang/chat words
# Dictionary of slang words and their replacements
# Lowercase slang token -> plain-English replacement.
slang_dict = {
    "w": "win",
    "naah": "no",
}


def replace_slang(text):
    """Replace whole-word slang terms in ``text`` with their ``slang_dict`` expansion.

    Matching is case-insensitive and bounded by ``\\b`` on both sides, so a
    slang token is only replaced when it stands alone as a word.
    """
    # Build one alternation of every regex-escaped slang key, e.g. \b(w|naah)\b
    alternatives = '|'.join(re.escape(term) for term in slang_dict)
    slang_pattern = r'\b(' + alternatives + r')\b'

    # The dictionary keys are lowercase, so fold the matched text before lookup.
    return re.sub(
        slang_pattern,
        lambda hit: slang_dict[hit.group(0).lower()],
        text,
        flags=re.IGNORECASE,
    )

# Expand slang in the lowercased reviews.
df["slangs_replaced"] = df["lowercased"].map(replace_slang)

# Print every entry in full rather than a truncated preview.
pd.set_option("display.max_colwidth", None)
print(df["slangs_replaced"])

0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

###### iii. Replace contractions

In [182]:
#Replace Contractions
# Lowercase contraction -> expanded form. Keys must stay lowercase because a
# match is lowercased before it is looked up here.
contractions_dict = {
    "wasn't": "was not",
    "isn't": "is not",
    "aren't": "are not",
    "weren't": "were not",
    "doesn't": "does not",
    "don't": "do not",
    "didn't": "did not",
    "can't": "cannot",
    "couldn't": "could not",
    "shouldn't": "should not",
    "wouldn't": "would not",
    "won't": "will not",
    "haven't": "have not",
    "hasn't": "has not",
    "hadn't": "had not",
    "i'm": "i am",
    "you're": "you are",
    "he's": "he is",
    "she's": "she is",
    "it's": "it is",
    "we're": "we are",
    "they're": "they are",
    "i've": "i have",
    "you've": "you have",
    "we've": "we have",
    "they've": "they have",
    "i'd": "i would",
    "you'd": "you would",
    "he'd": "he would",
    "she'd": "she would",
    "we'd": "we would",
    "they'd": "they would",
    "i'll": "i will",
    "you'll": "you will",
    "he'll": "he will",
    "she'll": "she will",
    "we'll": "we will",
    "they'll": "they will",
    "let's": "let us",
    "that's": "that is",
    "who's": "who is",
    "what's": "what is",
    "where's": "where is",
    "when's": "when is",
    "why's": "why is"
}

# One alternation of every regex-escaped contraction, bounded by \b so that
# only whole words match; compiled once and reused for every review.
escaped_contractions = [re.escape(contraction) for contraction in contractions_dict]
joined_contractions = "|".join(escaped_contractions)
contractions_pattern = r'\b(' + joined_contractions + r')\b'
compiled_pattern = re.compile(contractions_pattern, flags = re.IGNORECASE)

# Characters commonly typed or auto-substituted in place of the ASCII
# apostrophe. The dictionary keys use "'", so a contraction written with any
# of these would otherwise never match and would survive unexpanded.
_APOSTROPHE_TABLE = str.maketrans({
    "\u2019": "'",  # right single quotation mark
    "\u2018": "'",  # left single quotation mark
    "\u02bc": "'",  # modifier letter apostrophe
    "\u00b4": "'",  # acute accent
    "\u2032": "'",  # prime
    "\uff07": "'",  # fullwidth apostrophe
    "`": "'",       # backtick / grave accent
})


def replace_contractions(text):
    """Expand English contractions in ``text`` using ``contractions_dict``.

    Apostrophe look-alikes (curly quotes, backtick, acute accent, ...) are
    first normalised to the ASCII apostrophe throughout the text, then every
    whole-word, case-insensitive match of a dictionary key is replaced by its
    expansion.

    Parameters
    ----------
    text : str
        The review text to process.

    Returns
    -------
    str
        ``text`` with apostrophes normalised and known contractions expanded.
        Expansions are emitted exactly as stored in the dictionary (lowercase).
    """
    normalised = text.translate(_APOSTROPHE_TABLE)

    def replace_match(match):
        # Keys are lowercase; the pattern itself is case-insensitive.
        return contractions_dict[match.group(0).lower()]

    return compiled_pattern.sub(replace_match, normalised)

# Expand contractions in the slang-normalised reviews.
df["contractions_replaced"] = df["slangs_replaced"].map(replace_contractions)

# Print every entry in full rather than a truncated preview.
pd.set_option("display.max_colwidth", None)
print(df["contractions_replaced"])

0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

###### iv. Remove punctuation and special characters

In [184]:
import string

# Function to remove punctuation
def remove_punctuation(text):
    """Return ``text`` with every character listed in ``string.punctuation`` deleted.

    Only the ASCII punctuation set is removed; all other characters,
    including non-ASCII symbols, are left untouched.
    """
    kept_chars = [char for char in text if char not in string.punctuation]
    return "".join(kept_chars)

# Apply the function to remove punctuation
df["punctuations_removed"] = df["contractions_replaced"].map(remove_punctuation)

# Print every entry in full rather than a truncated preview.
pd.set_option("display.max_colwidth", None)
print(df["punctuations_removed"])

0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

###### v. Fix encoding, collapse extra whitespace, and re-apply slang replacement

In [186]:
!pip install ftfy



In [187]:
import ftfy

def preprocess_text(text):
    """Fix encoding issues, collapse whitespace, and map slang tokens in ``text``.

    Non-string values are returned unchanged. For strings the steps are, in
    order: ``ftfy.fix_text``; collapsing every run of whitespace to a single
    space and trimming both ends; replacing each whitespace-separated token
    whose lowercase form is a ``slang_dict`` key with that key's value.
    """
    if not isinstance(text, str):
        return text

    repaired = ftfy.fix_text(text)
    collapsed = re.sub(r"\s+", " ", repaired).strip()

    # Tokens without a slang entry are kept exactly as they were.
    tokens = []
    for token in collapsed.split():
        tokens.append(slang_dict.get(token.lower(), token))
    return " ".join(tokens)

# Apply function to DataFrame column
# Run the encoding/whitespace/slang clean-up on the punctuation-free reviews.
df["processed_text"] = df["punctuations_removed"].map(preprocess_text)

# Show the cleaned column.
print(df["processed_text"])

0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

###### vi. Remove emoji

In [189]:
import emoji

#replace emoji with ''
def remove_emojis(text):
    """Return ``text`` with each emoji replaced by an empty string."""
    without_emojis = emoji.replace_emoji(text, replace='')
    return without_emojis

df["emojis_removed"] = df["processed_text"].map(remove_emojis)

# Display column content without truncation (None means unlimited width).
pd.set_option("display.max_colwidth", None)
print(df["emojis_removed"])

0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

###### vii. Remove numbers

In [191]:
def remove_numbers(text):
    """Delete every run of digits from ``text``; surrounding whitespace is kept as is."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

# Strip digits from the emoji-free reviews.
df["numbers_removed"] = df["emojis_removed"].map(remove_numbers)

# Print every entry in full rather than a truncated preview.
pd.set_option("display.max_colwidth", None)
print(df["numbers_removed"])

0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

###### viii. Correct spelling mistakes

In [198]:
from autocorrect import Speller

# English spell-checker instance shared by every call to correct_spelling.
spell = Speller(lang='en')

# Words that must be passed through without spell-correction.
custom_words = {"uniten"}


def correct_spelling(text):
    """Spell-correct each whitespace-separated word of ``text``.

    Words found in ``custom_words`` (exact, case-sensitive match) are kept
    verbatim; every other word is passed through ``spell``. The words are
    re-joined with single spaces.
    """
    corrected_words = []
    for word in text.split():
        if word in custom_words:
            corrected_words.append(word)
        else:
            corrected_words.append(spell(word))
    return " ".join(corrected_words)


# Spell-correct the digit-free reviews.
df["spelling_corrected"] = df["numbers_removed"].map(correct_spelling)

# Print every entry in full rather than a truncated preview.
pd.set_option("display.max_colwidth", None)
print(df["spelling_corrected"])

0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

###### ix. Remove stopwords

In [205]:
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stopword corpus (skipped by NLTK when already up to date).
nltk.download('stopwords')

# Set of English stopwords used for the membership test below.
stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    """Drop every word of ``text`` whose lowercase form is in ``stop_words``.

    Words are split on whitespace, kept in their original casing, and
    re-joined with single spaces.
    """
    return " ".join(
        word for word in text.split() if word.lower() not in stop_words
    )

# Filter stopwords out of the spell-corrected reviews.
df["stopwords_removed"] = df["spelling_corrected"].map(remove_stopwords)

# Print every entry in full rather than a truncated preview.
pd.set_option("display.max_colwidth", None)
print(df["stopwords_removed"])
            

0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


###### x. Stemming

In [208]:
from nltk.stem import PorterStemmer

# Porter stemmer instance shared by every call to stem_text.
stemmer = PorterStemmer()


def stem_text(text):
    """Stem each whitespace-separated word of ``text`` with the Porter stemmer.

    Returns the stems joined by single spaces, or ``""`` when ``text`` is not
    a string.
    """
    if isinstance(text, str):
        return " ".join(map(stemmer.stem, text.split()))
    return ""


# Stem the stopword-free reviews.
df["stemmed_words"] = df["stopwords_removed"].map(stem_text)

# None disables column-width truncation so each entry prints in full.
pd.set_option("display.max_colwidth", None)
print(df["stemmed_words"])

0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 im happi uniten actual even peopl win
1                                                                                                                                               

###### xi. Lemmatization

In [211]:
import nltk

# Download the NLTK resources needed by the lemmatization cell below
# (NLTK skips any package that is already up to date).
nltk.download('wordnet')                    # WordNet lexical database, used by WordNetLemmatizer
nltk.download('omw-1.4')                     # Open Multilingual Wordnet data that accompanies WordNet
nltk.download('averaged_perceptron_tagger_eng')  # English model for pos_tag (POS tagging)
nltk.download('punkt_tab')                       # Tokenizer data for word_tokenize

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [213]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# Initialize the lemmatizer once; lemmatize_text below reuses this single
# instance for every token instead of constructing a new one per call.
lemmatizer = WordNetLemmatizer()

# Function to map NLTK POS tags to WordNet POS tags
def get_wordnet_pos(nltk_tag):
    """Map an NLTK POS tag to the matching WordNet POS constant.

    The decision is made on the tag's first letter: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag (including
    an empty one) falls back to noun.
    """
    first_letter_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # Slicing (rather than indexing) keeps an empty tag from raising.
    return first_letter_to_pos.get(nltk_tag[:1], wordnet.NOUN)

# Function to lemmatize text with POS tagging
def lemmatize_text(text):
    """Lemmatize ``text`` token by token, using each token's POS tag.

    The text is tokenized with ``word_tokenize``, tagged with ``pos_tag``,
    and each token is lemmatized with the WordNet POS returned by
    ``get_wordnet_pos``. Returns the lemmas joined by single spaces, or
    ``""`` when ``text`` is not a string.
    """
    if isinstance(text, str):
        tagged_tokens = pos_tag(word_tokenize(text))
        lemmas = []
        for token, tag in tagged_tokens:
            lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(tag)))
        return " ".join(lemmas)
    return ""

# Apply the function to the column
df["lemmatized"] = df["stopwords_removed"].apply(lemmatize_text)

# Display column content without truncation
pd.set_option('display.max_colwidth', None)  # Set to None for unlimited width
print(df["lemmatized"])

0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            im happy uniten actually ev

###### xii. Tokenization

In [216]:
import nltk
from nltk.tokenize import word_tokenize

# Fetch the tokenizer data (skipped by NLTK when already up to date).
nltk.download('punkt')


def tokenize_text(text):
    """Split ``text`` into word tokens with ``word_tokenize``.

    Returns the list of tokens, or an empty list when ``text`` is not a string.
    """
    return word_tokenize(text) if isinstance(text, str) else []

# Apply tokenization to the column
df["tokenized"] = df["lemmatized"].apply(tokenize_text)

# Display column content without truncation
pd.set_option('display.max_colwidth', None)  # Set to None for unlimited width
print(df["tokenized"])

0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        [im, happy, uniten, actually, even, people, win]
1                                             

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


##### 3. Save the result in a .csv file

In [219]:
# Write the whole frame (the original review plus every intermediate stage
# column) to disk; index=False leaves the row index out of the file.
# NOTE(review): the "tokenized" column holds Python lists, which CSV stores as
# their string form — presumably they need parsing again on reload; confirm.
df.to_csv("UNITENReview_Processed.csv", index=False)