# DATA CLEANING

In [None]:
import pandas as pd
import re # Package for regular expressions
import nltk # Main python package for natural language processing

In [None]:
# Let us download the dataset from the course repository
# (read_csv is given a URL here, so running this cell requires network access)
imdb = pd.read_csv('https://datasciencebocconi.github.io/Data/IMDB_small.csv')
imdb.shape # Size of the dataset, as a (rows, columns) tuple

In [None]:
# Peek at the top of the table (head() returns the first 5 rows by default)
imdb.head()

In [None]:
# Pick one cell by position: row 8, column 0
# (presumably the review text column -- confirm against the CSV header)
review = imdb.iloc[8, 0]
review # Bare expression so that the notebook displays the value

In [None]:
from bs4 import BeautifulSoup # Load the package

# Removes the <br /> and other HTML tags
def remove_html(data):
    """Return the text content of ``data`` with the HTML markup stripped.

    Args:
        data: Raw string that may contain HTML tags.

    Returns:
        The text of the parsed document, without the tags.
    """
    # Name the parser explicitly: when it is omitted BeautifulSoup picks
    # whichever parser happens to be installed and warns about it, so the
    # result could differ from one machine to another. "html.parser" ships
    # with Python, so no extra dependency is needed.
    soup = BeautifulSoup(data, "html.parser")
    return soup.get_text()

In [None]:
# Contraction / informal spelling -> expanded form.
# Keys are matched case-sensitively, exactly as written here.
_ABBREVIATIONS = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "gonna": "going to",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'll": "I will",
    "I'm": "I am",
    "I've": "I have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "Its": "It is",
    "let's": "let us",
    "mightn't": "might not",
    "mustn't": "must not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they would",
    "wanna": "want to",
    "We're": "We are",
}

# A single alternation compiled once. The \b anchors restrict matches to whole
# words, so "Itself" or "wannabe" are no longer rewritten from the inside.
# Longest keys come first so that a longer key always wins over a shorter one.
_ABBREVIATION_RE = re.compile(
    r"\b(?:"
    + "|".join(
        re.escape(key)
        for key in sorted(_ABBREVIATIONS, key=len, reverse=True)
    )
    + r")\b"
)


def remove_abb(review):
    """Expand the known contractions and informal spellings in ``review``.

    Only whole words are replaced, and matching is case-sensitive: a
    capitalised form such as "Don't" is left untouched because only the
    lowercase spelling is listed in the table.

    Args:
        review: Text to process.

    Returns:
        A copy of ``review`` with every listed abbreviation expanded.
    """
    return _ABBREVIATION_RE.sub(
        lambda match: _ABBREVIATIONS[match.group(0)], review
    )

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize.treebank import TreebankWordDetokenizer
# Shared detokenizer instance: advanced_cleaning uses it to join the processed
# tokens of a review back into a single string
detokenizer = TreebankWordDetokenizer()

In [None]:
# 1st round of pre-processing
def basic_cleaning(review):
    """Strip the HTML markup from ``review`` and expand its abbreviations.

    Args:
        review: Raw review text.

    Returns:
        The review after ``remove_html`` followed by ``remove_abb``.
    """
    # The helper has to be *called*; assigning the bare function name would
    # hand a function object (not text) to remove_abb.
    review = remove_html(review) # Remove HTML tags
    review = remove_abb(review) # Remove abbreviations
    return review

# 2nd round of Pre-processing
def advanced_cleaning(review):
    """Turn a raw review into a string of stemmed, lowercase content words.

    Steps: basic cleaning (HTML removal + abbreviation expansion),
    lowercasing, tokenization, removal of non-alphabetic tokens and English
    stop words, Snowball stemming, and finally detokenization.

    Args:
        review: Raw review text.

    Returns:
        A single string made of the surviving stemmed tokens.
    """
    # Prepare these once per review instead of once per token: the stop-word
    # list becomes a set (constant-time membership tests) and one stemmer
    # instance is reused for every token.
    stop_words = set(stopwords.words('english'))
    stemmer = nltk.SnowballStemmer("english")

    # Basic cleaning (HTML + abbreviations)
    review = basic_cleaning(review)

    # Normalization
    review = review.lower()

    # Tokenization
    review_tokens = nltk.word_tokenize(review)

    # Special symbols and punctuation, filtering, stemming: keep purely
    # alphabetic tokens that are not stop words, then stem what is left
    review_tokens = [
        stemmer.stem(word)
        for word in review_tokens
        if word.isalpha() and word not in stop_words
    ]

    # Conversion to a single string
    return detokenizer.detokenize(review_tokens)

In [None]:
# Original document: raw value at row position 4, column position 0,
# displayed before any cleaning is applied
imdb.iloc[4,0]

In [None]:
# Basic cleaning: output of basic_cleaning for the same cell (row 4, column 0)
basic_cleaning(imdb.iloc[4,0])

In [None]:
# After stemming: output of the full advanced_cleaning pipeline for the same
# cell (row 4, column 0)
advanced_cleaning(imdb.iloc[4,0])

In [None]:
# This could take a while: each cleaning function runs once per review
imdb['review_clean'] = imdb['review'].apply(basic_cleaning)
imdb['review_token'] = imdb['review'].apply(advanced_cleaning)

# Show the first two rows together with the two new columns
imdb.head(2)

In [None]:
# Concatenate every cleaned review into one long string ...
words = ' '.join(imdb['review_token'])
# ... and tokenize it again to get one token list for the whole collection
tokens = nltk.word_tokenize(words)

# Wrap the tokens in an nltk "text" and count the occurrences of each word
text = nltk.Text(tokens)
fdist = nltk.FreqDist(text)

# Tabulate the (word, count) pairs with pandas, most frequent word first
df_words = pd.DataFrame(
    list(fdist.items()),
    columns=["Word", "Frequency"],
).sort_values(by="Frequency", ascending=False)

# Dimension of the dataset
df_words.shape

In [None]:
# First 10 rows of df_words, i.e. the most frequent words, since the table was
# sorted by descending frequency when it was built
df_words.head(10)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Count matrix restricted to a vocabulary of p = 500 words (max_features)
vectorizer = CountVectorizer(max_features=500)
X = vectorizer.fit_transform(imdb['review_token'])
word_names = list(vectorizer.get_feature_names_out())

# Densify and convert to a dataframe whose columns are labelled by the words
X = pd.DataFrame(X.toarray(), columns=word_names)

In [None]:
# First 8 rows of the count matrix (one row per review, one column per word)
X.head(8)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weighted matrix restricted to a vocabulary of p = 500 words
vectorizer = TfidfVectorizer(max_features=500)
X = vectorizer.fit_transform(imdb['review_token'])
word_names = list(vectorizer.get_feature_names_out())

# Densify and convert to a dataframe whose columns are labelled by the words
X = pd.DataFrame(X.toarray(), columns=word_names)

In [None]:
# First 8 rows of the TF-IDF matrix (one row per review, one column per word)
X.head(8)