In [3]:
import numpy as np
import re
import pandas as pd

In [4]:
# Load the reviews CSV (path is relative to the notebook's working directory)
# into a DataFrame; a later cell reads its "reviewText" column.
reviews = pd.read_csv("./amazon_reviews.csv")

In [26]:
reviews_text = list(reviews["reviewText"])

# Keep only real strings. Missing reviews arrive from pandas as float NaN, but
# filtering on "not a float" would still let other non-string values (None,
# ints, ...) through, and those crash the later " ".join(...) / .lower() calls.
reviews_text_cleaned = [text for text in reviews_text if isinstance(text, str)]

## Step 1
Provide a list of strings; the strings can come from different reviews, headlines, or documents. This segment extracts every word from all of the text data
and then builds the sorted list of unique words (the vocabulary).

In [102]:
# A token is a maximal run of characters that are neither a space nor one of
# the punctuation marks ? . , ! : ;
pattern = r"[^?.,!:; ]+"

# Tokenise the whole corpus in one pass, then normalise case.
all_text = " ".join(reviews_text_cleaned)
words = re.findall(pattern, all_text)
words_cleaned = [token.lower() for token in words]

# Sorted vocabulary: a word's position here is its column in the matrices.
unique_words = sorted(set(words_cleaned))

# Per-word lookup tables: column index, running count and IDF (both start at 0).
word_dict_index = {term: column for column, term in enumerate(unique_words)}
word_count_dict = dict.fromkeys(unique_words, 0)
IDF = dict.fromkeys(unique_words, 0)

# One row per document, one column per vocabulary word.
matrix_shape = (len(reviews_text_cleaned), len(unique_words))
doc_matrix = np.zeros(matrix_shape)
tf_idf = np.zeros(matrix_shape)

## Step 2: Term Frequency

The following code segment determines the term frequency of every word in each text data point and updates the doc-term matrix.

In [103]:
# The token pattern does not change between documents, so define it once.
pattern = r"[^?.,!:; ]+"

# Start from zeros so that re-running this cell does not add the counts on top
# of the ones left by a previous run.
doc_matrix.fill(0)

# doc_matrix[i, j] = number of times vocabulary word j occurs in document i.
for i, text_doc in enumerate(reviews_text_cleaned):
    for word in re.findall(pattern, text_doc.lower()):
        doc_matrix[i, word_dict_index[word]] += 1


## Step 3: Inverse Document Frequency

The following code segment calculates the inverse document frequency of each word in the vocabulary.

In [104]:
pattern = r"[^?.,!:; ]+"
D = len(reviews_text_cleaned)

# Total occurrences of each word across the corpus. Reset first so that
# re-running this cell does not double the counts.
for word in word_count_dict:
    word_count_dict[word] = 0
for word in words_cleaned:
    word_count_dict[word] += 1

# Document frequency: the number of documents that contain each word at least
# once. IDF is defined on this quantity, not on the total occurrence count;
# using the total count makes the logarithm negative for any word that occurs
# more than D times.
doc_count_dict = dict.fromkeys(unique_words, 0)
for text_doc in reviews_text_cleaned:
    for word in set(re.findall(pattern, text_doc.lower())):
        doc_count_dict[word] += 1

# idf(word) = ln(D / df(word)). Every vocabulary word occurs in at least one
# document, so df >= 1 and the division is always defined.
for word, doc_count in doc_count_dict.items():
    IDF[word] = np.log(D / doc_count)

## Step 4: TF-IDF

The following code will create a doc-term matrix with TF-IDF

In [105]:
pattern = r"[^?.,!:; ]+"

# Start from zeros so the cell gives the same result every time it is run.
tf_idf.fill(0)

# tf_idf[i, j] = (occurrences of word j in document i) * IDF[word j].
# Storing IDF[word] alone would drop the term-frequency factor entirely.
for i, text_doc in enumerate(reviews_text_cleaned):
    term_counts = {}
    for word in re.findall(pattern, text_doc.lower()):
        term_counts[word] = term_counts.get(word, 0) + 1
    for word, count in term_counts.items():
        tf_idf[i, word_dict_index[word]] = count * IDF[word]

0.0

In [122]:
class TF_IDF:
    """Term-frequency / inverse-document-frequency tables for a list of texts.

    Parameters
    ----------
    text_data : list of str
        One string per document.

    Attributes
    ----------
    pattern : str
        Regex matching one token: a run of characters that are neither a
        space nor one of ``? . , ! : ;``.
    words_cleaned : list of str
        Every lower-cased token of the corpus, in document order.
    unique_words : list of str
        Sorted vocabulary; a word's position is its matrix column.
    word_dict_index : dict
        Word -> column index.
    word_count_dict : dict
        Word -> total number of occurrences in the corpus.
    doc_count_dict : dict
        Word -> number of documents containing the word (filled by ``idf``).
    IDF : dict
        Word -> ``ln(D / document_frequency)`` (filled by ``idf``).
    doc_matrix : numpy.ndarray
        ``(documents, vocabulary)`` raw term counts (filled by ``tf``).
    tf_idf : numpy.ndarray
        ``(documents, vocabulary)`` TF-IDF weights (filled by
        ``create_tf_idf``).

    Both matrices are dense, so memory grows as documents x vocabulary.
    Call ``prepare()`` to run all computation steps in the right order.
    """

    def __init__(self, text_data):
        self.text_data = text_data
        self.words_cleaned = []
        self.word_dict_index = {}
        self.word_count_dict = {}
        self.doc_count_dict = {}
        self.IDF = {}
        self.pattern = r"[^?.,!:; ]+"
        self.unique_words = self.clean()

        for i, word in enumerate(self.unique_words):
            self.word_dict_index[word] = i
            self.word_count_dict[word] = 0
            self.doc_count_dict[word] = 0
            self.IDF[word] = 0

        self.total_word_count()
        self.doc_matrix = np.zeros((len(self.text_data), len(self.unique_words)))
        self.tf_idf = np.zeros((len(self.text_data), len(self.unique_words)))

    def _tokenize(self, text):
        """Return the lower-cased tokens of ``text`` in order of appearance."""
        return re.findall(self.pattern, text.lower())

    def _term_counts(self, text):
        """Return a dict mapping each token of ``text`` to its occurrence count."""
        counts = {}
        for word in self._tokenize(text):
            counts[word] = counts.get(word, 0) + 1
        return counts

    def clean(self):
        """Tokenise the corpus into ``words_cleaned``; return the sorted vocabulary."""
        # Uses the same tokeniser as tf()/idf()/create_tf_idf(), so every token
        # those methods meet is guaranteed to be a key of the lookup tables.
        self.words_cleaned = [
            word for text_doc in self.text_data for word in self._tokenize(text_doc)
        ]
        return sorted(set(self.words_cleaned))

    def total_word_count(self):
        """Fill ``word_count_dict`` with corpus-wide occurrence counts."""
        # Reset first so that calling this method again does not double the counts.
        for word in self.word_count_dict:
            self.word_count_dict[word] = 0
        for word in self.words_cleaned:
            self.word_count_dict[word] += 1

    def tf(self):
        """Fill ``doc_matrix`` with the raw count of each word in each document."""
        # Counts are assigned, not accumulated, so repeated calls are harmless.
        self.doc_matrix.fill(0)
        for i, text_doc in enumerate(self.text_data):
            for word, count in self._term_counts(text_doc).items():
                self.doc_matrix[i, self.word_dict_index[word]] = count

    def idf(self):
        """Fill ``doc_count_dict`` and ``IDF`` with ``ln(D / document_frequency)``."""
        D = len(self.text_data)

        # Document frequency counts a word once per document containing it.
        # Using the total occurrence count instead would make the logarithm
        # negative for any word that occurs more than D times.
        doc_counts = dict.fromkeys(self.unique_words, 0)
        for text_doc in self.text_data:
            for word in set(self._tokenize(text_doc)):
                doc_counts[word] += 1
        self.doc_count_dict = doc_counts

        # Every vocabulary word comes from some document, so each count is >= 1.
        for word, doc_count in doc_counts.items():
            self.IDF[word] = np.log(D / doc_count)

    def create_tf_idf(self):
        """Fill ``tf_idf`` with term count times IDF; call ``idf()`` first."""
        self.tf_idf.fill(0)
        for i, text_doc in enumerate(self.text_data):
            for word, count in self._term_counts(text_doc).items():
                # Weight = occurrences in this document * corpus-level IDF.
                self.tf_idf[i, self.word_dict_index[word]] = count * self.IDF[word]

    def prepare(self):
        """Run the term-frequency, IDF and TF-IDF steps in dependency order."""
        self.tf()
        self.idf()
        self.create_tf_idf()



In [123]:
# The constructor builds the vocabulary and allocates the matrices;
# prepare() then runs the tf(), idf() and create_tf_idf() steps in order.
TFIDF = TF_IDF(reviews_text_cleaned)
TFIDF.prepare()
