In [3]:
import os
import string
import pickle
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()

    # Tokenization
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove punctuations
    tokens = [token for token in tokens if token not in string.punctuation]

    # Remove blank space tokens
    tokens = [token for token in tokens if token.isalnum()]

    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    
    # Perform lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in stemmed_tokens]
    
    return lemmatized_tokens



In [4]:
import pandas as pd
import numpy as np

df=pd.read_csv('A2_Data.csv')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Product_id   1000 non-null   int64 
 1   Image        1000 non-null   object
 2   Review Text  999 non-null    object
dtypes: int64(1), object(2)
memory usage: 23.6+ KB


In [5]:

preprocessed=[]
df['Review Text']=df['Review Text'].astype(str)

print(preprocess_text(df['Review Text'][0]))
for i in range(len(df)):
    preprocessed.append(preprocess_text(df['Review Text'][i]))

['love', 'vintag', 'spring', 'vintag', 'strat', 'good', 'tension', 'great', 'stabil', 'float', 'bridg', 'want', 'spring', 'way', 'go']


In [6]:
DocumentFreq = {}

for i in preprocessed:
    tokens = i
    for w in tokens:
        try:
            DocumentFreq[w]=DocumentFreq[w]+1
        except:
            DocumentFreq[w] = 1

vocab=[x for x in DocumentFreq]
print(vocab[:20])


['love', 'vintag', 'spring', 'strat', 'good', 'tension', 'great', 'stabil', 'float', 'bridg', 'want', 'way', 'go', 'work', 'guitar', 'bench', 'mat', 'rug', 'enough', 'abus']


In [7]:
print(len(vocab))

4442


In [8]:
tf_idf={}
for i in range(len(preprocessed)):
    tokens = preprocessed[i]
    word_count=len(tokens)
    for w in tokens:
        term_freq = tokens.count(w)/word_count
        idf=np.log(len(preprocessed)+1/DocumentFreq[w]+1)
        tf_idf[(i,w)]=term_freq*idf


In [9]:
import pickle
with open('tf_idf.pkl', 'wb') as f:
    pickle.dump(tf_idf, f)

In [10]:
with open('vocab.pkl', 'wb') as f:
    pickle.dump(vocab, f)

In [11]:
with open('DocumentFreq.pkl', 'wb') as f:
    pickle.dump(DocumentFreq, f)

In [12]:
print(df['Review Text'][758])

I have been using Fender locking tuners for about five years on various strats and teles. Definitely helps with tuning stability and way faster to restring if there is a break.


In [13]:
print(len(tf_idf))

31972
