In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, f1_score

In [2]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Load the dataset
df = pd.read_csv('musical.tsv', sep='\t')

In [4]:
df

Unnamed: 0,Review,Score
0,This the second set of strap locks that I've o...,1
1,First of all I want to say I love a tube amp d...,1
2,"i only bought with the idea that a ""FULL"" vers...",0
3,"If you're like me, you probably bought this to...",1
4,"Didn't know what to expect for under $10, but ...",1
...,...,...
995,It really pains me to give anything but a 5-st...,1
996,"It's a decent unit, but stopped working comple...",0
997,I bought this cable in order to be able to run...,1
998,"Well made. Works as it should. However, seem t...",1


In [5]:
from nltk.corpus import stopwords
import string
# convert the text to lowercase
df['Review'] = df['Review'].apply(lambda x: x.lower())

# remove punctuation
punct = string.punctuation
df['Review'] = df['Review'].apply(lambda x: ''.join([char for char in x if char not in punct]))

# remove numbers
df['Review'] = df['Review'].apply(lambda x: ''.join([char for char in x if not char.isdigit()]))

# remove stopwords
stopwords = set(stopwords.words('english'))
df['Review'] = df['Review'].apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords]))


In [6]:
# tokenize the reviews
df['tokens'] = df['Review'].apply(lambda x: word_tokenize(x))


In [7]:
# perform stemming
stemmer = PorterStemmer()
df['stemmed_tokens'] = df['tokens'].apply(lambda x: [stemmer.stem(word) for word in x])


In [8]:
df

Unnamed: 0,Review,Score,tokens,stemmed_tokens
0,second set strap locks ive owned little diffic...,1,"[second, set, strap, locks, ive, owned, little...","[second, set, strap, lock, ive, own, littl, di..."
1,first want say love tube amp distortion overdr...,1,"[first, want, say, love, tube, amp, distortion...","[first, want, say, love, tube, amp, distort, o..."
2,bought idea full version behringers sequence p...,0,"[bought, idea, full, version, behringers, sequ...","[bought, idea, full, version, behring, sequenc..."
3,youre like probably bought hook xlr microphone...,1,"[youre, like, probably, bought, hook, xlr, mic...","[your, like, probabl, bought, hook, xlr, micro..."
4,didnt know expect proved worth gamblethis cabl...,1,"[didnt, know, expect, proved, worth, gamblethi...","[didnt, know, expect, prove, worth, gamblethi,..."
...,...,...,...,...
995,really pains give anything star review boss pr...,1,"[really, pains, give, anything, star, review, ...","[realli, pain, give, anyth, star, review, boss..."
996,decent unit stopped working completely months ...,0,"[decent, unit, stopped, working, completely, m...","[decent, unit, stop, work, complet, month, tri..."
997,bought cable order able run longer cable runs ...,1,"[bought, cable, order, able, run, longer, cabl...","[bought, cabl, order, abl, run, longer, cabl, ..."
998,well made works however seem getting little bi...,1,"[well, made, works, however, seem, getting, li...","[well, made, work, howev, seem, get, littl, bi..."


In [9]:
# perform lemmatization
lemmatizer = WordNetLemmatizer()
df['lemmatized_tokens'] = df['stemmed_tokens'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])


In [10]:
df.head()

Unnamed: 0,Review,Score,tokens,stemmed_tokens,lemmatized_tokens
0,second set strap locks ive owned little diffic...,1,"[second, set, strap, locks, ive, owned, little...","[second, set, strap, lock, ive, own, littl, di...","[second, set, strap, lock, ive, own, littl, di..."
1,first want say love tube amp distortion overdr...,1,"[first, want, say, love, tube, amp, distortion...","[first, want, say, love, tube, amp, distort, o...","[first, want, say, love, tube, amp, distort, o..."
2,bought idea full version behringers sequence p...,0,"[bought, idea, full, version, behringers, sequ...","[bought, idea, full, version, behring, sequenc...","[bought, idea, full, version, behring, sequenc..."
3,youre like probably bought hook xlr microphone...,1,"[youre, like, probably, bought, hook, xlr, mic...","[your, like, probabl, bought, hook, xlr, micro...","[your, like, probabl, bought, hook, xlr, micro..."
4,didnt know expect proved worth gamblethis cabl...,1,"[didnt, know, expect, proved, worth, gamblethi...","[didnt, know, expect, prove, worth, gamblethi,...","[didnt, know, expect, prove, worth, gamblethi,..."


In [11]:
# convert the tokens to string format for model training
df['lemmatized_tokens_str'] = df['lemmatized_tokens'].apply(lambda x: ' '.join(x))


In [12]:
df.head()

Unnamed: 0,Review,Score,tokens,stemmed_tokens,lemmatized_tokens,lemmatized_tokens_str
0,second set strap locks ive owned little diffic...,1,"[second, set, strap, locks, ive, owned, little...","[second, set, strap, lock, ive, own, littl, di...","[second, set, strap, lock, ive, own, littl, di...",second set strap lock ive own littl difficult ...
1,first want say love tube amp distortion overdr...,1,"[first, want, say, love, tube, amp, distortion...","[first, want, say, love, tube, amp, distort, o...","[first, want, say, love, tube, amp, distort, o...",first want say love tube amp distort overdrive...
2,bought idea full version behringers sequence p...,0,"[bought, idea, full, version, behringers, sequ...","[bought, idea, full, version, behring, sequenc...","[bought, idea, full, version, behring, sequenc...",bought idea full version behring sequenc progr...
3,youre like probably bought hook xlr microphone...,1,"[youre, like, probably, bought, hook, xlr, mic...","[your, like, probabl, bought, hook, xlr, micro...","[your, like, probabl, bought, hook, xlr, micro...",your like probabl bought hook xlr microphon di...
4,didnt know expect proved worth gamblethis cabl...,1,"[didnt, know, expect, proved, worth, gamblethi...","[didnt, know, expect, prove, worth, gamblethi,...","[didnt, know, expect, prove, worth, gamblethi,...",didnt know expect prove worth gamblethi cabl a...


In [17]:
from sklearn.metrics import classification_report,confusion_matrix

In [19]:
from sklearn.tree import DecisionTreeClassifier
# create the feature vector using CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer


# convert the tokens to string format for model training
df['lemmatized_tokens_str'] = df['lemmatized_tokens'].apply(lambda x: ' '.join(x))

# split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['lemmatized_tokens_str'], df['Score'], test_size=0.2, random_state=42)

# create the feature vector using CountVectorizer
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# create the Decision Tree model
dt = DecisionTreeClassifier(random_state=42)

# train the model
dt.fit(X_train, y_train)

# make predictions on the testing set
y_pred = dt.predict(X_test)

print("Confusion Matrix")
print(confusion_matrix(y_test, y_pred))
# generate classification report
print(classification_report(y_test, y_pred))


Confusion Matrix
[[57 29]
 [37 77]]
              precision    recall  f1-score   support

           0       0.61      0.66      0.63        86
           1       0.73      0.68      0.70       114

    accuracy                           0.67       200
   macro avg       0.67      0.67      0.67       200
weighted avg       0.67      0.67      0.67       200



In [20]:
from sklearn.naive_bayes import MultinomialNB

# convert the tokens to string format for model training
df['lemmatized_tokens_str'] = df['lemmatized_tokens'].apply(lambda x: ' '.join(x))

# split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['lemmatized_tokens_str'], df['Score'], test_size=0.2, random_state=42)

# create the feature vector using CountVectorizer
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# create the Naive Bayes model
nb = MultinomialNB()

# train the model
nb.fit(X_train, y_train)

# make predictions on the testing set
y_pred = nb.predict(X_test)
print("Confusion Matrix")
print(confusion_matrix(y_test, y_pred))
# generate classification report
print(classification_report(y_test, y_pred))


Confusion Matrix
[[70 16]
 [23 91]]
              precision    recall  f1-score   support

           0       0.75      0.81      0.78        86
           1       0.85      0.80      0.82       114

    accuracy                           0.81       200
   macro avg       0.80      0.81      0.80       200
weighted avg       0.81      0.81      0.81       200

