In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pandas as pd

In [3]:
def preprocess_text(text):
    """Normalize a piece of text by converting it to lowercase.

    Args:
        text: The string to normalize.

    Returns:
        A lowercase copy of ``text``.
    """
    return text.lower()

In [4]:
def generate_ngrams(text, n):
    """Split ``text`` on whitespace and return its word n-grams.

    Args:
        text: The string to tokenize.
        n: Number of consecutive words in each n-gram.

    Returns:
        A list of strings, each made of ``n`` consecutive words joined by a
        single space, in order of appearance. The list is empty when ``n`` is
        not positive or exceeds the number of words.
    """
    tokens = text.split()
    # Number of full windows of width n; none exist for non-positive n.
    window_count = len(tokens) - n + 1 if n > 0 else 0
    return [" ".join(tokens[start:start + n]) for start in range(window_count)]

In [5]:
def vectorize_text(text, n, unique_ngrams):
    """Encode ``text`` as a binary presence vector over a fixed vocabulary.

    Args:
        text: The string to encode.
        n: N-gram size passed to ``generate_ngrams``.
        unique_ngrams: Sequence of vocabulary n-grams; position ``i`` of the
            result corresponds to ``unique_ngrams[i]``.

    Returns:
        A 1-D float NumPy array of length ``len(unique_ngrams)`` holding 1.0
        where the vocabulary n-gram occurs in ``text`` and 0.0 elsewhere.
    """
    # A set gives constant-time membership tests; scanning the n-gram list for
    # every vocabulary entry made this quadratic in practice.
    present_ngrams = set(generate_ngrams(text, n))
    vectorized_text = np.zeros(len(unique_ngrams))
    for i, ngram in enumerate(unique_ngrams):
        if ngram in present_ngrams:
            vectorized_text[i] = 1.0
    return vectorized_text

In [6]:
import pandas as pd
import numpy as np
from sklearn.utils import resample


# Column positions in the reviews CSV used as model input and target.
TEXT_COLUMN_INDEX = 6
LABEL_COLUMN_INDEX = 5
# Labels 4 and 5 are down-sampled to MAJORITY_SAMPLES rows each; every other
# label is down-sampled to MINORITY_SAMPLES rows.
MAJORITY_LABELS = (4, 5)
MAJORITY_SAMPLES = 500
MINORITY_SAMPLES = 217
SEED = 42

df = pd.read_csv("Musical_instruments_reviews.csv")

x = df.iloc[:, TEXT_COLUMN_INDEX].values
y = df.iloc[:, LABEL_COLUMN_INDEX].values

data = pd.DataFrame({'x': x, 'y': y})

# Collect the per-label samples and concatenate once, instead of growing a
# DataFrame inside the loop starting from an empty frame.
resampled_subsets = []
for value in np.unique(y):
    subset = data[data['y'] == value]
    n_samples = MAJORITY_SAMPLES if value in MAJORITY_LABELS else MINORITY_SAMPLES
    # Sampling without replacement: each label needs at least n_samples rows.
    resampled_subsets.append(
        resample(subset, replace=False, n_samples=n_samples, random_state=SEED)
    )
balanced_data = pd.concat(resampled_subsets)

x = balanced_data['x'].values
y = balanced_data['y'].values
# NOTE(review): the split is not stratified although the classes are unequal
# in size; consider stratify=y if per-class proportions should match.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=SEED)

In [7]:
# Build the unigram vocabulary from the training texts only.
all_ngrams = [generate_ngrams(preprocess_text(text), 1) for text in X_train]
# Sort the vocabulary so the feature columns have the same order on every run;
# iterating a set of strings has no stable order across interpreter sessions.
unique_ngrams = sorted({item for sublist in all_ngrams for item in sublist})
X_train_vectorized = np.array([vectorize_text(preprocess_text(text), 1, unique_ngrams) for text in X_train])

In [8]:
# Train a logistic-regression classifier (library defaults) on the vectorized
# training texts.
model = LogisticRegression()
model.fit(X=X_train_vectorized, y=y_train)

In [9]:
# Encode the held-out texts against the vocabulary built from the training split.
X_test_vectorized = np.array(
    [vectorize_text(preprocess_text(review), 1, unique_ngrams) for review in X_test]
)

In [10]:
# Predict labels for the held-out split and report the fraction that match.
y_pred = model.predict(X=X_test_vectorized)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.43202416918429004


In [11]:
import numpy as np
# Tally how many samples in y carry each label value from 1 through 5, then
# print the five counts one per line in ascending label order.
count1, count2, count3, count4, count5 = (
    np.count_nonzero(y == label) for label in (1, 2, 3, 4, 5)
)
print(count1, count2, count3, count4, count5, sep="\n")

217
217
217
500
500


In [12]:
# Predict the label of one hand-written review with the trained model.
text = "5 stars, but I still prefer my Fender tuner"
# The model expects a 2-D batch, so add a leading axis to the single vector.
v = vectorize_text(preprocess_text(text), 1, unique_ngrams)[np.newaxis, :]
y_pred = model.predict(v)
y_pred

array([4.])

In [13]:
# Predict the label of one hand-written review with the trained model.
text = "Useless for drop tuning!"
# The model expects a 2-D batch, so add a leading axis to the single vector.
v = vectorize_text(preprocess_text(text), 1, unique_ngrams)[np.newaxis, :]
y_pred = model.predict(v)
y_pred

array([1.])

In [14]:
# Predict the label of one hand-written review with the trained model.
text = "Not bad. Not good. Not better"
# The model expects a 2-D batch, so add a leading axis to the single vector.
v = vectorize_text(preprocess_text(text), 1, unique_ngrams)[np.newaxis, :]
y_pred = model.predict(v)
y_pred

array([3.])