In [1]:
# Import Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import re

In [2]:
# Function to preprocess data from text file
def preprocess_data_from_file(filepath):
    """Parse a labelled text file into a DataFrame.

    Every non-blank line must have the form ``__label__<int> <text>``: the
    integer after ``__label__`` becomes the label and everything after the
    first space becomes the text.

    Args:
        filepath: Path of the text file to read (decoded as UTF-8).

    Returns:
        pandas.DataFrame with one row per non-blank line and two columns,
        ``label`` (int) and ``text`` (str). An empty or all-blank file
        yields an empty frame with the same two columns.

    Raises:
        ValueError: If a non-blank line has no text after the label, lacks
            the ``__label__`` marker, or carries a non-integer label.
    """
    labels = []
    texts = []
    # Explicit encoding: without it the decoding depends on the platform
    # locale and can fail (or mis-decode) on non-ASCII text.
    with open(filepath, "r", encoding="utf-8") as file:
        # Stream line by line instead of reading the whole file into memory.
        for line_number, raw_line in enumerate(file, start=1):
            line = raw_line.strip()
            if not line:
                # Blank lines (including a trailing one) carry no sample.
                continue
            tag, separator, text = line.partition(" ")
            tag_parts = tag.split("__label__")
            if not separator or len(tag_parts) < 2:
                raise ValueError(
                    f"{filepath}: line {line_number} is not of the form "
                    f"'__label__<int> <text>': {line!r}"
                )
            # int() raises ValueError itself when the label is not numeric.
            labels.append(int(tag_parts[1]))
            texts.append(text)
    return pd.DataFrame({"label": labels, "text": texts})

In [3]:
# Load training data from file
# The path is relative to the notebook's working directory. The parser returns
# a DataFrame with an integer `label` column and a raw `text` column.
train_filepath = "train.3270.txt"
df_train = preprocess_data_from_file(train_filepath)

In [4]:
# Preview the first 10 parsed rows (text is still raw at this point).
df_train.head(10)

Unnamed: 0,label,text
0,2,Stuning even for the non-gamer: This sound tra...
1,2,The best soundtrack ever to anything.: I'm rea...
2,2,Amazing!: This soundtrack is my favorite music...
3,2,Excellent Soundtrack: I truly like this soundt...
4,2,"Remember, Pull Your Jaw Off The Floor After He..."
5,2,an absolute masterpiece: I am quite sure any o...
6,1,"Buyer beware: This is a self-published book, a..."
7,2,Glorious story: I loved Whisper of the wicked ...
8,2,A FIVE STAR BOOK: I just finished reading Whis...
9,2,Whispers of the Wicked Saints: This was a easy...


In [5]:
# Text normalisation helper
def preprocess_text(text):
    """Return a lower-cased copy of ``text`` with non-word characters blanked.

    Each character that is not a letter, digit or underscore is replaced by a
    space, after which every run of whitespace is collapsed to one space.
    Leading and trailing spaces are not stripped.
    """
    lowered = text.lower()
    # First pass: punctuation and other non-word characters become spaces.
    blanked = re.sub(r"\W", " ", lowered)
    # Second pass: squeeze the resulting runs of whitespace.
    return re.sub(r"\s+", " ", blanked)

In [6]:
# Overwrite the `text` column with its cleaned form (lower-cased, non-word
# characters replaced by spaces). The raw text is not kept after this cell.
df_train["text"] = df_train["text"].apply(preprocess_text)

In [7]:
# Preview the same 10 rows again to check the effect of the cleaning step.
df_train.head(10)

Unnamed: 0,label,text
0,2,stuning even for the non gamer this sound trac...
1,2,the best soundtrack ever to anything i m readi...
2,2,amazing this soundtrack is my favorite music o...
3,2,excellent soundtrack i truly like this soundtr...
4,2,remember pull your jaw off the floor after hea...
5,2,an absolute masterpiece i am quite sure any of...
6,1,buyer beware this is a self published book and...
7,2,glorious story i loved whisper of the wicked s...
8,2,a five star book i just finished reading whisp...
9,2,whispers of the wicked saints this was a easy ...


In [8]:
# Split training data into training and validation sets
# 20% of the rows are held out for validation; the fixed random_state keeps
# the split identical across re-runs.
X_train, X_val, y_train, y_val = train_test_split(
    df_train["text"], df_train["label"], test_size=0.2, random_state=42
)

In [9]:
# Feature engineering
# The vectorizer is fitted on the training split only and then merely applied
# to the validation split, so validation text does not influence the
# vocabulary or weights. English stop words are dropped.
vectorizer = TfidfVectorizer(stop_words="english")
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)

In [10]:
# Show the first text converted to numerical form
# The text is wrapped in a list because transform() expects an iterable of
# documents. Printing the sparse result lists each non-zero entry as
# (row, column index) followed by its TF-IDF weight.
first_text = X_train.iloc[0]
first_text_tfidf = vectorizer.transform([first_text])
print(f"First text: {first_text}")
print(f"First text TF-IDF features: {first_text_tfidf}")

First text: great for basic midi understanding and setup i am a student at musicians institute in la i wanted to learn more about midi and bought this book it has old pictures and couple sections are out of date but if your are able to tell the difference between todays computers used in music you ll be fine great explanation very easy discription how midi messages are sent how to use them and how to put together a midi system all in all a great book helped me a lot
First text TF-IDF features:   (0, 14256)	0.10186907481988854
  (0, 13954)	0.08799881891314211
  (0, 13952)	0.08178245560792077
  (0, 13750)	0.1286936060995336
  (0, 13323)	0.16294260633169919
  (0, 13037)	0.10186907481988854
  (0, 12617)	0.1382866533085504
  (0, 11635)	0.16294260633169919
  (0, 11584)	0.12273047894358569
  (0, 11519)	0.1432946228855991
  (0, 9662)	0.11802375973680114
  (0, 9072)	0.0853024422756955
  (0, 8687)	0.1398350476018426
  (0, 8682)	0.08488293818111739
  (0, 8377)	0.6718023036349915
  (0, 8338)	0.158

In [11]:
# Get the feature names (terms) from the TF-IDF vectorizer; position i in this
# array is the term behind column i of the TF-IDF matrices.
feature_names = vectorizer.get_feature_names_out()

# Look up the term behind the first stored (non-zero) column of the first
# text's vector. The index is taken from the vector itself rather than
# hard-coded, because column numbers depend on the fitted vocabulary: a fixed
# number such as 14256 would point at a different term, or be out of range,
# as soon as the training data changes.
example_index = first_text_tfidf.indices[0]
term = feature_names[example_index]
print(f"Term corresponding to index {example_index}: {term}")

Term corresponding to index 14256: wanted


In [12]:
# Display the terms with their corresponding TF-IDF scores for the first text
# For this one-row sparse matrix, `indices` gives the column index of each
# stored entry and `data` the matching weight, so zipping them pairs every
# term with its score.
print("Terms and TF-IDF scores for the first text:")
for index, score in zip(first_text_tfidf.indices, first_text_tfidf.data):
    term = feature_names[index]
    print(f"({index}, {term})\t{score}")

Terms and TF-IDF scores for the first text:
(14256, wanted)	0.10186907481988854
(13954, used)	0.08799881891314211
(13952, use)	0.08178245560792077
(13750, understanding)	0.1286936060995336
(13323, todays)	0.16294260633169919
(13037, tell)	0.10186907481988854
(12617, student)	0.1382866533085504
(11635, setup)	0.16294260633169919
(11584, sent)	0.12273047894358569
(11519, sections)	0.1432946228855991
(9662, pictures)	0.11802375973680114
(9072, old)	0.0853024422756955
(8687, musicians)	0.1398350476018426
(8682, music)	0.08488293818111739
(8377, midi)	0.6718023036349915
(8338, messages)	0.1588507972505638
(7894, lot)	0.09034572706949075
(7808, ll)	0.09143188171228957
(7591, learn)	0.1131374317345689
(7443, la)	0.1382866533085504
(6928, institute)	0.16795057590874787
(6275, helped)	0.14738643196673448
(5941, great)	0.17497286268116974
(5221, fine)	0.10436529123994356
(4901, explanation)	0.1588507972505638
(4358, easy)	0.09548662585418019
(3939, discription)	0.1835067502737126
(3822, differen

In [13]:
# Model selection and training
# Logistic regression with scikit-learn's default hyperparameters, fitted on
# the TF-IDF features of the training split.
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

In [14]:
# Model evaluation on validation set
# Predict labels for the held-out validation split, then report overall
# accuracy and per-class precision / recall / F1.
y_val_pred = model.predict(X_val_tfidf)
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print("Validation Classification Report:\n", classification_report(y_val, y_val_pred))

Validation Accuracy: 0.8180428134556575
Validation Classification Report:
               precision    recall  f1-score   support

           1       0.80      0.87      0.83       342
           2       0.84      0.76      0.80       312

    accuracy                           0.82       654
   macro avg       0.82      0.82      0.82       654
weighted avg       0.82      0.82      0.82       654



In [15]:
# Read the separate test file and clean its text the same way as the training text
test_filepath = "test.135.txt"
df_test = preprocess_data_from_file(test_filepath)
df_test["text"] = df_test["text"].apply(preprocess_text)

# Vectorise with the already-fitted vectorizer (transform only, no refit)
X_test_tfidf = vectorizer.transform(df_test["text"])

# Compare the model's predictions with the true test labels
y_test_actual = df_test["label"].to_numpy()
y_test_pred = model.predict(X_test_tfidf)
print("Test Accuracy:", accuracy_score(y_test_actual, y_test_pred))
print("Test Classification Report:\n", classification_report(y_test_actual, y_test_pred))

Test Accuracy: 0.7407407407407407
Test Classification Report:
               precision    recall  f1-score   support

           1       0.68      0.78      0.73        60
           2       0.80      0.71      0.75        75

    accuracy                           0.74       135
   macro avg       0.74      0.74      0.74       135
weighted avg       0.75      0.74      0.74       135



In [16]:
# Sentiment prediction for a single free-form text
def predict_custom_text(text):
    """Predict the label of one raw text string.

    The text goes through the same cleaning function and the same fitted
    vectorizer as the training data before being passed to the trained model.

    Args:
        text: Raw, uncleaned input string.

    Returns:
        The model's predicted label for ``text`` (first element of the
        one-item prediction array).
    """
    cleaned = preprocess_text(text)
    # The vectorizer expects an iterable of documents, hence the one-item list.
    features = vectorizer.transform([cleaned])
    return model.predict(features)[0]


# Example custom input text
# Quick check of the helper on a clearly favourable sentence. In the training
# preview the favourable reviews carry label 2, so 2 is the expected result
# here (label meaning inferred from those rows; confirm against the dataset).
custom_text = "I absolutely love this product! It's fantastic and works great."
predicted_label = predict_custom_text(custom_text)
print(f"Custom Text Prediction: {predicted_label}")

Custom Text Prediction: 2
