In [59]:
!pip install pandas 
!pip install scikit-learn==1.6.1
!pip install torch



In [60]:
import os 
import pandas as pd
import torch
import torch.nn as nn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [61]:
# Get the current directory
current_directory = os.getcwd()

print(f"current directory: {current_directory}")

current directory: /Users/hayleyluo/Desktop/Github/ML-kaggle-on-the-snow-ski-area-reviews/kaggle-ski-area-review


In [62]:
# import the dataset
file = f"{current_directory}/datasets/bronze/on_the_snow_ski_area_reviews.csv"
df = pd.read_csv(file, header=0)
df

Unnamed: 0.1,Unnamed: 0,State,Ski Area,Reviewer Name,Review Star Rating (out of 5),Review Text,Review Missing,Review Date datetime,Review Text Cleaned
0,0.0,california,squaw-valley-usa,philip sayles,3.0,I'm glad our family experienced Squaw but I wo...,1.0,2019-12-31,im glad family experienced squaw wont return n...
1,1.0,california,squaw-valley-usa,mateonelson,4.0,"I went skiing today 5/22, granite chief was am...",1.0,2019-05-22,went skiing today granite chief amazing new sn...
2,2.0,california,squaw-valley-usa,anonymous_user,2.0,We had a horrible experience on our family ski...,1.0,2019-03-07,horrible experience family ski vacation plenty...
3,3.0,california,squaw-valley-usa,Ivan Cazares,3.0,This is the first year I ski Squaw. I've been ...,1.0,2019-03-05,first year ski squaw ive couple sunny powder d...
4,4.0,california,squaw-valley-usa,welzbob,3.0,"Both Squaw and Alpine have incredible terrain,...",1.0,2019-02-26,squaw alpine incredible terrain impressive sno...
...,...,...,...,...,...,...,...,...,...
18381,18257.0,new-hampshire,whaleback-mountain,Matthijs,1.0,Looks like they only made snow on one trail. Y...,1.0,2017-03-18,look like made snow one trail need spend make ...
18382,18258.0,new-hampshire,whaleback-mountain,Dolly,3.0,Looks cute from highway........ but not enough...,1.0,2017-02-25,look cute highway enough trail open dirty lodg...
18383,18259.0,new-hampshire,whaleback-mountain,Roger,3.0,Place has incredible potential. Could be best ...,1.0,2017-02-11,place incredible potential could best place ar...
18384,18260.0,new-hampshire,whaleback-mountain,Arthur,2.0,Great exposure from interstate......... but lo...,1.0,2017-02-10,great exposure interstate look tired need face...


In [63]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18386 entries, 0 to 18385
Data columns (total 9 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Unnamed: 0                     18262 non-null  float64
 1   State                          18386 non-null  object 
 2   Ski Area                       18386 non-null  object 
 3   Reviewer Name                  18384 non-null  object 
 4   Review Star Rating (out of 5)  18262 non-null  float64
 5   Review Text                    18250 non-null  object 
 6   Review Missing                 18138 non-null  float64
 7   Review Date datetime           18138 non-null  object 
 8   Review Text Cleaned            18074 non-null  object 
dtypes: float64(3), object(6)
memory usage: 1.3+ MB


In [64]:
# Preprocess labels

# TF-IDF vectorization
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X = vectorizer.fit_transform(df["Review Text"].fillna("").tolist()).toarray()
df["Review Star Rating"] = df["Review Star Rating (out of 5)"]-1
y = df["Review Star Rating"].fillna(3).values

In [65]:
print("Vocabulary size:", len(vectorizer.vocabulary_))
print("TF-IDF matrix shape:", X.shape)
print("vectorizer.get_feature_names_out():", vectorizer.get_feature_names_out())


Vocabulary size: 5000
TF-IDF matrix shape: (18386, 5000)
vectorizer.get_feature_names_out(): ['00' '000' '00am' ... 'zones' 'zoo' 'zooming']


In [66]:
# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)
X_val = torch.tensor(X_val, dtype=torch.float32)
y_val = torch.tensor(y_val, dtype=torch.long)

# Define MLP model
class SentimentClassifier(nn.Module):
    def __init__(self, input_dim):
        super(SentimentClassifier, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Linear(64, 5)  # 5 classes: positive & negative
        )

    def forward(self, x):
        return self.net(x)

# Initialize model
model = SentimentClassifier(input_dim=X_train.shape[1])
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)


In [67]:
# Training loop
epochs = 10
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train)
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()

    # Validation
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val)
        val_preds = torch.argmax(val_outputs, dim=1)
        accuracy = (val_preds == y_val).float().mean()
    print(f"Epoch {epoch+1}/{epochs} - Loss: {loss.item():.4f} - Val Accuracy: {accuracy:.4f}")

Epoch 1/10 - Loss: 1.5903 - Val Accuracy: 0.4704
Epoch 2/10 - Loss: 1.5860 - Val Accuracy: 0.4704
Epoch 3/10 - Loss: 1.5814 - Val Accuracy: 0.4704
Epoch 4/10 - Loss: 1.5764 - Val Accuracy: 0.4704
Epoch 5/10 - Loss: 1.5709 - Val Accuracy: 0.4704
Epoch 6/10 - Loss: 1.5648 - Val Accuracy: 0.4704
Epoch 7/10 - Loss: 1.5581 - Val Accuracy: 0.4704
Epoch 8/10 - Loss: 1.5504 - Val Accuracy: 0.4704
Epoch 9/10 - Loss: 1.5419 - Val Accuracy: 0.4704
Epoch 10/10 - Loss: 1.5321 - Val Accuracy: 0.4704


In [None]:
# Analysis:
