In [2]:
import torch
import torch.nn as nn
from transformers import RobertaModel, RobertaTokenizer
import numpy as np
import joblib
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'

In [3]:
class LinearRegressionClosedForm:

    def __init__(self):
        self.weights = None

    def fit(self, X, y):

        weights=[]
        i=np.eye(len(X))
        # X = poly.fit_transform(X)
        weights = np.dot(X.T, X)
        i=np.eye(len(weights))
        weights=weights+30*i
        weights = np.dot(np.linalg.inv(weights), X.T)
        self.weights = np.dot(weights, y)
        return self.weights

    def predict(self, X):
        if self.weights is None:
            raise ValueError("Model has not been fitted yet.")
        # X = poly.transform(X)
        predictions = np.dot(X, self.weights)
        return predictions


In [4]:
class RobertaExtractor:

    def __init__(self, model_name='roberta-base',batch_size=8, device=None):
        self.tokenizer = RobertaTokenizer.from_pretrained(model_name)
        self.model = RobertaModel.from_pretrained(model_name)
        self.batch_size = batch_size

    def extract_features(self, texts):
        # Process in batches to save memory
        embeddings = []

        for i in range(0, len(texts), self.batch_size):
            batch_texts = texts[i:i + self.batch_size]
            print(batch_texts)
            inputs = self.tokenizer(batch_texts, padding=True, truncation=True, return_tensors='pt')

            with torch.no_grad():
                outputs = self.model(**inputs)

            # Average pooling instead of using only CLS token
            pooled_embeddings = outputs.last_hidden_state.mean(dim=1)
            embeddings.append(pooled_embeddings.cpu().numpy())
            print(f"\n{i+1} batch over")
        # Concatenate all batch embeddings
        return np.concatenate(embeddings, axis=0)

In [6]:
from google.colab import files
uploaded = files.upload()

Saving train_model_sample.csv to train_model_sample.csv


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv('train_model_sample.csv')
print(df.columns)
train, test = train_test_split(df, test_size=0.2, random_state=42)

Index(['review', ' rating'], dtype='object')


In [8]:

feature_extractor = RobertaExtractor()
# if isinstance(train, pd.Series):
#     X_train = train.tolist()
# elif isinstance(train, pd.DataFrame):
#     X_train = train.iloc[:, 0].tolist()
X_train = train["review"].astype(str).tolist()
y_train = train[" rating"].values.reshape(-1,1)
X_test = test["review"].astype(str).tolist()
y_test = test[" rating"].values.reshape(-1,1)
X_train = feature_extractor.extract_features(X_train)
X_test =  feature_extractor.extract_features(X_test)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


["This is a very nice product for the cost.  I wish it kept the water hot once it turns off, because I tend to forget.  But it heats up very  nicely, fairly quickly, about 10 minutes, and shuts itself off.  That's a key feature over potting a kettle on the stove - did I mention I tend to forget?  When the water cools down too much, I turn it on again, and reheat it. This pot has convinced me that when it dies I'll be ready to pay more to get all the features I need.  You can only get a good couple of cups of tea out of this at a time.", "James lee Burke is one of thosed underrated masters of prose,forever delegated to second rung because of his genre. Heavens prisoners, the second in this series,is,in many ways, the best. Dave Robicheux, the alcoholic new Orleans cop,is out fishing when a single engine plane crashes into the lake,and everything changes.Mr. Burke's descriptions of alcoholic despair and rage are perhaps the finest,and least sentimental I have read. The violence is brutal

In [9]:
regressor = LinearRegressionClosedForm()
regressor.fit(X_train, y_train)

predictions = regressor.predict(X_test)


print("Predictions:", predictions)

Predictions: [[4.82655357]
 [4.82379514]
 [4.39969962]
 [4.74734132]]


In [10]:
feature_extractor.model.save_pretrained("./roberta_extractor_model")
feature_extractor.tokenizer.save_pretrained("./roberta_extractor_model")

joblib.dump(regressor, 'linear_regression_model.pkl')

['linear_regression_model.pkl']

In [11]:
y_test = test[" rating"].values.reshape(-1,1)
# y_test_list =  []

# for i in y_test[0]:
#   if i=="Nan":
#     y_test_list.append(0)
#   else:
#     y_test_list.append(i)
# print(y_test_list)
# y_test = np.array(y_test_list)
# y_test = y_test.reshape(-1,1)
  # y_test_flat = [int(value[0]) if value[0] else 0 for value in y_test]
# print(y_test_flat)
y_test = test[" rating"].fillna(0).values.reshape(-1,1)
y_test_flat = [int(value[0]) if not np.isnan(value[0]) else 0 for value in y_test]
# print(y_test)
y_hat_flat = [value[0] for value in predictions]
rounded_y_hat = [round(value-0.1*(np.sin((4)*value))) for value in y_hat_flat]
from sklearn.metrics import mean_squared_error, accuracy_score
# y_test_flat = [value[0] for value in y_test]
# print(y_test)
# y_test = [0 for value in y_test if value[0]=="NaN"]
print(rounded_y_hat)
mse = mean_squared_error(y_test_flat, rounded_y_hat)
print(mse)
accuracy = accuracy_score(y_test_flat, rounded_y_hat)
print(accuracy)
# print(mse)

[5, 5, 4, 5]
0.0
1.0


In [12]:
diff = 0
error = 0
for i in range(len(y_test_flat)):
   diff = y_test_flat[i] - rounded_y_hat[i]
   error += diff
# error = error/len(y_test_flat)
print(error)

0
