## Linear Regression Analysis: Predicting Ratings and Evaluating Accuracy


In [1]:
import pymongo
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

In [2]:
# Load the features from 'features.csv'. Rows without a rating cannot serve as
# a regression target and rows without a feature vector cannot be parsed, so
# both kinds are dropped before any further processing.
features = pd.read_csv('features.csv').dropna(subset=['rating', 'feature_vector'])

# The parsing below treats every 'feature_vector' cell as a comma-separated
# list of numbers wrapped in square brackets.
# NOTE(review): confirm this matches how features.csv is written.
# Brackets are removed, the text is split on commas into one column per
# component, and the result is converted to float. Vectorized string methods
# are used instead of building a pd.Series for every row.
X = (
    features['feature_vector']
    .str.replace(r'[\[\]]', '', regex=True)
    .str.split(',', expand=True)
    .astype(float)
)

# Vectors of unequal length are padded with NaN by the expansion above; fail
# here with a clear message rather than later inside model fitting.
assert not X.isna().any().any(), "feature vectors have inconsistent lengths"

# Regression target; shares its index with X because both come from `features`.
Y = features['rating']

In [3]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Initialize and train the linear regression model on the training split
model = LinearRegression()
model.fit(X_train, y_train)

# Predict (continuous) ratings for the held-out test rows
y_pred = model.predict(X_test)

# Round the predictions to the nearest integer so they can be compared with
# the true ratings as discrete classes
y_pred_classes = np.round(y_pred)

# Custom accuracy: the fraction of test rows whose rounded prediction lies
# within +/-1 of the true rating. The mean of the boolean mask is that
# fraction, computed without a Python-level loop over the Series.
accuracy = (np.abs(y_test - y_pred_classes) <= 1).mean()
print("Custom Accuracy:", accuracy*100 , "%")

# Calculate the Root Mean Squared Error (RMSE)
# NOTE(review): this is measured on the rounded predictions, so it includes the
# rounding error; use y_pred instead if the raw regression error is wanted.
rmse = np.sqrt(mean_squared_error(y_test, y_pred_classes))
print("RMSE:", rmse)

Custom Accuracy: 49.79329894877751 %
RMSE: 2.272781998632427
