In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv("final_games_extended.csv")

features = [
    "price",
    "ratings_count",
    "has_great_soundtrack",
    "has_story_rich",
    "has_atmospheric",
    "is_free"
]

target = "metacritic_score"

ml_df = df[features + [target]].dropna()

ml_df.head()

Unnamed: 0,price,ratings_count,has_great_soundtrack,has_story_rich,has_atmospheric,is_free,metacritic_score
0,299.0,7088,1,1,1,False,93.0
1,6247.0,5399,1,1,1,False,93.0
5,8760.0,5080,0,1,1,False,93.0
6,799.0,3395,1,1,1,False,84.0
7,2799.0,2929,1,1,1,False,86.0


In [2]:
from sklearn.model_selection import train_test_split

X = ml_df[features]
y = ml_df[target]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [3]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

In [4]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(X_train_scaled, y_train)

y_pred = lin_reg.predict(X_test_scaled)

In [5]:
from sklearn.metrics import r2_score, mean_squared_error

r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("R² Score:", r2)
print("RMSE:", rmse)

R² Score: 0.14803546948958546
RMSE: 6.209281432925716


In [6]:
coef_df = pd.DataFrame({
    "Feature": features,
    "Coefficient": lin_reg.coef_
}).sort_values(by="Coefficient", ascending=False)

coef_df

Unnamed: 0,Feature,Coefficient
1,ratings_count,1.904985
3,has_story_rich,1.235952
0,price,0.117489
5,is_free,0.0
2,has_great_soundtrack,-0.07156
4,has_atmospheric,-1.850345


In [7]:
ml_df["high_score"] = (ml_df["metacritic_score"] >= 75).astype(int)

X = ml_df[features]
y = ml_df["high_score"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

In [8]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()
log_reg.fit(X_train_scaled, y_train)

y_pred = log_reg.predict(X_test_scaled)

In [10]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
    classification_report
)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.8846153846153846
Precision: 0.8846153846153846
Recall: 1.0
F1 Score: 0.9387755102040817

Confusion Matrix:
[[ 0  3]
 [ 0 23]]

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.88      1.00      0.94        23

    accuracy                           0.88        26
   macro avg       0.44      0.50      0.47        26
weighted avg       0.78      0.88      0.83        26



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
