In [1]:
import polars as pl
import numpy as np
import sqlite3
import json
from datetime import datetime
import matplotlib.pyplot as plt
import string
# Single seed shared by every source of randomness in this notebook.
manual_seed = 23

# Seed both polars and the legacy global NumPy generator so that the random
# draws and samples taken further down are repeatable.
pl.set_random_seed(manual_seed)
np.random.seed(manual_seed)

# Preprocessing

If necessary, convert the affected SQLite columns to VARCHAR by running:
```
ALTER TABLE product ADD COLUMN new_price VARCHAR;
UPDATE product SET new_price = price;
ALTER TABLE product DROP COLUMN price;
ALTER TABLE product RENAME COLUMN new_price TO price;
```
```
ALTER TABLE review ADD COLUMN new_found_funny VARCHAR;
UPDATE review SET new_found_funny = found_funny;
ALTER TABLE review DROP COLUMN found_funny;
ALTER TABLE review RENAME COLUMN new_found_funny TO found_funny;
```

## Selecting Features

## Reading

In [2]:
# Location of the SQLite database that holds the `review` and `product` tables.
# Alternative database for experiments: '../dbs/db_micro2.sqlite3'
db_path = '../dbs/db2.sqlite3'
connection_string = 'sqlite://' + db_path

# `found_awarding` must be selected too: the casting and normalisation cells
# further down read it alongside `found_funny` and `found_helpful`.
# NOTE(review): assumes `found_awarding` is a column of `review` - confirm.
df = pl.read_database_uri(
    '''SELECT product_id, text AS review_text, recommended,
              found_helpful, found_funny, found_awarding
    FROM review LEFT JOIN product ON product_id = product.id''',
    connection_string
)

### Formatting data

#### Convert to integers

In [3]:
# Cast the features to the smallest viable integer types.
# Vote counts: values that cannot be parsed become null (strict=False) and are
# then replaced by zero.
vote_columns = ["found_funny", "found_awarding", "found_helpful"]
vote_casts = [
    pl.col(name).cast(pl.UInt16, strict=False).fill_null(strategy="zero")
    for name in vote_columns
]
recommended_cast = pl.col("recommended").cast(pl.Int8)

df = df.with_columns(*vote_casts, recommended_cast)

#### Choosing and calculating regression metric

We are trying to predict how funny, helpful, and awarding a review is. We have insight into this because people on Steam vote on reviews in each category. The simplest metric would be the raw number of people who found a review funny. The problem is that some reviews have more views than others, so it is only logical that reviews viewed less often have fewer votes. Unfortunately, we don't have information about the number of views of a review, so we have to find another way to account for this.

We could do this by normalizing each metric per product. We would then have values between 0 and 1, where 0 indicates that the review received no votes and 1 indicates that it is the most upvoted review of its product. This metric assumes that all reviews of a specific product had an equal opportunity to be upvoted. In reality that opportunity probably depends on numerous other factors, such as which reviews are highlighted by Steam or how many people were active when the review was written, but the metric still gives us some insight into how well a review was written for a specific product. The downside is that products with few views are treated the same as products with many views. This is problematic because it is much harder to write the best (or close to the best) review when there are more reviews. Values for less popular products would therefore be much higher than for others.

So far we have proposed two metrics: one that evaluates reviews overall, ignoring that some may be viewed less than others, and a second that acknowledges this fact but skews the values of less viewed reviews upwards. Is it possible to get a better metric that is a compromise between the two? To do this I propose a metric that is calculated from how upvoted a review is relative to the other reviews of the same product, and relative to all other reviews. It should weight both values equally, so it is calculated as their sum divided by two: $m = \frac{1}{2}\left(\frac{v}{\max_{\text{all}} v} + \frac{v}{\max_{\text{product}} v}\right)$, where $v$ is the number of votes of the review.

In [4]:
def clip_column(df: pl.DataFrame, column_name: str, quantile: float=0.999, new_column_name: str=None) -> "tuple[pl.Expr, float]":
    """
    Build an expression that caps the values of a column at one of its quantiles.

    Every value of `column_name` that is larger than the `quantile`-quantile of that column is replaced
    by the quantile value itself; all other values are kept. This is used to clip big outliers.
    The frame is not modified: the caller has to apply the returned expression, e.g. with `df.with_columns(...)`.

    Args:
        df: Frame from which the cutoff value is computed.
        column_name: Name of the column to clip.
        quantile: Quantile (between 0 and 1) used as the upper cutoff.
        new_column_name: Name of the resulting column. Defaults to `column_name`.

    Returns:
        A tuple `(expression, cutoff_value)` with the clipping expression and the scalar cutoff it uses.
    """
    if new_column_name is None:
        new_column_name = column_name
    # Take the quantile of the Series so that the cutoff is a plain scalar (not a
    # one-cell DataFrame) and honour the `quantile` argument instead of a fixed 0.999.
    cutoff_value = df.get_column(column_name).quantile(quantile)
    clipped = (
        pl.when(pl.col(column_name) > cutoff_value)
        .then(pl.lit(cutoff_value))
        .otherwise(pl.col(column_name))
        .alias(new_column_name)
    )
    return clipped, cutoff_value

In [5]:
# One clipping expression (plus its cutoff) per vote count. The clipped values
# are written to separate `*_cutoff` columns instead of replacing the originals.
found_funny_expr, found_funny_cutoff_value = clip_column(
    df, column_name='found_funny', new_column_name='found_funny_cutoff'
)
found_awarding_expr, found_awarding_cutoff_value = clip_column(
    df, column_name='found_awarding', new_column_name='found_awarding_cutoff'
)
found_helpful_expr, found_helpful_cutoff_value = clip_column(
    df, column_name='found_helpful', new_column_name='found_helpful_cutoff'
)

In [6]:
def _normalized_metric(clipped_column: str, output_column: str) -> pl.Expr:
    """
    Average of a column's share of its overall maximum and its share of the
    maximum within the same product.

    NaN results (e.g. 0/0 when a maximum is zero) are replaced by 0.0.
    """
    clipped = pl.col(clipped_column)
    overall_share = clipped / clipped.max()
    per_product_share = (clipped / clipped.max()).over("product_id")
    blended = (overall_share + per_product_share) / 2
    return blended.fill_nan(0.0).alias(output_column)


# Materialise the clipped vote counts as temporary `*_cutoff` columns.
df = df.with_columns(found_funny_expr, found_awarding_expr, found_helpful_expr)

# Replace the raw vote counts by the normalized metrics.
df = df.with_columns(
    _normalized_metric("found_funny_cutoff", "found_funny"),
    _normalized_metric("found_helpful_cutoff", "found_helpful"),
    _normalized_metric("found_awarding_cutoff", "found_awarding"),
)

# The temporary columns are no longer needed.
df = df.drop(["found_funny_cutoff", "found_helpful_cutoff", "found_awarding_cutoff"])

# Investigating review_text

In [14]:
# Show the ASCII alphabet (lowercase followed by uppercase) and a random sample
# of ten reviews to get a feeling for the raw text.
print(list(string.ascii_letters))
# `sample(10)` already limits the output to ten rows, so no manual counter is needed.
for (review_text,) in df.select('review_text').sample(10).iter_rows():
    print(f'"{review_text}"')

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
"Mr.Drippy got the drip sheeeeeeeeeesh"
"useful"
"A fun yet short semi-rogue like game with a pixel art style. Your goal is to traverse the world and defeat 5 bosses, all the while inventing new tools and upgrading yourself to become stronger and learning more about the world. I was able to 100% the game in 7 hours, though there is an extra difficulty you can unlock. Controller support is available, and I found the controls very smooth and nice when playing. There are a few things the game doesn't teach you in the tutorial, like that you have a map you can open by hitting select, but otherwise I found it enjoyable while playing, even if it was a bit on the easier side. You can definitely make significant progress in the game in 2

In [8]:
# Keep only the reviews whose text contains at least one ASCII letter (a-z or A-Z).
ascii_letters = list(string.ascii_lowercase) + list(string.ascii_uppercase)
has_letter = pl.col('review_text').str.contains_any(ascii_letters)
df = df.filter(has_letter)

In [9]:
# Split into train / test / dev at product level: every product gets exactly
# one label, so all of its reviews end up in the same partition.
df_split = df.select("product_id").unique("product_id").sort("product_id")

# One uniform random draw in [0, 1) per product.
random_draws = np.random.rand(df_split.height)
df_split = df_split.with_columns(pl.lit(random_draws).alias("split"))

# Turn the draw into a label: below 0.8 -> train, below 0.9 -> test, rest -> dev.
split_label = (
    pl.when(pl.col("split") < 0.8).then(pl.lit("train"))
    .when(pl.col("split") < 0.9).then(pl.lit("test"))
    .otherwise(pl.lit("dev"))
)
df_split = df_split.with_columns(split_label.alias("split"))

# Attach the label to every review and cut the frame into one frame per label;
# the resulting dict is keyed by 1-tuples such as ("train",).
df_dict = (
    df.join(df_split, on="product_id", how="left")
    .partition_by("split", as_dict=True, include_key=False)
)
# To inspect the product-level balance: df_split["split"].value_counts()

In [10]:
# Number of reviews in each partition, in the order train, test, dev.
for split_name in ("train", "test", "dev"):
    print(df_dict[(split_name,)].height)

33904649
4614750
4643683


In [11]:
def write_parquet(df_dict, filename):
    """
    Write the train, test and dev partitions to parquet files.

    Args:
        df_dict: Mapping with the keys ("train",), ("test",) and ("dev",) whose
            values provide a `write_parquet(path)` method.
        filename: Path prefix of the output files; the files are named
            `<filename>_train.parquet`, `<filename>_test.parquet` and
            `<filename>_dev.parquet`.

    Raises:
        KeyError: If one of the three partitions is missing from `df_dict`.
    """
    from pathlib import Path

    for split_name in ("train", "test", "dev"):
        target = filename + '_' + split_name + '.parquet'
        # Create the output directory first so that writing does not fail on a
        # fresh checkout where e.g. `data/` does not exist yet.
        Path(target).parent.mkdir(parents=True, exist_ok=True)
        df_dict[(split_name,)].write_parquet(target)

# Persist the complete partitions under the "data/complete" prefix.
write_parquet(df_dict=df_dict, filename="data/complete")


In [12]:
# Down-sampled variant of the dataset: up to 500k training reviews and up to
# 50k reviews each for test and dev.
subset_sizes = {"train": 500000, "test": 50000, "dev": 50000}

df_dict_500k = {}
for split_name, n_rows in subset_sizes.items():
    partition = df_dict[(split_name,)]
    # Sampling without replacement raises when more rows are requested than the
    # partition holds (e.g. with a small database), so cap the sample size.
    df_dict_500k[(split_name,)] = partition.sample(
        min(n_rows, partition.height), seed=manual_seed, shuffle=True
    )

write_parquet(df_dict_500k, "data/500k_50k")