In [1]:
# Import libraries
import pandas as pd
import numpy as np

from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

from utilities import data_basic_utility as databasic
from utilities import dataframe_utility as dfutil
import features_utility as featutil

from contentknn import ContentKNNAlgorithm

## File Details - Light GBM Regression on Beer Context columns inc review counts

This is a Candidate for being used in an Ensemble. 
Characteristicts:
* Light GBM Regression Algorithm
* Using Beer Context columns inc ABV, Year and Review Counts
* Todo: use optimised parameters for Light GBM Regression


In [2]:
filePrefix = "A3_161_contentknn_tinkering2"
baseDataDir = "C:/Development/Data/COSC2670/Assignment3/A3data/"
subrunDir = "subruns/"
writeSubRunFile = True
seed = databasic.get_random_seed()

In [3]:
# RowID  BeerID  ReviewerID  BeerName  BeerType  Label
# df_train = pd.read_csv(baseDataDir + 'train_500k.tsv',sep='\t',
df_train = pd.read_csv(baseDataDir + 'train.tsv',sep='\t',
                         names=['RowID','BeerID','ReviewerID',
                                  'BeerName','BeerType','rating'])
df_train.head(10)

# df_vali = pd.read_csv(baseDataDir + 'vali_500k.tsv',sep='\t',
df_vali = pd.read_csv(baseDataDir + 'val.tsv',sep='\t',
                         names=['RowID','BeerID','ReviewerID',
                                  'BeerName','BeerType','rating'])
df_vali.head(10)


Unnamed: 0,RowID,BeerID,ReviewerID,BeerName,BeerType,rating
0,22,12300,2634,Rauch Ür Bock,Rauchbier,4.0
1,27,12300,5634,Rauch Ür Bock,Rauchbier,4.5
2,28,12300,3544,Rauch Ür Bock,Rauchbier,4.5
3,40,12300,6521,Rauch Ür Bock,Rauchbier,4.0
4,43,12300,10177,Rauch Ür Bock,Rauchbier,4.5
5,48,12300,2907,Rauch Ür Bock,Rauchbier,3.5
6,49,12300,1532,Rauch Ür Bock,Rauchbier,4.0
7,50,12300,3452,Rauch Ür Bock,Rauchbier,3.5
8,59,12300,6861,Rauch Ür Bock,Rauchbier,4.0
9,64,6699,6401,Caldera Pale Ale,American Pale Ale (APA),4.5


Add the Review Count columns for Reviewers and Beers to both the Train and Validation sets

In [4]:
df_train = featutil.addReviewerReviewCount(df_train)
df_train = featutil.addBeerReviewCount(df_train)

df_vali = featutil.addReviewerReviewCount(df_vali)
df_vali = featutil.addBeerReviewCount(df_vali)


In [5]:
# RowID BrewerID ABV DayofWeek Month DayofMonth Year TimeOfDay Gender Birthday Text Lemmatized POS_Tag
# df_features = pd.read_csv(baseDataDir + 'features_500k.tsv',sep='\t', names=['RowID','BrewerID','ABV','DayofWeek','Month',
df_features = pd.read_csv(baseDataDir + 'features.tsv',sep='\t', names=['RowID','BrewerID','ABV','DayofWeek','Month',
                                                                 'DayofMonth','Year','TimeOfDay','Gender',
                                                                 'Birthday','Text','Lemmatized','POS_Tag'])

df_features.head()

Unnamed: 0,RowID,BrewerID,ABV,DayofWeek,Month,DayofMonth,Year,TimeOfDay,Gender,Birthday,Text,Lemmatized,POS_Tag
0,18,1075,7.4,Mon,Jan,2,2012,15:20:04,Male,unknown,Pours a murky light brown with a 1 inch fizzy ...,pour a murky light brown with a 1 inch fizzy t...,VBZ DT JJ NN JJ IN DT CD NN JJ NN NN WDT VBZ I...
1,19,1075,7.4,Sun,Jan,1,2012,06:46:52,Male,unknown,Faint sudsy head with some with some dissipati...,faint sudsy head with some with some dissipate...,NN JJ NN IN DT IN DT VBG JJ NN . JJ JJ NN . DT...
2,20,1075,7.4,Tue,Nov,29,2011,05:51:44,Male,unknown,A new arrival to the West TN area ... Pours qu...,a new arrival to the West TN area ... pour qui...,"DT JJ NN IN DT NNP NNP NN , VBZ PDT DT NN JJR ..."
3,21,1075,7.4,Sat,Nov,5,2011,22:59:57,Male,unknown,Sampled 10/30/11 - Transferring the notes . A ...,sample 10/30/11 - transfer the note . a ruby p...,VBN CD HYPH VBG DT NNS . DT NN VBP IN DT NN NN...
4,22,1075,7.4,Tue,Nov,1,2011,20:40:21,Male,"Oct 14, 1983",This is my first rauchbier . Pours a burnt amb...,this be my first rauchbier . pour a burn amber...,DT VBZ PRP$ JJ NN . VBZ DT VBN NN . JJ NN . NN...


In [6]:
colsToUse = ["RowID", "BrewerID", "ABV", "DayofWeek", "DayofMonth", "Month", "Year", "Gender", "TimeOfDay", "Birthday"]

# Find the feature records that match the training and validation data and join them together
dfFullData = df_train.join(df_features[colsToUse], on="RowID", how="inner", rsuffix="Feat")
dfFullDataVali = df_vali.join(df_features[colsToUse], on="RowID", how="inner", rsuffix="Feat")

dfFullData.head()

Unnamed: 0,RowID,BeerID,ReviewerID,BeerName,BeerType,rating,ReviewerReviewCount,BeerReviewCount,RowIDFeat,BrewerID,ABV,DayofWeek,DayofMonth,Month,Year,Gender,TimeOfDay,Birthday
0,19,12300,10635,Rauch Ür Bock,Rauchbier,4.0,200,23,37,1075,7.4,Mon,23,May,2011,Male,15:36:28,unknown
1,21,12300,6547,Rauch Ür Bock,Rauchbier,4.5,10,23,40,1075,7.4,Mon,16,May,2011,Male,00:31:46,unknown
2,23,12300,9789,Rauch Ür Bock,Rauchbier,4.5,164,23,42,1075,7.4,Sun,10,Apr,2011,unknown,12:17:26,unknown
3,24,12300,7372,Rauch Ür Bock,Rauchbier,5.0,432,23,43,1075,7.4,Wed,30,Mar,2011,Male,14:08:00,unknown
4,25,12300,1302,Rauch Ür Bock,Rauchbier,4.5,500,23,45,1075,7.4,Thu,24,Mar,2011,Male,14:07:00,"Jul 25, 1984"


In [7]:
# Remove the duplicated Row ID, also remove Beer Name at this point, we're nt using it
df_train_data = dfFullData.drop(['RowIDFeat', "BeerName"],axis=1)
df_vali_data = dfFullDataVali.drop(['RowIDFeat', "BeerName"],axis=1)

df_train_data.head()

Unnamed: 0,RowID,BeerID,ReviewerID,BeerType,rating,ReviewerReviewCount,BeerReviewCount,BrewerID,ABV,DayofWeek,DayofMonth,Month,Year,Gender,TimeOfDay,Birthday
0,19,12300,10635,Rauchbier,4.0,200,23,1075,7.4,Mon,23,May,2011,Male,15:36:28,unknown
1,21,12300,6547,Rauchbier,4.5,10,23,1075,7.4,Mon,16,May,2011,Male,00:31:46,unknown
2,23,12300,9789,Rauchbier,4.5,164,23,1075,7.4,Sun,10,Apr,2011,unknown,12:17:26,unknown
3,24,12300,7372,Rauchbier,5.0,432,23,1075,7.4,Wed,30,Mar,2011,Male,14:08:00,unknown
4,25,12300,1302,Rauchbier,4.5,500,23,1075,7.4,Thu,24,Mar,2011,Male,14:07:00,"Jul 25, 1984"


In [8]:
print(df_train_data.shape)
print(df_vali_data.shape)

(746207, 16)
(243834, 16)


In [9]:
colsToUse.append("BeerType")
colsToUse.append("ReviewerReviewCount")
colsToUse.append("BeerReviewCount")

considerCol = {}

# Initialize them all to True, use all the cols
for colName in colsToUse:
  considerCol[colName] = True

# "RowID", "BrewerID", "BeerType", "ABV", "DayofWeek", "DayofMonth", "Month", "Year", "Gender", "TimeOfDay"
# Disable any column we want

# Usually always use Year and REview Counts
# considerCol["Year"] = False
# considerCol["ReviewerReviewCount"] = False
# considerCol["BeerReviewCount"] = False

# Beer Context Columns
# considerCol["ABV"] = False
# considerCol["BrewerID"] = False
# considerCol["BeerType"] = False

# Consumer Context Columns
# considerCol["DayofWeek"] = False
# considerCol["DayofMonth"] = False
# considerCol["Month"] = False
# considerCol["TimeOfDay"] = False
# considerCol["Birthday"] = False
# considerCol["Gender"] = False


In [10]:
if considerCol["ABV"] == False:
  del df_train_data["ABV"]
  del df_vali_data["ABV"]
else:
  df_train_data = featutil.fixNullABV(df_train_data)
  df_vali_data = featutil.fixNullABV(df_vali_data)

In [11]:
if considerCol["BrewerID"] == False:
  del df_train_data["BrewerID"]
  del df_vali_data["BrewerID"]
else: 
  df_train_data, df_vali_data = dfutil.getDummiesForSplitSets(df_train_data, df_vali_data, "BrewerID")


  df_combined.columns = df_combined.columns.str.replace(" ", "").str.replace("/", "").str.replace("-", "") \


In [None]:
if considerCol["BeerType"] == False:
  del df_train_data["BeerType"]
  del df_vali_data["BeerType"]
else: 
  df_train_data, df_vali_data = dfutil.getDummiesForSplitSets(df_train_data, df_vali_data, "BeerType")

In [None]:
if considerCol["Gender"] == False:
  del df_train_data["Gender"]
  del df_vali_data["Gender"]
else: 
  df_train_data, df_vali_data = dfutil.getDummiesForSplitSets(df_train_data, df_vali_data, "Gender")

In [None]:
if considerCol["DayofWeek"] == False:
  del df_train_data["DayofWeek"]
  del df_vali_data["DayofWeek"]
else: 
  df_train_data = featutil.formatDayOfWeek(df_train_data)
  df_vali_data = featutil.formatDayOfWeek(df_vali_data)

In [None]:
if considerCol["Month"] == False:
  del df_train_data["Month"]
  del df_vali_data["Month"]
else: 
  df_train_data = featutil.formatMonth(df_train_data)
  df_vali_data = featutil.formatMonth(df_vali_data)

In [None]:
if considerCol["DayofMonth"] == False:
  del df_train_data["DayofMonth"]
  del df_vali_data["DayofMonth"]

In [None]:
if considerCol["Year"] == False:
  del df_train_data["Year"]
  del df_vali_data["Year"]

In [None]:
if considerCol["TimeOfDay"] == False:
  del df_train_data["TimeOfDay"]
  del df_vali_data["TimeOfDay"]
else: 
  df_train_data = featutil.formatTimeToSec(df_train_data)
  df_vali_data = featutil.formatTimeToSec(df_vali_data)

In [None]:
if considerCol["Birthday"] == False:
  del df_train_data["Birthday"]
  del df_vali_data["Birthday"]
else: 
  df_train_data = featutil.convertBirthdayToAge(df_train_data)
  df_vali_data = featutil.convertBirthdayToAge(df_vali_data)

In [None]:
if considerCol["ReviewerReviewCount"] == False:
  del df_train_data["ReviewerReviewCount"]
  del df_vali_data["ReviewerReviewCount"]

In [None]:
if considerCol["BeerReviewCount"] == False:
  del df_train_data["BeerReviewCount"]
  del df_vali_data["BeerReviewCount"]

In [None]:
print(df_train_data.shape)
print(df_vali_data.shape)

df_train_data.head()

(106546, 403)
(35089, 403)


Unnamed: 0,RowID,BeerID,ReviewerID,rating,ReviewerReviewCount,BeerReviewCount,ABV,DayofWeek,DayofMonth,Month,...,BeerType_SmokedBeer,BeerType_Tripel,BeerType_ViennaLager,BeerType_Weizenbock,BeerType_Wheatwine,BeerType_WinterWarmer,BeerType_Witbier,Gender_Female,Gender_Male,Gender_unknown
0,19,12300,10635,4.0,36,23,7.4,1,23,5,...,0,0,0,0,0,0,0,0,1,0
1,21,12300,6547,4.5,4,23,7.4,1,16,5,...,0,0,0,0,0,0,0,0,1,0
2,23,12300,9789,4.5,36,23,7.4,7,10,4,...,0,0,0,0,0,0,0,0,0,1
3,24,12300,7372,5.0,69,23,7.4,3,30,3,...,0,0,0,0,0,0,0,0,1,0
4,25,12300,1302,4.5,59,23,7.4,4,24,3,...,0,0,0,0,0,0,0,0,1,0


In [None]:
df_vali_data.head()

Unnamed: 0,RowID,BeerID,ReviewerID,rating,ReviewerReviewCount,BeerReviewCount,ABV,DayofWeek,DayofMonth,Month,...,BeerType_SmokedBeer,BeerType_Tripel,BeerType_ViennaLager,BeerType_Weizenbock,BeerType_Wheatwine,BeerType_WinterWarmer,BeerType_Witbier,Gender_Female,Gender_Male,Gender_unknown
0,22,12300,2634,4.0,4,9,7.4,6,14,5,...,0,0,0,0,0,0,0,0,0,1
1,27,12300,5634,4.5,9,9,7.4,4,10,2,...,0,0,0,0,0,0,0,0,0,1
2,28,12300,3544,4.5,40,9,7.4,5,10,12,...,0,0,0,0,0,0,0,0,0,1
3,40,12300,6521,4.0,19,9,5.5,4,27,8,...,0,0,0,0,0,0,0,0,1,0
4,43,12300,10177,4.5,2,9,5.5,1,10,8,...,0,0,0,0,0,0,0,1,0,0


In [None]:
# free up the memory
del dfFullData
del dfFullDataVali

In [None]:
# Get all the columns
col_names = df_train_data.columns

idCols = ['RowID','BeerID','ReviewerID']
target_col = 'rating'

# Create the sub data sets of the features and the target
dfTrainIds = df_train_data[idCols]
dfTrainTarget = df_train_data[target_col]

dfValiIds = df_vali_data[idCols]
dfValiTarget = df_vali_data[target_col]

# This time, we need to keep the Row Id so we can use it for the cosine similarity
feature_cols =  col_names.drop(['BeerID','ReviewerID','rating'])
dfFullFeatures = df_train_data[feature_cols].append(df_vali_data[feature_cols])


In [None]:
dfFullFeatures.head()

Unnamed: 0,RowID,ReviewerReviewCount,BeerReviewCount,ABV,DayofWeek,DayofMonth,Month,Year,TimeOfDay,Birthday,...,BeerType_SmokedBeer,BeerType_Tripel,BeerType_ViennaLager,BeerType_Weizenbock,BeerType_Wheatwine,BeerType_WinterWarmer,BeerType_Witbier,Gender_Female,Gender_Male,Gender_unknown
0,19,36,23,7.4,1,23,5,2011,56188,0,...,0,0,0,0,0,0,0,0,1,0
1,21,4,23,7.4,1,16,5,2011,1906,0,...,0,0,0,0,0,0,0,0,1,0
2,23,36,23,7.4,7,10,4,2011,44246,0,...,0,0,0,0,0,0,0,0,0,1
3,24,69,23,7.4,3,30,3,2011,50880,0,...,0,0,0,0,0,0,0,0,1,0
4,25,59,23,7.4,4,24,3,2011,50820,37,...,0,0,0,0,0,0,0,0,1,0


In [None]:
# Scale the data but make sure not to modify the Row ID
datacols = ["ReviewerReviewCount", "BeerReviewCount", "ABV", "DayofWeek", "DayofMonth", "Month", "Year", "TimeOfDay", "Birthday"]
columnsToIgnore = dfFullFeatures.columns.drop(datacols)

dfFullFeaturesIds = dfFullFeatures[columnsToIgnore]
dfFullFeaturesData = dfFullFeatures[datacols]

print(str(len(datacols)))
print(datacols)
dfFullFeaturesData.head()

scaler = StandardScaler()
dfFullFeaturesData = pd.DataFrame(scaler.fit_transform(dfFullFeaturesData), columns=datacols)
dfFullFeaturesData.head()

# join the ids back to the data
dfFullFeatures = pd.concat([dfFullFeaturesIds.reset_index(), dfFullFeaturesData], axis=1).drop(columns="index")
dfFullFeatures.head(10)

del dfFullFeaturesIds
del dfFullFeaturesData

Unnamed: 0,ReviewerReviewCount,BeerReviewCount,ABV,DayofWeek,DayofMonth,Month,Year,TimeOfDay,Birthday
0,-0.218319,-0.777937,0.024917,-1.444169,0.829428,-0.463495,1.04545,0.942454,-0.744071
1,-0.789355,-0.777937,0.024917,-1.444169,0.034808,-0.463495,1.04545,-2.071688,-0.744071
2,-0.218319,-0.777937,0.024917,1.500208,-0.646295,-0.748211,1.04545,0.279345,-0.744071
3,0.370562,-0.777937,0.024917,-0.46271,1.624048,-1.032928,1.04545,0.647714,-0.744071
4,0.192113,-0.777937,0.024917,0.028019,0.942945,-1.032928,1.04545,0.644383,0.90893


In [None]:
df_vali_data[['BeerID','ReviewerID','rating']].head()

Unnamed: 0,BeerID,ReviewerID,rating
0,12300,2634,4.0
1,12300,5634,4.5
2,12300,3544,4.5
3,12300,6521,4.0
4,12300,10177,4.5


In [None]:

reader = Reader(rating_scale=(0, 5))
dsetTrainFeatures = Dataset.load_from_df(df_train_data[['BeerID','ReviewerID',
                                    'rating']],reader)

dsetValiFeatures = Dataset.load_from_df(df_vali_data[['BeerID','ReviewerID',
                                     'rating']],reader)
trainsetTrainFeatures = dsetTrainFeatures.build_full_trainset()

print(type(dsetTrainFeatures))
print(type(trainsetTrainFeatures))
trainsetTrainFeatures

<class 'surprise.dataset.DatasetAutoFolds'>
<class 'surprise.trainset.Trainset'>


<surprise.trainset.Trainset at 0x13aaa607be0>

In [None]:

print(type(dsetValiFeatures))

<class 'surprise.dataset.DatasetAutoFolds'>


In [None]:

NA,valset = train_test_split(dsetValiFeatures, test_size=1.0)

# simple Tuning best params: {'bsl_options': }

algorithm = ContentKNNAlgorithm()
algorithm.setFeatures(dfFullFeatures)

# algorithm.fit_simulation(trainsetTrainFeatures)

In [None]:


model = algorithm.fit(trainsetTrainFeatures)
predictions = algorithm.test(valset)


# Score our predictions with MAE
# It is around 0.77, which means the a random guess based on the distribution of the data
# is on average within 0.77 (plus or minus) the true label.
# Not bad! You can beat it though, I'm sure :).
# Smaller MAE is the better. Good luck!
mae = accuracy.mae(predictions,verbose=True)

print("Average MAE: " + str(mae))

Computing content-based similarity matrix...
0  of  9026
  Processing thisMovieID: 10635 otherMovieID: 6547
  Processing thisMovieID: 10635 otherMovieID: 9789
  Processing thisMovieID: 10635 otherMovieID: 7372
  Processing thisMovieID: 6547 otherMovieID: 9789
  Processing thisMovieID: 6547 otherMovieID: 7372
  Processing thisMovieID: 6547 otherMovieID: 1302
  Processing thisMovieID: 9789 otherMovieID: 7372
  Processing thisMovieID: 9789 otherMovieID: 1302
  Processing thisMovieID: 9789 otherMovieID: 704
  Processing thisMovieID: 7372 otherMovieID: 1302
  Processing thisMovieID: 7372 otherMovieID: 704
  Processing thisMovieID: 7372 otherMovieID: 1747
  Processing thisMovieID: 1302 otherMovieID: 704
  Processing thisMovieID: 1302 otherMovieID: 1747
  Processing thisMovieID: 1302 otherMovieID: 9368
1000  of  9026
2000  of  9026
3000  of  9026
4000  of  9026
5000  of  9026
6000  of  9026
7000  of  9026
8000  of  9026
9000  of  9026
...done.
MAE:  0.5017
Average MAE: 0.5017023125253641


In [None]:
print(type(valset))
print(valset[0:10])


<class 'list'>
[(2048, 6317, 4.0), (299, 3212, 4.0), (1889, 5644, 3.5), (4180, 3655, 3.5), (1456, 4812, 3.5), (2790, 5424, 4.5), (6208, 4719, 4.5), (9531, 10472, 4.5), (3427, 2076, 4.0), (441, 1383, 3.5)]


In [None]:
print(type(predictions))
print(str(len(predictions)))
print(predictions[0:10])
print(predictions[0])

<class 'list'>
35089
[Prediction(uid=2048, iid=6317, r_ui=4.0, est=3.8781277570251347, details={'was_impossible': True, 'reason': 'No neighbors'}), Prediction(uid=299, iid=3212, r_ui=4.0, est=3.8781277570251347, details={'was_impossible': True, 'reason': 'No neighbors'}), Prediction(uid=1889, iid=5644, r_ui=3.5, est=3.8781277570251347, details={'was_impossible': True, 'reason': 'No neighbors'}), Prediction(uid=4180, iid=3655, r_ui=3.5, est=3.8781277570251347, details={'was_impossible': True, 'reason': 'No neighbors'}), Prediction(uid=1456, iid=4812, r_ui=3.5, est=3.8781277570251347, details={'was_impossible': True, 'reason': 'No neighbors'}), Prediction(uid=2790, iid=5424, r_ui=4.5, est=3.8781277570251347, details={'was_impossible': True, 'reason': 'No neighbors'}), Prediction(uid=6208, iid=4719, r_ui=4.5, est=3.8781277570251347, details={'was_impossible': True, 'reason': 'No neighbors'}), Prediction(uid=9531, iid=10472, r_ui=4.5, est=3.8781277570251347, details={'was_impossible': True

In [None]:
# Convert the Predictions to a dataframe so we can lookup predictions easy
lstUIds = list(map(lambda x: x.uid, predictions))
lstIIds = list(map(lambda x: x.iid, predictions))
lstTrueRatings = list(map(lambda x: x.r_ui, predictions))
lstRatingEst = list(map(lambda x: x.est, predictions))


# uid == BeerId, iid == ReviewerId, r_ui == Original Ration, est = Predicted rating
dfPredictions = pd.DataFrame({ "uid": lstUIds,"iid": lstIIds, "r_ui": lstTrueRatings, "Predict": lstRatingEst })

dfPredictions.head()
# dfPredictions[dfPredictions.uid == 3519]

Unnamed: 0,uid,iid,r_ui,Predict
0,2048,6317,4.0,3.878128
1,299,3212,4.0,3.878128
2,1889,5644,3.5,3.878128
3,4180,3655,3.5,3.878128
4,1456,4812,3.5,3.878128


In [None]:
print(dfValiIds.shape)
print(dfPredictions.shape)

(35089, 3)
(35089, 4)


In [None]:
# join the predictions to the ids, sort by rowid and write to file
dfPredictions = pd.merge(dfValiIds, dfPredictions, how="inner", left_on=["BeerID", "ReviewerID"], right_on=["uid", "iid"])
dfPredictions.head()


Unnamed: 0,RowID,BeerID,ReviewerID,uid,iid,r_ui,Predict
0,22,12300,2634,12300,2634,4.0,3.878128
1,27,12300,5634,12300,5634,4.5,3.878128
2,28,12300,3544,12300,3544,4.5,3.878128
3,40,12300,6521,12300,6521,4.0,3.878128
4,43,12300,10177,12300,10177,4.5,3.878128


Write to a subrun file

In [None]:
if writeSubRunFile:
  dfPredictions.sort_values("RowID")[["RowID", "BeerID", "ReviewerID", "Predict"]].to_csv(subrunDir + filePrefix + "_subrun.csv", index=False)

print("Average MAE: " + str(mae))
print("analyse_maes.append(" + str(mae) + ")")
print(dfPredictions.shape)
dfPredictions.sort_values("RowID").head(8)


Average MAE: 0.5017023125253641
analyse_maes.append(0.5017023125253641)
(35269, 7)


Unnamed: 0,RowID,BeerID,ReviewerID,uid,iid,r_ui,Predict
0,22,12300,2634,12300,2634,4.0,3.878128
1,27,12300,5634,12300,5634,4.5,3.878128
2,28,12300,3544,12300,3544,4.5,3.878128
3,40,12300,6521,12300,6521,4.0,3.878128
4,43,12300,10177,12300,10177,4.5,3.878128
5,48,12300,2907,12300,2907,3.5,3.878128
6,49,12300,1532,12300,1532,4.0,3.878128
7,50,12300,3452,12300,3452,3.5,3.878128


## Summary - 200k of data

First Run of Content KNN with just the numbers col came out to around 0.5 (exact value lost)

Adding Scaling: 0.501

## Full Data
