# Content Based Filtering

## Download the new test data that includes the ground-truth scores and the ratings matrix obtained from matrix factorization

In [26]:
import pandas as pd

train = pd.read_csv('test2_new.txt', sep = '|', header=None)
train.columns = ['userId', 'trackId', 'predictor']
train = train.set_index(['userId', 'trackId'])
train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,predictor
userId,trackId,Unnamed: 2_level_1
200031,30877,1
200031,8244,1
200031,130183,0
200031,198762,0
200031,34503,1


In [33]:
test = pd.read_csv('ratings.csv')
test = test.set_index(['userId', 'trackId'])
test.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,track_rating,album_rating,artist_rating,num_genres
userId,trackId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
200072,29894,75,73,76,16
200124,162126,9,10,11,18
200174,137908,25,40,35,16
200400,263168,71,100,67,20
200427,82634,10,85,116,18


In [41]:
test['predictor'] = train['predictor']
test = test.dropna()
test.head()

test.to_csv('ground_truth.csv')

## Spark Session

In [37]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

spark = SparkSession\
        .builder\
        .master('local[*]')\
        .appName('ContentFiltering')\
        .config('spark.driver.maxResultSize', '10g')\
        .config('spark.executor.memory' ,'10g')\
        .config('spark.driver.memory', '10g')\
        .getOrCreate()

from pyspark import SparkContext
sc = SparkContext.getOrCreate()

In [38]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

In [42]:
train = spark.read.csv('ground_truth.csv', header=True)

In [43]:
train.printSchema()

root
 |-- userId: string (nullable = true)
 |-- trackId: string (nullable = true)
 |-- track_rating: string (nullable = true)
 |-- album_rating: string (nullable = true)
 |-- artist_rating: string (nullable = true)
 |-- num_genres: string (nullable = true)
 |-- predictor: string (nullable = true)



In [45]:
from pyspark.sql.types import IntegerType

train = train.withColumn('userId', train['userId'].cast(IntegerType()))
train = train.withColumn('trackId', train['trackId'].cast(IntegerType()))
train = train.withColumn('track_rating', train['track_rating'].cast(IntegerType()))
train = train.withColumn('album_rating', train['album_rating'].cast(IntegerType()))
train = train.withColumn('artist_rating', train['artist_rating'].cast(IntegerType()))
train = train.withColumn('num_genres', train['num_genres'].cast(IntegerType()))
train = train.withColumn('predictor', train['predictor'].cast(IntegerType()))

train.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- trackId: integer (nullable = true)
 |-- track_rating: integer (nullable = true)
 |-- album_rating: integer (nullable = true)
 |-- artist_rating: integer (nullable = true)
 |-- num_genres: integer (nullable = true)
 |-- predictor: integer (nullable = true)



In [48]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=['track_rating','album_rating', 'artist_rating', 'num_genres'],
                           outputCol= 'features')

In [49]:
from pyspark.ml import Pipeline

pipeline=Pipeline(stages=[assembler])
model=pipeline.fit(train)
df=model.transform(train)
selectedCols = ['userId', 'trackId', 'track_rating', 'album_rating', 'artist_rating', 'num_genres', 'predictor']
train = train.select(selectedCols)

df.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- trackId: integer (nullable = true)
 |-- track_rating: integer (nullable = true)
 |-- album_rating: integer (nullable = true)
 |-- artist_rating: integer (nullable = true)
 |-- num_genres: integer (nullable = true)
 |-- predictor: integer (nullable = true)
 |-- features: vector (nullable = true)



In [50]:
train_1, test_1 = train.randomSplit([0.8, 0.2], seed=2018)
print('Training Dataset Count: '+str(train_1.count()))
print('Test Dataset Count: '+str(test_1.count()))

Training Dataset Count: 4849
Test Dataset Count: 1151


In [51]:
from pyspark.ml.classification import LinearSVC
lsvc = LinearSVC(labelCol = 'predictor',maxIter=10, regParam=0.1)
lsvcModel = lsvc.fit(df)
predictions = lsvcModel.transform(df)
predictions.show(5)

+------+-------+------------+------------+-------------+----------+---------+--------------------+--------------------+----------+
|userId|trackId|track_rating|album_rating|artist_rating|num_genres|predictor|            features|       rawPrediction|prediction|
+------+-------+------------+------------+-------------+----------+---------+--------------------+--------------------+----------+
|200124| 162126|           9|          10|           11|        18|        0|[9.0,10.0,11.0,18.0]|[0.91329781098420...|       0.0|
|200400| 263168|          71|         100|           67|        20|        0|[71.0,100.0,67.0,...|[-0.4889375612679...|       1.0|
|200497|  41473|          87|          78|            0|        20|        1|[87.0,78.0,0.0,20.0]|[0.50229833326494...|       0.0|
|200561|  22468|          54|          54|           59|        19|        1|[54.0,54.0,59.0,1...|[-0.0899038323214...|       1.0|
|200795| 133020|         135|         104|          108|        19|        0|[135.0

In [52]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(labelCol='predictor')
print("Test Area Under ROC: " + str(evaluator.evaluate(predictions)))

Test Area Under ROC: 0.5894368333333334
