In [1]:
import pyspark
import random

# random.seed() returns None, so the seed value is stored in its own variable
# first and then used to seed the RNG. `seed` is later passed to
# Train_Final_Model as `random_seed`, which would otherwise receive None.
seed = 100
random.seed(seed)

import data_preparation
import Cross_Validate

In [2]:
# Get (or create) the Spark session used by every later cell; the app name is 'Ops'.
spark = (
    pyspark.sql.SparkSession.builder
    .appName('Ops')
    .getOrCreate()
)

# NOTE(review): the session returned by newSession() is not assigned, so
# `spark` still refers to the session created above.
spark.newSession()

In [5]:
# Data-preparation helper bound to the Spark session.
data_prep = data_preparation.Prep_Data(spark, partition_value=10)

# Local CSV inputs (interactions, users, books) handed to Get_Data_Local below.
interactions_file, users_file, books_file = (
    'downsampled_files/sampled_df_practice.csv',
    'downsampled_files/users_practice.csv',
    'downsampled_files/books_practice.csv',
)

In [6]:
# Load the three inputs from the local CSV paths defined above.
interactions, users, books = data_prep.Get_Data_Local(
    interactions_file,
    users_file,
    books_file,
)

# Trim the interactions before sampling.
# NOTE(review): presumably min_allowed=10 drops low-count users/books and
# cut_not_read=True drops unread entries -- confirm in data_preparation.py.
interactions_trimmed = data_prep.Trim_LowNum(
    interactions,
    min_allowed=10,
    cut_not_read=True,
)

In [7]:
# Downsample the trimmed interactions.
# NOTE(review): 0.5 is presumably the fraction kept -- confirm in data_preparation.py.
interactions_sampled = data_prep.DownSample(
    interactions_trimmed,
    0.5,
)

In [8]:
# Split the sampled interactions into the three frames used for modeling.
# Create_TestSet returns them in this order: train/validation, test, and the
# test_to_val frame that is later passed to cross-validation and final training.
train_val, test, test_to_val = data_prep.Create_TestSet(
    interactions_sampled,
    percent_train=0.6,
)

In [None]:
# Persist the three splits as parquet so the modeling section can reload them;
# existing output at each path is overwritten.
train_val.write.parquet('split_files/train_val_split.parquet', mode='overwrite')
test_to_val.write.parquet('split_files/test_to_val.parquet', mode='overwrite')
test.write.parquet('split_files/test_split.parquet', mode='overwrite')

# Now to work on modeling

## What I did above:
 - Read in the goodreads genre file
 - Downsampled by user
 - Randomly split into train/val/test by user
 - Pulled half of the books per user from val/test back into the training set

In [3]:
# Reload the three splits from the parquet files written in the preparation section.
train_val, test_to_val, test = (
    spark.read.parquet('split_files/train_val_split.parquet'),
    spark.read.parquet('split_files/test_to_val.parquet'),
    spark.read.parquet('split_files/test_split.parquet'),
)

In [4]:
# Build the hyperparameter grid using Create_Grid's defaults (no arguments passed).
# NOTE(review): the recorded CV table below has a single row
# (rank=10, regParam=0.1, maxIter=10), which suggests the default grid is a
# single point -- confirm in Cross_Validate.py.
hyperparam_grid = Cross_Validate.Create_Grid()

In [5]:
# Run cross-validation over the grid; the result is kept in `out` and
# displayed in the next cell.
out = Cross_Validate.Cross_Validation(
    spark,
    train_val,
    test_to_val,
    hyperparam_grid,
)

0,
0,
0,
0,
0,


In [6]:
# Display the cross-validation results: one row per hyperparameter
# combination, with columns CV_1..CV_5 alongside rank/regParam/maxIter.
out

Unnamed: 0,rank,regParam,maxIter,CV_1,CV_2,CV_3,CV_4,CV_5
0,10.0,0.1,10.0,1.43792,1.397787,1.440197,1.461973,1.428468


# Now to train and evaluate the final model and save the output

In [3]:
# Load the saved splits again for the final-model section (same parquet files
# as in the cross-validation section).
train_val, test_to_val, test = (
    spark.read.parquet('split_files/train_val_split.parquet'),
    spark.read.parquet('split_files/test_to_val.parquet'),
    spark.read.parquet('split_files/test_split.parquet'),
)

In [4]:
# Single-point grid holding the hyperparameters chosen for the final model.
hyperparam_grid = Cross_Validate.Create_Grid(
    rank=[20],
    regParam=[0.1],
    maxIter=[15],
)

In [5]:
# Output path handed to Train_Final_Model.
model_out = 'practice_model.json'

# Train the final model with the grid above and report the chosen metric
# (the recorded output prints the final rmse). `seed` comes from the setup
# cell at the top of the notebook.
Cross_Validate.Train_Final_Model(
    spark,
    train_val,
    test_to_val,
    test,
    hyperparam_grid,
    model_out,
    evaluation_metric='rmse',
    partition_value=10,
    random_seed=seed,
)



Final Model evaluation for rmse : 1.3872734361611831


In [6]:
# List the working directory to check that the model output (practice_model.json) was written.
!ls

Cross_Validate.py      derby.log              shell_setup.sh
Local_Test.ipynb       downsampled_files      spark-warehouse
README.md              local_pyspark_setup.sh split_files
__pycache__            metastore_db
data_preparation.py    practice_model.json
