In [None]:
# importing required packages and libs
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
import findspark
findspark.init()
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from pyspark.sql import SQLContext, Row
from pyspark import SparkContext, SparkConf
# Reuse the already-running SparkContext when this cell is executed again:
# constructing a second SparkContext in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once".
# On a fresh kernel this creates the same local context named 'finalproj'.
sc = SparkContext.getOrCreate(SparkConf().setMaster('local').setAppName('finalproj'))
sqlContext = SQLContext(sc)
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col, explode, collect_list
from pyspark.sql.types import IntegerType
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Predict the 'Diabetic' label from two symptom columns with Gaussian Naive Bayes.
data = pd.read_csv(r'C:\Users\joshm\Downloads\Data.csv')

# Despite their names, `trainbayes` holds the feature columns and `testbayes`
# the target column; the actual train/test split happens just below.
feature_columns = ['Blurred Vision', 'Casual Glucose Tolerance']
trainbayes = data[feature_columns]
testbayes = data['Diabetic']

# Hold out 30% of the rows for evaluation; fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    trainbayes,
    testbayes,
    test_size=0.3,
    random_state=0,
)

# fit() returns the estimator itself, so construction and fitting can be chained.
gnb = GaussianNB().fit(X_train, y_train)
y_pred = gnb.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
# Author's recorded result: 86.83 (presumably a percentage; the line below prints a 0-1 fraction).
print('Model accuracy score: {0:0.4f}'.format(accuracy))

In [None]:
# Build the input DataFrame for collaborative filtering with ALS: keep three
# fields per CSV line (positions 0, 17 and 20) and cast them to integers.
dataset = sc.textFile(r'C:\Users\joshm\Downloads\Data.csv')

# Split every line once and then project the needed fields (the line used to
# be split three separate times per record).
# NOTE(review): a plain split(',') assumes the file has no quoted commas - confirm.
data = dataset.map(lambda l: l.split(',')).map(lambda f: (f[0], f[17], f[20]))

# The first projected tuple is treated as the header; every tuple equal to it is dropped.
header = data.first()
rows = data.filter(lambda line: line != header)

# Name the three fields and hand the rows to Spark SQL (all values are still strings here).
row_rdd = rows.map(lambda w: Row(User=w[0], Glucoselevel=w[1], Reco=w[2]))
alsdf = sqlContext.createDataFrame(row_rdd)

# Cast all three columns from string to integer, as required for the ALS
# user/item id columns used later.
alsdf = (
    alsdf.withColumn("User", alsdf["User"].cast(IntegerType()))
    .withColumn("Glucoselevel", alsdf["Glucoselevel"].cast(IntegerType()))
    .withColumn("Reco", alsdf["Reco"].cast(IntegerType()))
)

In [None]:
# Fit an explicit-feedback ALS model on an 80/20 split and report the hold-out RMSE.
# ALS here is pyspark.ml.recommendation.ALS: that import comes after the
# pyspark.mllib one in the import cell and therefore shadows it.
train, test = alsdf.randomSplit([0.8, 0.2], seed=43)

als = ALS(
    userCol="User",
    itemCol="Reco",
    ratingCol="Glucoselevel",
    rank=10,
    maxIter=15,
    regParam=0.1,
    nonnegative=True,
    implicitPrefs=False,
    # Drop predictions that come out as NaN (users/items unseen during
    # training) so the RMSE below is computed on valid rows only.
    coldStartStrategy="drop",
)

# RMSE is measured against the same column the model was trained to predict.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="Glucoselevel",
    predictionCol="prediction",
)

modelactual = als.fit(train)
test_predictions = modelactual.transform(test)
RMSEactual = evaluator.evaluate(test_predictions)
print(RMSEactual)  # author's recorded result: 89.55

In [None]:
# Produce the top-10 "Reco" items for every user from the fitted ALS model and
# export one row per user to CSV.
recommended = modelactual.recommendForAllUsers(10)

# Turn the per-user array of recommendations into one row per recommendation,
# then promote the fields of each recommendation struct to top-level columns.
recommendsplit = (
    recommended
    .withColumn("recommendations", explode("recommendations"))
    .select("*", col("recommendations.*"))
)
df = recommendsplit.select("User", "Reco")

# Gather the recommended items back into a single list per user.
# NOTE(review): collect_list gives no ordering guarantee after the groupBy -
# confirm whether the original ranking order matters for the export.
dffinal = df.groupBy("User").agg(collect_list(col("Reco")))

# Collect to the driver as a pandas frame and write it out without the index column.
recommendations_pdf = dffinal.toPandas()
recommendations_pdf.to_csv(r'C:\Users\joshm\Downloads\recomm.csv', index=False)