In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as sts
%matplotlib inline

In [4]:
df_ll = pd.read_csv('processed_data/df_ll.csv')

  mask |= (ar1 == a)


In [8]:
df = df_ll[['LOAN_ID',"lender_v"]]

In [10]:
from scipy.sparse import csr_matrix

In [12]:
df_sparse = csr_matrix( (np.ones(df.values.shape[0]), (df.values[:,0], df.values[:,1])), shape=(1500000,1500000))

In [13]:
from scipy.sparse.linalg import svds

In [14]:
u,sigma,vt = svds(df_sparse,k=25)

In [15]:
sigma = np.diag(sigma)

In [16]:
temp = np.dot(u,sigma)

In [None]:
pred = np.dot(temp,vt)

In [1]:
import pyspark as ps

spark = (ps.sql.SparkSession.builder 
        .master("local[4]") 
        .appName("kiva") 
        .getOrCreate()
        )

sc = spark.sparkContext

In [2]:
spark

In [3]:
df_ll = spark.read.csv('processed_data/df_ll.csv',header=True)

In [14]:
df_ll = df_ll.select('LOAN_ID','lender_v')

In [19]:
from pyspark.sql.functions import lit

df_with_one = df_ll.withColumn("value", lit(1))
df_with_one.show(4)

+-------+--------+-----+
|LOAN_ID|lender_v|value|
+-------+--------+-----+
| 483693|  970524|    1|
| 483693| 1153379|    1|
| 483693|  187221|    1|
| 483693|  758772|    1|
+-------+--------+-----+
only showing top 4 rows



In [20]:
splits = df_with_one.randomSplit([1.0,9.0],17)
# 10% samples

In [8]:
splits[0].count()

2830799

In [9]:
splits[1].count()

25463132

In [21]:
t = splits[0].randomSplit([1.0,2.0],17) #train, test split

In [10]:
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

In [22]:
t[1].show(3) #train

+-------+--------+-----+
|LOAN_ID|lender_v|value|
+-------+--------+-----+
|1000000|  255491|    1|
|1000006| 1159371|    1|
|1000017|  775140|    1|
+-------+--------+-----+
only showing top 3 rows



In [23]:
rank = 10
numIterations = 5
model = ALS.trainImplicit(t[1], rank, numIterations, alpha=0.01)

In [32]:
model.productFeatures().first()

(0,
 array('d', [-1.8726382944311837e-10, -1.730004611788516e-10, -2.6146798615123146e-10, -6.371863592313787e-10, 4.50704445986716e-10, 2.093182432938434e-10, 7.73351174765402e-11, -3.7226768889730977e-10, -6.953218001370942e-10, 1.1660955534509299e-09]))

In [33]:
model.userFeatures().first()

(84,
 array('d', [2.671254151209723e-06, -1.2119568459922903e-08, -8.463875929010101e-07, -1.4354569088936842e-07, -1.5237415027513634e-06, -1.1088316114182817e-06, 8.937672646425199e-07, -7.351070507866098e-08, 1.0461875490364037e-06, 2.9223979254311416e-06]))

In [44]:
# For Product X, Find N Users to Sell To
model.recommendUsers(112954,10)

[Rating(user=941036, product=112954, rating=2.659423800386871e-15),
 Rating(user=501194, product=112954, rating=2.6081652927416005e-15),
 Rating(user=1077154, product=112954, rating=2.393020905961452e-15),
 Rating(user=874786, product=112954, rating=2.374128624134902e-15),
 Rating(user=837805, product=112954, rating=2.371981138878187e-15),
 Rating(user=925163, product=112954, rating=2.1392698182780584e-15),
 Rating(user=972429, product=112954, rating=2.136227011000985e-15),
 Rating(user=1025596, product=112954, rating=2.135050048114183e-15),
 Rating(user=834877, product=112954, rating=2.1134449376024417e-15),
 Rating(user=1218796, product=112954, rating=2.1112412629897128e-15)]

In [45]:
# For User Y Find N Products to Promote
model.recommendProducts(14992,10)

[Rating(user=14992, product=1360582, rating=5.142088133744607e-16),
 Rating(user=14992, product=791788, rating=3.235397078396169e-16),
 Rating(user=14992, product=54638, rating=3.0808369593335793e-16),
 Rating(user=14992, product=857265, rating=2.73068524456809e-16),
 Rating(user=14992, product=1096855, rating=2.0723323555015285e-16),
 Rating(user=14992, product=76239, rating=1.7299699053147019e-16),
 Rating(user=14992, product=2444, rating=1.6381133137718346e-16),
 Rating(user=14992, product=1111447, rating=1.4162793683799574e-16),
 Rating(user=14992, product=47704, rating=9.687867321608612e-17),
 Rating(user=14992, product=502744, rating=9.274000354370959e-17)]

In [46]:
#Predict Single Product for Single User
model.predict(14992, 1107145)

2.2297890475678726e-19

In [47]:
# Predict Multi Users and Multi Products
# Pre-Processing
pred_input = t[1].rdd.map(lambda x:(x[0],x[1]))   

AttributeError: 'DataFrame' object has no attribute 'map'

In [26]:
predictions = model.predictAll(t[1].rdd.map(lambda r: (r[0], r[1])))

In [50]:
true_reorg = t[1].rdd.map(lambda x:((x[0],x[1]), x[2]))
pred_reorg = predictions.map(lambda x:((x[0],x[1]), x[2]))

In [51]:
#Do the actual join
true_pred = true_reorg.join(pred_reorg)

In [52]:
#Need to be able to square root the Mean-Squared Error
from math import sqrt

MSE = true_pred.map(lambda r: (r[1][0] - r[1][1])**2).mean()
RMSE = sqrt(MSE)

In [53]:
RMSE

0.0

In [54]:
test_pred = model.predictAll(t[0].rdd.map(lambda r: (r[0], r[1])))

In [55]:
true_reorg = t[0].rdd.map(lambda x:((x[0],x[1]), x[2]))
pred_reorg = test_pred.map(lambda x:((x[0],x[1]), x[2]))

In [56]:
#Do the actual join
true_pred = true_reorg.join(pred_reorg)

In [57]:
MSE = true_pred.map(lambda r: (r[1][0] - r[1][1])**2).mean()
RMSE = sqrt(MSE)

In [58]:
RMSE

0.0

In [30]:
l = predictions.collect()

In [31]:
l

[Rating(user=264, product=282635, rating=2.4240346421661748e-14),
 Rating(user=408, product=925685, rating=7.734549834449234e-13),
 Rating(user=412, product=634017, rating=1.8903666297182487e-09),
 Rating(user=452, product=727340, rating=1.3795504060949386e-13),
 Rating(user=552, product=1235640, rating=5.351062876925796e-09),
 Rating(user=672, product=388422, rating=-1.9427723303990512e-13),
 Rating(user=732, product=646660, rating=6.115739330683204e-11),
 Rating(user=780, product=363761, rating=2.6709320383980227e-12),
 Rating(user=788, product=1037160, rating=-4.7336660724951436e-21),
 Rating(user=796, product=493889, rating=-1.0225892337943623e-19),
 Rating(user=800, product=603101, rating=1.5118740519238202e-14),
 Rating(user=852, product=319678, rating=2.5681823797963837e-09),
 Rating(user=1068, product=1142299, rating=3.00376300306468e-22),
 Rating(user=1088, product=172256, rating=3.879997794746629e-18),
 Rating(user=1092, product=639705, rating=1.2246624706329371e-13),
 Rating