## Magazine category IM matrix dump
* Load
* transform to IM matrix with multiple ratings in a list
* replace cells with mean of ratings
* all empty cells with NaN
* sample 20% of occupied cells
* dump both train and test set

In [1]:
import pandas as pd
import numpy as np
import dill

In [2]:
folder_path = "file_server/dataset/ratings"
files = ["Magazine_Subscriptions.csv"]

In [3]:
df = pd.read_csv(folder_path+"/Magazine_Subscriptions.csv", header=None, names=['u_id', 'p_id', 'rating'])

In [4]:
def get_iteraction_matrix_with_mean_ratings(df):
    k = pd.get_dummies(df['p_id'])
    k.values[k!=0] = df['rating']
    k.replace(0, np.nan, inplace=True)
    k = pd.concat([df['u_id'], k], axis=1)
    k = k.groupby('u_id').mean()
    return np.array(k.index), np.array(k.columns), k.values

In [5]:
df = df.iloc[:20000, :]
print(len(df['p_id'].unique()), len(df['u_id'].unique()))

18241 115


In [6]:
users, items, interaction_matrix = get_iteraction_matrix_with_mean_ratings(df=df)

In [7]:
users.shape, items.shape, interaction_matrix.shape

((115,), (18241,), (115, 18241))

In [8]:
## Drop products which have no ratings

def get_products_with_no_ratings(s):
    col_indices = list()
    for i in range(s.shape[1]):
        if np.count_nonzero(~np.isnan(s[:,i]))==0:
            col_indices.append(i)
    print(col_indices)
    return col_indices

col_to_remove = get_products_with_no_ratings(interaction_matrix)

interaction_matrix = np.delete(interaction_matrix, col_to_remove, axis=1)
items = np.delete(items, col_to_remove, axis=0)

print(users.shape, items.shape, interaction_matrix.shape)

[]
(115,) (18241,) (115, 18241)


In [163]:
# sparsity
values_present = np.count_nonzero(~np.isnan(interaction_matrix))
print(values_present)
print(values_present*100/(interaction_matrix.shape[0]*interaction_matrix.shape[1]))

19954
0.9512255001275197


In [164]:
def count_non_na_in_row(row, threshold):
    n_ratings = np.count_nonzero(~np.isnan(row))
    if  n_ratings >= threshold:
        return n_ratings, 1
    return n_ratings, 0

index_counter = np.empty(0)
ratings_per_user = np.empty(0)
for row in interaction_matrix:
    n_ratings, th_cleared = count_non_na_in_row(row, 5)
    index_counter = np.append(index_counter, th_cleared)
    ratings_per_user = np.append(ratings_per_user, n_ratings)


In [165]:
index_counter, ratings_per_user.mean()

(array([1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 0., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1.,
        1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 173.51304347826087)

In [166]:
# create train by copying interaction matrix
train = np.copy(interaction_matrix)

# create a nan filled test like train
test = np.empty_like(train)
test[:] = np.nan

In [167]:
import random, math

for i in range(train.shape[0]):
    
    # sample only if index_counter values are 1, we dont want to sample and edit rows below the threshold
    if index_counter[i] == 1:
        
        # get indices of non missing values in each row
        non_nan_map = ~np.isnan(train[i])
        non_nan_indices = [ind for ind,_ in enumerate(non_nan_map) if _]

        # randomly sample 20% of non missing indices
        sample = random.sample(non_nan_indices, math.ceil(0.2*len(non_nan_indices)))

        # set these sampled indices ka value to test, and replace them with NaN in train
        for k in sample:
            test[i,k] = train[i,k]
            train[i,k] = np.nan

In [168]:
# slice test and users as per index_counter, which indicates the 
# map of users having more than "threshold" ratings

users_of_interest = users[index_counter==1]
test = test[index_counter==1, :]

In [169]:
# should result in 107 products, checking...

print(train.shape, test.shape)
np.count_nonzero(~np.isnan(interaction_matrix[5])), np.count_nonzero(~np.isnan(train[5])), np.count_nonzero(~np.isnan(test[5]))

(115, 18241) (107, 18241)


(184, 147, 37)

In [170]:
dump_location = "file_server/processed_data/iteration1/magazine_subscription_subset_115X18241/"

# dump training and test files

with open(dump_location+"train_matrix.pkl", "wb")as fp:
    dill.dump(train, fp)
    
with open(dump_location+"train_users.pkl", "wb")as fp:
    dill.dump(users, fp)

with open(dump_location+"items.pkl", "wb")as fp:
    dill.dump(items, fp)

with open(dump_location+"test_matrix.pkl", "wb")as fp:
    dill.dump(test, fp)
    
with open(dump_location+"test_users.pkl", "wb")as fp:
    dill.dump(users_of_interest, fp)

## Scribble space

In [142]:
np.count_nonzero(~np.isnan(interaction_matrix[5])), np.count_nonzero(~np.isnan(train[5])), np.count_nonzero(~np.isnan(test[5]))

(184, 147, 37)

In [144]:
3/11

0.2727272727272727

In [113]:
non_nan_map = ~np.isnan(train[2])
non_nan_indices = [ind for ind,_ in enumerate(non_nan_map) if _]
print(non_nan_indices)

import random, math
random.sample(non_nan_indices, math.ceil(0.2*len(non_nan_indices)))

[3937, 4395, 4479, 4939, 5240, 7593, 8117, 9283, 12531, 16511, 17000, 17381, 17863]


[4395, 17863, 5240]

array(['B00005N7NQ', 'B00005N7O6', 'B00005N7O9', 'B00005N7OA',
       'B00005N7OC', 'B00005N7OD', 'B00005N7OF', 'B00005N7OP',
       'B00005N7OU', 'B00005N7OV', 'B00005N7P0', 'B00005N7PG',
       'B00005N7PH', 'B00005N7PI', 'B00005N7PN', 'B00005N7PS',
       'B00005N7PT', 'B00005N7Q1', 'B00005N7Q2', 'B00005N7Q5',
       'B00005N7QA', 'B00005N7QC', 'B00005N7QD', 'B00005N7QE',
       'B00005N7QG', 'B00005N7QH', 'B00005N7QI', 'B00005N7QJ',
       'B00005N7QL', 'B00005N7QN', 'B00005N7QO', 'B00005N7QS',
       'B00005N7QW', 'B00005N7R0', 'B00005N7R5', 'B00005N7R6',
       'B00005N7R7', 'B00005N7RA', 'B00005N7RD', 'B00005N7RF',
       'B00005N7RJ', 'B00005N7RO', 'B00005N7RP', 'B00005N7RT',
       'B00005N7RV', 'B00005N7S2', 'B00005N7S4', 'B00005N7S5',
       'B00005N7S8', 'B00005N7SA', 'B00005N7SB', 'B00005N7SC',
       'B00005N7SD', 'B00005N7SG', 'B00005N7SH', 'B00005N7SL',
       'B00005N7SM', 'B00005N7SN', 'B00005N7SS', 'B00005N7SV',
       'B00005N7T3', 'B00005N7T5', 'B00005N7T6', 'B0000

In [81]:
temp_var = interaction_matrix[:10, :100]
temp_var.shape

(10, 100)

In [85]:
indices = np.random.permutation(temp_var.shape[0])
training_idx, test_idx = indices[:int(0.8*len(temp_var))], indices[int(0.8*len(temp_var)):]
training, test = temp_var[training_idx,:], temp_var[test_idx,:]

In [86]:
training.shape, test.shape

((8, 100), (2, 100))

In [88]:
users_train, users_test = users[training_idx], users[test_idx]
users_train

array(['B00005N7O6', 'B00005N7OJ', 'B00005N7OC', 'B00005N7OP',
       'B00005N7O9', 'B00005N7OA', 'B00005N7NQ', 'B00005N7OF'],
      dtype=object)

In [6]:
temp_df = df.head(10).copy()
temp_df['p_id'][0] = "AOSFI0JEYU4XM"
temp_df['rating'][0] = 4
temp_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,u_id,p_id,rating
0,B00005N7P0,AOSFI0JEYU4XM,4.0
1,B00005N7P0,AOSFI0JEYU4XM,5.0
2,B00005N7OJ,A3JPFWKS83R49V,3.0
3,B00005N7OJ,A19FKU6JZQ2ECJ,5.0
4,B00005N7P0,A25MDGOMZ2GALN,5.0
5,B00005N7P0,A3XT9XXWXFMJ1,3.0
6,B00005N7P0,A3ERU005ES1IHT,5.0
7,B00005N7P0,AC2278WPK3EU,5.0
8,B00005N7P0,A3QRR8PSCBI07C,4.0
9,B00005N7P0,A5QQOOZJOVPSF,4.0


In [72]:

# k = pd.get_dummies(temp_df, columns=['p_id'], prefix='', prefix_sep='')

# names = list(k.columns).pop(1)
# print(names)
# k.groupby(names).mean()
# k
# f = pd.get_dummies(k['p_id'])
# f.values[f!=0] = k['rating']
# f.values[f==0] = np.nan
# f

# # k = pd.get_dummies(temp_df, columns=['p_id'])
k = pd.get_dummies(temp_df['p_id'])
k.values[k!=0] = temp_df['rating']
k.replace(0, np.nan, inplace=True)
k
k = pd.concat([temp_df['u_id'], k], axis=1)
# # names = k.columns #- ['ratings']
# # names
j = k.groupby('u_id').mean()
np.array(j.index), np.array(j.columns), j.values, j

(array(['B00005N7OJ', 'B00005N7P0'], dtype=object),
 array(['A19FKU6JZQ2ECJ', 'A25MDGOMZ2GALN', 'A3ERU005ES1IHT',
        'A3JPFWKS83R49V', 'A3QRR8PSCBI07C', 'A3XT9XXWXFMJ1',
        'A5QQOOZJOVPSF', 'AC2278WPK3EU', 'AOSFI0JEYU4XM'], dtype=object),
 array([[5. , nan, nan, 3. , nan, nan, nan, nan, nan],
        [nan, 5. , 5. , nan, 4. , 3. , 4. , 5. , 4.5]]),
             A19FKU6JZQ2ECJ  A25MDGOMZ2GALN  A3ERU005ES1IHT  A3JPFWKS83R49V  \
 u_id                                                                         
 B00005N7OJ             5.0             NaN             NaN             3.0   
 B00005N7P0             NaN             5.0             5.0             NaN   
 
             A3QRR8PSCBI07C  A3XT9XXWXFMJ1  A5QQOOZJOVPSF  AC2278WPK3EU  \
 u_id                                                                     
 B00005N7OJ             NaN            NaN            NaN           NaN   
 B00005N7P0             4.0            3.0            4.0           5.0   
 
             AOSFI0

In [19]:
pd.get_dummies(temp_df, columns=['p_id']).groupby(['u_id'], as_index=True).mean()

Unnamed: 0_level_0,rating,p_id_A19FKU6JZQ2ECJ,p_id_A25MDGOMZ2GALN,p_id_A3ERU005ES1IHT,p_id_A3JPFWKS83R49V,p_id_A3QRR8PSCBI07C,p_id_A3XT9XXWXFMJ1,p_id_A5QQOOZJOVPSF,p_id_AC2278WPK3EU,p_id_AOSFI0JEYU4XM
u_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
B00005N7OJ,4.0,0.5,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0
B00005N7P0,4.375,0.0,0.125,0.125,0.0,0.125,0.125,0.125,0.125,0.25


In [None]:
import pandas as pd
import numpy as np
import torch
import databricks.koalas as ks
from pyspark.sql import SparkSession

In [None]:
sc.stop()

In [None]:
# import findspark
# findspark.init()

import pyspark
from pyspark import SparkConf, SparkContext

def create_spark_context() -> SparkContext:
    spark_conf = SparkConf().setMaster("spark://172.16.10.134:7077").setAppName("Spark_processing")
    return SparkContext.getOrCreate(spark_conf) 

 

sc = create_spark_context()

In [None]:
from pyspark import SparkContext, SparkConf

 

def create_spark_context() -> SparkContext:
    spark_conf = SparkConf()\
        .setMaster("spark://172.16.10.130:7077")\
        .setAppName("Spark_Init_Test")\
        .set("spark.executor.memory", "12g")
        #.set('spark.rpc.message.maxSize', 300)\
    return SparkContext.getOrCreate(spark_conf) 

 

sc = create_spark_context();

In [None]:
sqlcontext = pyspark.SQLContext(sc)

In [None]:
spark = SparkSession.builder.getOrCreate()
spark

In [None]:
folder_path = "file_server/dataset/ratings/"
#     files = os.listdir(folder_path)
#     print(files)
files = ['Magazine_Subscriptions.csv']

df = pd.read_csv(folder_path+"/"+files[0], header=None, names=['u_id', 'p_id', 'rating'])
sdf = sqlcontext.createDataFrame(df)

In [None]:
sdf.show()

In [None]:
ks.set_option('compute.default_index_type', 'distributed-sequence')

kdf = sdf.to_koalas()
kdf

In [None]:
temp_df = kdf.head(10)
temp_df

In [None]:
l = temp_df.groupby(['u_id','p_id']).size().reset_index().rename(columns={0:'count'})
print(len(ks.unique(temp_df['u_id'])), len(ks.unique(temp_df['p_id'])))
print(len(l))