# Technical prep

In [0]:
import pandas as pd
import io
import requests
import numpy as np
from scipy.sparse import lil_matrix

from tqdm.autonotebook import tqdm
from tqdm import trange



# Data prep

In [0]:
# Download the electronics ratings CSV from the marketBias GitHub repository.
url = "https://raw.githubusercontent.com/MengtingWan/marketBias/master/data/df_electronics.csv"
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as CSV later.
response = requests.get(url, timeout=60)
response.raise_for_status()
s = response.content  # raw bytes of the CSV file

In [0]:
# Decode the downloaded bytes as UTF-8 and parse them as comma-separated values.
raw_data = pd.read_csv(
    filepath_or_buffer=io.StringIO(s.decode("utf-8")),
    sep=",",
)

In [0]:
# Preview the first three rows to check that the columns parsed as expected.
raw_data.head(n=3)

Unnamed: 0,item_id,user_id,rating,timestamp,model_attr,category,brand,year,user_attr,split
0,0,0,5.0,1999-06-13,Female,Portable Audio & Video,,1999,,0
1,0,1,5.0,1999-06-14,Female,Portable Audio & Video,,1999,,0
2,0,2,3.0,1999-06-17,Female,Portable Audio & Video,,1999,,0


In [0]:
# List the distinct values of the brand column (NaN included when present).
raw_data["brand"].unique()

array([nan, 'HP', 'Philips', 'Polaroid', 'Panasonic', 'JVC', 'Fujifilm',
       'Nikon', 'Kodak', 'Sony', 'Canon', 'Kensington', 'Pyle', 'Olympus',
       'Toshiba', 'Logitech', 'Etre Jeune', 'Linksys', 'Vivitar',
       'Sennheiser', 'Apple', 'Samsung', 'EldHus', 'Bose', 'Archos',
       'Garmin', 'Jabra', 'Gary Fong', 'ViewSonic', 'Savage', 'Uniden',
       'ebasy', 'Generic', 'JLAB', 'Skullcandy', 'TaoTronics', 'Neewer',
       'Koolertron', 'DURAGADGET', 'iRULU', 'Tiamat', 'DBPOWER', 'Fintie',
       'Plemo', 'EINCAR', 'Cooper Cases', 'LSS', 'Mpow', 'XShields',
       'IRULU', 'Funlux'], dtype=object)

In [0]:
# Replace missing values in the three attribute columns with the explicit
# label "missing".
#
# The fill goes through the DataFrame rather than through
# `raw_data.<column>.fillna(..., inplace=True)`: that form updates an
# intermediate Series, which recent pandas versions warn about (chained
# in-place assignment) and which leaves the DataFrame unchanged when
# Copy-on-Write is enabled.
attr_columns = ["model_attr", "user_attr", "brand"]
raw_data[attr_columns] = raw_data[attr_columns].fillna("missing")

In [0]:
# Contingency table of model_attr (rows) against user_attr (columns);
# margins=True appends the "All" row and column totals.
tab = pd.crosstab(
    index=raw_data["model_attr"],
    columns=raw_data["user_attr"],
    margins=True,
)

In [0]:
# Display the model_attr x user_attr contingency table, including the "All" totals.
tab

user_attr,Female,Male,All
model_attr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,34259,31587,65846
Female&Male,26478,24930,51408
Male,25963,30907,56870
All,86700,87424,174124


In [0]:
# Largest user id in the data; later cells use max_uid + 1 as the user count.
max_uid = raw_data["user_id"].max()
max_uid

1157632

Testing whether the data consists of consecutive ID numbers (0 through the maximum ID, with no gaps)

In [0]:
# Verify that the user ids are exactly 0, 1, ..., max_uid.
# unique() keeps order of first appearance, so the comparison passes only when
# every id in the range is present and ids first appear in ascending order.
unique_uid = pd.Series(raw_data.user_id.unique())
expected_ids = pd.Series(range(max_uid + 1))

# pd.testing is the public location of the assertion helpers; the old
# pd.util.testing module was deprecated and is gone from pandas 2.0 onwards.
pd.testing.assert_series_equal(unique_uid, expected_ids)

In [0]:
# Verify that the item ids are exactly 0, 1, ..., max_item_id.
# unique() keeps order of first appearance, so the comparison passes only when
# every id in the range is present and ids first appear in ascending order.
max_item_id = raw_data.item_id.max()
unique_iids = pd.Series(raw_data.item_id.unique())
expected_item_ids = pd.Series(range(max_item_id + 1))

# pd.testing is the public location of the assertion helpers; the old
# pd.util.testing module was deprecated and is gone from pandas 2.0 onwards.
pd.testing.assert_series_equal(expected_item_ids, unique_iids)

In [0]:
# Build the sparse user x item rating matrix: one row per user id, one column
# per item id, each cell holding the rating stored as int8.
user_item_matrix = lil_matrix((max_uid + 1, max_item_id + 1), dtype=np.int8)

n_ratings = len(raw_data)
# Iterate over the three column arrays in lockstep instead of using iterrows(),
# which builds a full Series for every row. Rows are still visited in file
# order, so a repeated (user, item) pair keeps the rating seen last, as before.
rating_triples = zip(
    raw_data["user_id"].to_numpy(),
    raw_data["item_id"].to_numpy(),
    raw_data["rating"].to_numpy(),
)
for i, (uidx, iidx, rating) in enumerate(rating_triples):
    user_item_matrix[uidx, iidx] = rating
    if i % 100000 == 0:
        # The "%" format type multiplies the fraction by 100, so the message
        # reports a real percentage rather than a 0-1 fraction labelled "%".
        print(f"Processed: {i / n_ratings:.1%}")


Processed: 0.0%
Processed: 0.07734227203752028%
Processed: 0.15468454407504056%
Processed: 0.23202681611256085%
Processed: 0.30936908815008113%
Processed: 0.38671136018760144%
Processed: 0.4640536322251217%
Processed: 0.541395904262642%
Processed: 0.6187381763001623%
Processed: 0.6960804483376826%
Processed: 0.7734227203752029%
Processed: 0.8507649924127231%
Processed: 0.9281072644502434%


In [0]:
# Completion marker: printed once execution reaches this cell.
print("done")

done


<1157633x9560 sparse matrix of type '<class 'numpy.int8'>'
	with 1292954 stored elements in List of Lists format>