In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

## These libraries are not used

# import matplotlib.pyplot as plt
# %matplotlib inline

In [None]:
## Read the cleaned ratings, report the table's dimensions
## and preview five randomly chosen rows
df = pd.read_csv("AllRatingsCleaned.csv")
print(df.shape)
df.sample(n=5)

In [None]:
def roundUpToMultiple(number, multiple):
    """Round ``number`` up to the nearest multiple of ``multiple``.

    A value that already is an exact multiple is returned unchanged (as an
    int); any other value is moved up to the next multiple, e.g.
    ``roundUpToMultiple(100.5, 100) == 200`` and
    ``roundUpToMultiple(300, 100) == 300``.

    Parameters
    ----------
    number : int or float
        Value to round, e.g. the float returned by ``Series.quantile``.
    multiple : int
        Positive step size to round up to.

    Returns
    -------
    int
        The smallest multiple of ``multiple`` that is >= ``number``.

    Raises
    ------
    ZeroDivisionError
        If ``multiple`` is 0 (for built-in Python numbers).
    """
    # Ceiling division written as a negated floor division.
    # The former ``number + (multiple - 1)`` trick is only valid for integers:
    # a float such as 100.5 was rounded *down* to 100 instead of up to 200.
    return int(-(-number // multiple) * multiple)

In [None]:
## The data has to be reduced significantly for processing purposes.

## Normally one would not throw data away, but the processing power
## of the researchers' computer is limited, so the reduction is needed here.
## Only the manufacturers/suppliers that have been rated the most
## will be used.

## Quantile used for the cutoff (0.99 = the 99% quantile)
quant = 0.99
## The quantile value is rounded using this multiple (100)
roundLimit = 100

In [None]:
## Number of distinct manufacturers before any filtering
print("Original Number of Unique Manufacturers")
print(df.Manufacturer_ID.nunique())

In [None]:
## Cutoff limit for manufacturers:
## the `quant` quantile of the per-manufacturer row counts,
## passed through roundUpToMultiple with `roundLimit`.
## Only manufacturers that appear more than this number of times are wanted.
manuCut = roundUpToMultiple(
    number=df['Manufacturer_ID'].value_counts().quantile(q=quant),
    multiple=roundLimit,
)
manuCut

In [None]:
## Number of distinct suppliers before any filtering
print("Original Number of Unique Suppliers")
print(df.Supplier_ID.nunique())

In [None]:
## Cutoff limit for suppliers:
## the `quant` quantile of the per-supplier row counts,
## passed through roundUpToMultiple with `roundLimit`.
## Only suppliers that appear more than this number of times are wanted.
suppCut = roundUpToMultiple(
    number=df['Supplier_ID'].value_counts().quantile(q=quant),
    multiple=roundLimit,
)
suppCut

In [None]:
## Create a new dataframe containing only the rows whose manufacturer AND
## supplier each appear more often than their threshold (manuCut / suppCut).

## Count the rows per ID once per column; the counts were previously
## recomputed twice for each column inside a single expression.
manuCounts = df['Manufacturer_ID'].value_counts()
suppCounts = df['Supplier_ID'].value_counts()

## IDs that appear strictly more often than the cutoff
keptManufacturers = manuCounts[manuCounts > manuCut].index
keptSuppliers = suppCounts[suppCounts > suppCut].index

cutdf = df[df['Manufacturer_ID'].isin(keptManufacturers)
           & df['Supplier_ID'].isin(keptSuppliers)]

In [None]:
## Preview ten random rows of the reduced data
cutdf.sample(n=10)

In [None]:
## Share of the original rows that survived the filtering, in percent
retainedPct = 100 * len(cutdf) / len(df)
print("We retain {}% of original data".format(retainedPct))

In [None]:
## Number of rows (ratings) left in the reduced data
print("Total amount of data is {} rows/ratings".format(cutdf.shape[0]))

In [None]:
## Number of distinct manufacturers left after filtering
print("New total Number of Unique Manufacturers")
print(cutdf.Manufacturer_ID.nunique())

In [None]:
## Number of distinct suppliers left after filtering
print("New Total Number of Unique Suppliers")
print(cutdf.Supplier_ID.nunique())

In [None]:
## Randomly split the data for training/testing purposes:
## each row independently gets a 70% chance of being in the training data
## and a 30% chance of being in the testing data
## (so the realised proportions are only approximately 70/30).

## A fixed seed makes the split -- and therefore the saved
## TrainMatrix.csv / TestValues.csv files -- reproducible on every re-run.
splitSeed = 42
trainFraction = 0.7

## Boolean array: True -> training row, False -> testing row
mask = np.random.RandomState(splitSeed).rand(len(cutdf)) < trainFraction

In [None]:
## Build the training matrix from the rows selected by `mask`:
## one row per Manufacturer_ID, one column per Supplier_ID,
## with the Rating as cell value (pivot_table's default aggregation is used
## when a manufacturer/supplier pair occurs more than once).
## The next notebook shows what this table looks like.
pt = cutdf[mask].pivot_table(values='Rating',
                             index='Manufacturer_ID',
                             columns='Supplier_ID')
## Store the training matrix as a csv
pt.to_csv("TrainMatrix.csv")
pt.sample(n=10)

In [None]:
## Testing table: the rows that `mask` kept out of the training data,
## i.e. the values we will test against
tr = cutdf.loc[~mask]
## Store the test rows as a csv (index column omitted)
tr.to_csv("TestValues.csv", index=False)
tr.sample(n=10)