In [1]:
import numpy as np
import pandas as pd

In [2]:
from itertools import count
from collections import defaultdict
from scipy.sparse import csr

def vectorize_dic(dic, ix=None, p=None):
    """ 
    Creates a scipy csr matrix from a list of lists (each inner list is a set of values corresponding to a feature) 
    
    parameters:
    -----------
    dic -- dictionary of feature lists. Keys are the name of features
    ix -- index generator (default None)
    p -- dimension of featrure space (number of columns in the sparse matrix) (default None)
    """
    if (ix == None):
        ix = defaultdict(count(0).next)
        
    n = len(dic.values()[0]) # num samples
    g = len(dic.keys()) # num groups
    nz = n * g # number of non-zeros

    col_ix = np.empty(nz, dtype=int)
    
    i = 0
    for k, lis in dic.iteritems():
        # append index el with k in order to prevet mapping different columns with same id to same index
        col_ix[i::g] = [ix[str(el) + str(k)] for el in lis]
        i += 1
        
    row_ix = np.repeat(np.arange(0, n), g)
    data = np.ones(nz)
    
    if (p == None):
        p = len(ix)
        
    ixx = np.where(col_ix < p)

    return csr.csr_matrix((data[ixx],(row_ix[ixx], col_ix[ixx])), shape=(n, p)), ix

In [3]:
df_ratings = pd.read_csv("NLP_cleaned_ratings.csv", low_memory=False)
df_items = pd.read_csv("NLP_cleaned_items.csv", low_memory=False)

In [4]:
main = np.random.rand(len(df_ratings)) < 0.7

train = df_ratings[main]

test = df_ratings[~main]

In [5]:
print train.shape
print test.shape

(142332, 506)
(61038, 506)


In [6]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer

In [7]:
# vectorize data and convert them to csr matrix
X_train, ix = vectorize_dic({'users': train.user_id.values, 'items': train.item_id.values})
X_test, ix = vectorize_dic({'users': test.user_id.values, 'items': test.item_id.values}, ix, X_train.shape[1])
y_train = train.rating.values
y_test= test.rating.values

In [8]:
X_train.shape

(142332, 112114)

In [9]:
X_train = X_train.todense()
X_test = X_test.todense()

# print shape of data
print X_train.shape
print X_test.shape

MemoryError: 

In [None]:
np.savetxt('fm_Xtrain.txt', X_train, fmt='%d')
np.savetxt('fm_Xtest.txt', X_test, fmt='%d')
np.savetxt('fm_ytrain.txt', y_train, fmt='%d')
np.savetxt('fm_ytest.txt', y_test, fmt='%d')