## Load libraries and test basic pywFM functions

In [1]:
import numpy as np
import pandas as pd
import csv
import cPickle as pickle

import pywFM

from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.datasets import dump_svmlight_file
from sklearn.cross_validation import train_test_split

from fastFM.mcmc import FMClassification, FMRegression
from pyfm import pylibfm

ModuleNotFoundError: No module named 'cPickle'

In [7]:
features = np.matrix([
#     Users  |     Movies     |    Movie Ratings   | Time | Last Movies Rated
#    A  B  C | TI  NH  SW  ST | TI   NH   SW   ST  |      | TI  NH  SW  ST
    [1, 0, 0,  1,  0,  0,  0,   0.3, 0.3, 0.3, 0,     13,   0,  0,  0,  0 ],
    [1, 0, 0,  0,  1,  0,  0,   0.3, 0.3, 0.3, 0,     14,   1,  0,  0,  0 ],
    [1, 0, 0,  0,  0,  1,  0,   0.3, 0.3, 0.3, 0,     16,   0,  1,  0,  0 ],
    [0, 1, 0,  0,  0,  1,  0,   0,   0,   0.5, 0.5,   5,    0,  0,  0,  0 ],
    [0, 1, 0,  0,  0,  0,  1,   0,   0,   0.5, 0.5,   8,    0,  0,  1,  0 ],
    [0, 0, 1,  1,  0,  0,  0,   0.5, 0,   0.5, 0,     9,    0,  0,  0,  0 ],
    [0, 0, 1,  0,  0,  1,  0,   0.5, 0,   0.5, 0,     12,   1,  0,  0,  0 ]
])
target = [5, 3, 1, 4, 5, 1, 5]

#### pywFM is a pythonic wrapper of original libFM, so we must set the path of the libFM before using it.

In [8]:

LIBFM_PATH = '/Users/Ming/Downloads/libfm/bin'

import sys
import os

sys.path.insert(0, LIBFM_PATH)
os.environ["PATH"] += os.pathsep + LIBFM_PATH
os.environ["LIBFM_PATH"] = LIBFM_PATH
    
print os.environ.get('LIBFM_PATH')

/Users/Ming/Downloads/libfm/bin


In [4]:
# default settings with low iteration numbers
# init_stdev=0.1, k0=1 (use bias), k1=1 (Use 1-way interactions), k2=8 (Dimensionality of 2-way interactions.), learning_method='mcmc'
fm = pywFM.FM(task='regression', num_iter=5) 

In [10]:
# split features and target for train/test
# first 5 are train, last 2 are test
model = fm.run(features[:5], target[:5], features[5:], target[5:])
print(model.predictions)
# you can also get the model weights
print(model.rlog)

1436
[3.73486, 3.10783]
      rmse      mae  time_pred  time_learn  time_learn2  time_learn4  \
0  2.82843  2.00000        NaN    0.000042     0.000043            0   
1  2.70995  1.91623        NaN    0.000029     0.000030            0   
2  2.60115  2.35341        NaN    0.000033     0.000037            0   
3  2.30156  2.27603        NaN    0.000028     0.000028            0   
4  2.35157  2.31351        NaN    0.000027     0.000027            0   

      alpha  rmse_mcmc_this  rmse_mcmc_all  rmse_mcmc_all_but5      ...       \
0  0.146083         2.82843        2.82843             2.82843      ...        
1  0.626736         2.59148        2.70995             2.82843      ...        
2  0.264591         3.26764        2.60115             2.82843      ...        
3  0.238489         2.82911        2.30156             2.82843      ...        
4  0.149502         2.57195        2.35157             2.82843      ...        

   vmu[0,3]  vlambda[0,3]  vmu[0,4]  vlambda[0,4]  vmu[0,5]  v

## Predict movie ratings from dataset on kaggle
## https://www.kaggle.com/c/predict-movie-ratings

In [13]:
csv_file = "train_v2.csv"
df_train = pd.read_csv(csv_file)
csv_file = "test_v2.csv"
df_test = pd.read_csv(csv_file)

In [14]:
df_train.head()

Unnamed: 0,ID,user,movie,rating
0,610739,3704,3784,3
1,324753,1924,802,3
2,808218,4837,1387,4
3,133808,867,1196,4
4,431858,2631,3072,5


In [15]:
len(df_train)

750156

In [16]:
df_test.head()

Unnamed: 0,ID,user,movie
0,895537,5412,2683
1,899740,5440,904
2,55688,368,3717
3,63728,425,1721
4,822012,4942,3697


In [17]:
USER = df_train['user'].values.max() #6040
MOVIE = df_train['movie'].values.max() #3952
print USER
print MOVIE

6040
3952


In [18]:
ratings = df_train['rating'].values
df_train = df_train.drop('rating', axis=1)

In [19]:
df_train.head()

Unnamed: 0,ID,user,movie
0,610739,3704,3784
1,324753,1924,802
2,808218,4837,1387
3,133808,867,1196
4,431858,2631,3072


In [20]:
ratings

array([3, 3, 4, ..., 3, 5, 4])

In [21]:
trainX, testX, trainY, testY = train_test_split(df_train, ratings, train_size=0.75, random_state=42)

In [22]:
trainX.head()
len(trainX)

562617

In [23]:
# Convert data to vector format used in FM using one-hot encoding
encoder = OneHotEncoder(handle_unknown='ignore').fit(trainX)
trainX = encoder.transform(trainX)
testX = encoder.transform(testX)

In [27]:
trainX.toarray()

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]])

### In original paper, author suggest to test several values for init stdev (e.g., 0.1, 0.2, 0.5, 1.0).

In [33]:
fm_dict_based_on_stdev = {}
# std_dev = 0.1
fm = pywFM.FM(task='regression', num_iter=5, init_stdev=0.1)
fm_dict_based_on_stdev['0.1'] = fm

# std_dev = 0.2
fm = pywFM.FM(task='regression', num_iter=5, init_stdev=0.2)
fm_dict_based_on_stdev['0.2'] = fm

# std_dev = 0.5
fm = pywFM.FM(task='regression', num_iter=5, init_stdev=0.5)
fm_dict_based_on_stdev['0.5'] = fm

# std_dev = 1.0
fm = pywFM.FM(task='regression', num_iter=5, init_stdev=1.0)
fm_dict_based_on_stdev['1.0'] = fm

In [34]:
results = pd.DataFrame()
for ele in fm_dict_based_on_stdev:
    fm = fm_dict_based_on_stdev[ele]
    model = fm.run(trainX, trainY, testX, testY)
    predictions = model.predictions
    results.ix['std_dev=' + ele, 'RMSE'] = np.mean((testY - predictions) ** 2) ** 0.5
results

1569


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


1533
1553
1538


Unnamed: 0,RMSE
std_dev=0.1,0.149156
std_dev=1.0,0.591996
std_dev=0.2,0.243113
std_dev=0.5,0.418174


### In original paper, author suggest that one should start with a low factorization dimensionality (e.g., k = 8) 
### After an appropriate init stdev has been determined, MCMC can be run with larger factorization dimensionality k.

In [35]:
fm_dict_based_on_k = {}
# k= 3
fm = pywFM.FM(task='regression', num_iter=5, init_stdev=0.1, k2=3)
fm_dict_based_on_k['3'] = fm

# k = 8
fm = pywFM.FM(task='regression', num_iter=5, init_stdev=0.1, k2=8)
fm_dict_based_on_k['8'] = fm

# k = 30
fm = pywFM.FM(task='regression', num_iter=5, init_stdev=0.1, k2=30)
fm_dict_based_on_k['30'] = fm

# k = 100
fm = pywFM.FM(task='regression', num_iter=5, init_stdev=0.1, k2=100)
fm_dict_based_on_k['100'] = fm

In [36]:
results = pd.DataFrame()
for ele in fm_dict_based_on_k:
    fm = fm_dict_based_on_k[ele]
    model = fm.run(trainX, trainY, testX, testY)
    predictions = model.predictions
    results.ix['k=' + ele, 'RMSE'] = np.mean((testY - predictions) ** 2) ** 0.5
results

1560


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


974
4321
13056


Unnamed: 0,RMSE
k=8,0.146961
k=3,0.157324
k=30,0.140865
k=100,0.13611


## Test bigger k

In [39]:
fm_dict_based_on_k = {}
# k = 100
fm = pywFM.FM(task='regression', num_iter=5, init_stdev=0.1, k2=100)
fm_dict_based_on_k['100'] = fm

# k = 150
fm = pywFM.FM(task='regression', num_iter=5, init_stdev=0.1, k2=150)
fm_dict_based_on_k['500'] = fm

# k = 200
fm = pywFM.FM(task='regression', num_iter=5, init_stdev=0.1, k2=200)
fm_dict_based_on_k['1000'] = fm

In [40]:
results = pd.DataFrame()
for ele in fm_dict_based_on_k:
    fm = fm_dict_based_on_k[ele]
    model = fm.run(trainX, trainY, testX, testY)
    predictions = model.predictions
    results.ix['k=' + ele, 'RMSE'] = np.mean((testY - predictions) ** 2) ** 0.5
results

13072


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


19433
25800


Unnamed: 0,RMSE
k=100,0.1382
k=500,0.138394
k=1000,0.144932


### Test kaggle dataset with different FM libraries : original libFM, fastFM and pyFM

In [9]:
LIBFM_PATH = '/Users/Ming/Downloads/libfm/bin/libFM'
PYLIBFM_PATH = '/Users/Ming/Downloads/pyFM-master/'

import sys
if PYLIBFM_PATH not in sys.path:
    sys.path.insert(0, PYLIBFM_PATH)



def fitpredict_logistic(trainX, trainY, testX, classification=True, n_iter=100, rank=8, **params):
    encoder = OneHotEncoder(handle_unknown='ignore').fit(trainX)
    trainX = encoder.transform(trainX)
    testX = encoder.transform(testX)
    if classification:
        clf = LogisticRegression(**params)
        clf.fit(trainX, trainY)
        return clf.predict_proba(testX)[:, 1]
    else:
        clf = Ridge(**params)
        clf.fit(trainX, trainY)
        return clf.predict(testX)

def fitpredict_fastfm(trainX, trainY, testX, classification=True, rank=8, n_iter=100):
    encoder = OneHotEncoder(handle_unknown='ignore').fit(trainX)
    trainX = encoder.transform(trainX)
    testX = encoder.transform(testX)
    if classification:
        clf = FMClassification(rank=rank, n_iter=n_iter)
        return clf.fit_predict_proba(trainX, trainY, testX)
    else:
        clf = FMRegression(rank=rank, n_iter=n_iter)
        return clf.fit_predict(trainX, trainY, testX)  

def fitpredict_libfm(trainX, trainY, testX, classification=True, rank=8, n_iter=100):
    encoder = OneHotEncoder(handle_unknown='ignore').fit(trainX)
    trainX = encoder.transform(trainX)
    testX = encoder.transform(testX)
    train_file = 'libfm_train.txt'
    test_file = 'libfm_test.txt'
    with open(train_file, 'w') as f:
        dump_svmlight_file(trainX, trainY, f=f)
    with open(test_file, 'w') as f:
        dump_svmlight_file(testX, np.zeros(testX.shape[0]), f=f)
    task = 'c' if classification else 'r'
    console_output = !$LIBFM_PATH -task $task -method mcmc -train $train_file -test $test_file -iter $n_iter -dim '1,1,$rank' -out output.libfm
    
    libfm_pred = pd.read_csv('output.libfm', header=None).values.flatten()
    return libfm_pred

def fitpredict_pylibfm(trainX, trainY, testX, classification=True, rank=8, n_iter=100):
    encoder = OneHotEncoder(handle_unknown='ignore').fit(trainX)
    trainX = encoder.transform(trainX)
    testX = encoder.transform(testX)
    task = 'classification' if classification else 'regression'
    fm = pylibfm.FM(num_factors=rank, num_iter=n_iter, verbose=False, task=task)
    if classification:
        fm.fit(trainX, trainY)
    else:
        fm.fit(trainX, trainY * 1.)
    return fm.predict(testX)

def firpredict_pywFM(trainX, trainY, testX, classification=True, rank=8, n_iter=100):
    encoder = OneHotEncoder(handle_unknown='ignore').fit(trainX)
    trainX = encoder.transform(trainX)
    testX = encoder.transform(testX)
    task = 'classification' if classification else 'regression'
    
    # std_dev = 0.1, num_iter=100, k=100
    fm = pywFM.FM(task=task, num_iter=n_iter, init_stdev=0.1, k2=rank)
    
    model = fm.run(trainX, trainY, testX, testY)
    predictions = model.predictions
    return np.array(predictions)

In [10]:
from collections import OrderedDict
import time

all_results = OrderedDict()
try:
    with open('./saved_results.pkl') as f:
        all_results = pickle.load(f)
except:
    pass

def test_on_dataset(trainX, testX, trainY, testY, task_name, predictionList=False, classification=True, use_pylibfm=True, k=8, n_iter=100):
    algorithms = OrderedDict()
    algorithms['logistic'] = fitpredict_logistic
    algorithms['libFM'] = fitpredict_libfm
    algorithms['fastFM'] = fitpredict_fastfm
    algorithms['pywFM'] = firpredict_pywFM
    if use_pylibfm:
        algorithms['pylibfm']  = fitpredict_pylibfm
    
    results = pd.DataFrame()
    if predictionList == False:
        for name, fit_predict in algorithms.items():
            start = time.time()
            if name == 'pylibfm':
                n_iter = 10
            predictions = fit_predict(trainX, trainY, testX, classification=classification, rank=k, n_iter=n_iter)
            spent_time = time.time() - start
            results.ix[name, 'time'] = spent_time
            if classification:
                results.ix[name, 'ROC AUC'] = roc_auc_score(testY, predictions)
            else:
                results.ix[name, 'RMSE'] = np.mean((testY - predictions) ** 2) ** 0.5
    else:
        for name, fit_predict in algorithms.items():
            start = time.time()
            predictions = fit_predict(trainX, trainY, testX, classification=classification, rank=k, n_iter=n_iter)
            print name
            if name == 'pylibfm':
                n_iter = 10
            predictionList.append(predictions)
            spent_time = time.time() - start
            results.ix[name, 'time'] = spent_time
            if classification:
                results.ix[name, 'ROC AUC'] = roc_auc_score(testY, predictions)
            else:
                results.ix[name, 'RMSE'] = np.mean((testY - predictions) ** 2) ** 0.5
            
    all_results[task_name] = results
    with open('saved_results.pkl', 'w') as f:
        pickle.dump(all_results, f)
        
    return results

### 100k Movielens dataset with only IDs

In [12]:
import load_problems

trainX, testX, trainY, testY = load_problems.load_problem_movielens_100k(all_features=False)
trainX.head()

Unnamed: 0,user,movie
98980,810,900
69824,803,754
9928,51,286
75599,734,180
95621,896,95


In [51]:
predictions = False
test_on_dataset(trainX, testX, trainY, testY, predictionList=predictions, task_name='ml100k, ids', classification=False, k=8, n_iter=10)

logistic


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


libFM
fastFM
24273
pywFM
pylibfm


Unnamed: 0,time,RMSE
logistic,1.420245,0.942664
libFM,5.950376,0.914565
fastFM,3.587717,0.915184
pywFM,5.090981,0.914976
pylibfm,181.74815,0.928005


### 100k Movielens dataset with all features 

In [58]:
trainX, testX, trainY, testY = load_problems.load_problem_movielens_100k(all_features=True)
trainX.head()

Unnamed: 0,user,movie,age,gender,occupation,zip,released,unknown,Action,Adventure,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
98980,692,1310,33,0,7,615,68,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69824,931,528,48,1,3,59,57,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9928,216,553,12,1,13,110,67,0,1,1,...,0,0,0,0,0,0,0,0,0,0
75599,798,498,39,0,0,166,30,0,0,0,...,0,0,0,0,0,0,0,0,0,0
95621,910,547,27,0,20,397,68,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [59]:
predictions = False
test_on_dataset(trainX, testX, trainY, testY, predictionList=predictions, task_name='ml100k, ids', classification=False, k=8, n_iter=10)

logistic


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


libFM
fastFM
24635
pywFM
pylibfm


Unnamed: 0,time,RMSE
logistic,4.04133,0.942356
libFM,35.871259,0.895329
fastFM,36.188362,0.896543
pywFM,34.582873,0.89623
pylibfm,612.590105,


### Predicting movie ratings on kaggle

In [139]:
csv_file = "train_v2.csv"
df_train = pd.read_csv(csv_file)
csv_file = "test_v2.csv"
df_test = pd.read_csv(csv_file)

ratings = df_train['rating'].values
df_train = df_train.drop('rating', axis=1)

trainX = df_train
testX = df_test
trainY = ratings
testY = [0] * len(df_test)


In [96]:
predictions = []
test_on_dataset(trainX, testX, trainY, testY, task_name='ml100k, ids', predictionList=predictions, classification=False, k=10, n_iter=100)

logistic


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


libFM
fastFM
28783
pywFM
pylibfm


Unnamed: 0,time,RMSE
logistic,42.064652,3.642214
libFM,371.187099,3.63741
fastFM,299.809937,3.636657
pywFM,398.301625,3.060715
pylibfm,2201.824196,3.621353


In [98]:
logistics_output = predictions[0].tolist()
r = csv.reader(open('sampleSubmission.csv')) # Here your csv file
lines = list(r)
for ele in range(1, len(lines)):
    lines[ele][1] = logistics_output[ele - 1]
output_df = pd.DataFrame(lines)
output_df.to_csv('logistics_output.csv', sep=',', encoding='utf-8')

libFM_output = predictions[1].tolist()
r = csv.reader(open('sampleSubmission.csv')) # Here your csv file
lines = list(r)
for ele in range(1, len(lines)):
    lines[ele][1] = libFM_output[ele - 1]
output_df = pd.DataFrame(lines)
output_df.to_csv('libFM_output.csv', sep=',', encoding='utf-8')

fastFM_output = predictions[2].tolist()
r = csv.reader(open('sampleSubmission.csv')) # Here your csv file
lines = list(r)
for ele in range(1, len(lines)):
    lines[ele][1] = fastFM_output[ele - 1]
output_df = pd.DataFrame(lines)
output_df.to_csv('fastFM_output.csv', sep=',', encoding='utf-8')

pywfm_output = predictions[3].tolist()
r = csv.reader(open('sampleSubmission.csv')) # Here your csv file
lines = list(r)
for ele in range(1, len(lines)):
    lines[ele][1] = pywfm_output[ele - 1]
output_df = pd.DataFrame(lines)
output_df.to_csv('pywfm_output.csv', sep=',', encoding='utf-8')

pylibfm_output = predictions[4].tolist()
r = csv.reader(open('sampleSubmission.csv')) # Here your csv file
lines = list(r)
for ele in range(1, len(lines)):
    lines[ele][1] = pylibfm_output[ele - 1]
output_df = pd.DataFrame(lines)
output_df.to_csv('pylibfm_output.csv', sep=',', encoding='utf-8')


## Using pywFM with and predict movie ratings on kaggle

In [23]:
trainX, testX, trainY, testY = load_problems.load_problem_movielens_100k(all_features=False)
trainX.head()

Unnamed: 0,user,movie
98980,810,900
69824,803,754
9928,51,286
75599,734,180
95621,896,95


In [24]:
# std_dev = 0.1, num_iter=100, k=100
fm = pywFM.FM(task='regression', num_iter=100, init_stdev=0.1, k2=10)

In [25]:
encoder = OneHotEncoder(handle_unknown='ignore').fit(trainX)
trainX = encoder.transform(trainX)
testX = encoder.transform(testX)

results = pd.DataFrame()

model = fm.run(trainX, trainY, testX, testY)
predictions = model.predictions
pairwise_interactions = model.pairwise_interactions
results.ix['# std_dev = 0.1, num_iter=100, k=10', 'RMSE'] = np.mean((testY - predictions) ** 2) ** 0.5
results

28258


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,RMSE
"all_features, std_dev = 0.1, num_iter=10, k=10",0.912553


In [26]:
predictions

[3.59531,
 3.58228,
 3.49575,
 2.9548,
 4.01764,
 3.20555,
 3.73039,
 3.65963,
 3.2745,
 3.1748,
 3.22775,
 3.49725,
 2.92341,
 3.77233,
 3.69498,
 3.1543,
 3.89064,
 4.00048,
 3.93978,
 3.15581,
 2.93242,
 2.20243,
 4.14297,
 4.15259,
 3.86421,
 4.5854,
 3.79871,
 3.43283,
 3.73203,
 3.98467,
 3.86203,
 3.62986,
 3.28127,
 3.59727,
 2.23942,
 3.15688,
 2.99423,
 4.14099,
 3.49789,
 4.53402,
 4.88729,
 3.36858,
 2.80027,
 3.20414,
 4.11338,
 3.30153,
 3.29309,
 3.87356,
 3.45875,
 3.26122,
 4.10685,
 2.99678,
 4.56854,
 3.48885,
 3.00547,
 4.45825,
 3.12027,
 2.68,
 1.92525,
 3.66315,
 3.02143,
 3.55965,
 4.32108,
 3.64683,
 3.6316,
 3.4364,
 2.41812,
 2.62668,
 4.14069,
 2.95648,
 2.58177,
 3.44016,
 3.47329,
 3.83712,
 2.69123,
 3.64794,
 4.31472,
 3.6483,
 2.79355,
 3.50956,
 2.22028,
 3.18921,
 4.37719,
 4.45581,
 3.06225,
 3.11321,
 3.20378,
 2.81923,
 3.09242,
 3.47367,
 4.09436,
 3.28582,
 3.55139,
 3.31472,
 4.37084,
 2.61854,
 3.59835,
 3.89279,
 3.51423,
 4.31916,
 3.75341,
 

### learning rate of SGD

In [51]:
trainX, testX, trainY, testY = load_problems.load_problem_movielens_100k(all_features=False)

predictions = []

encoder = OneHotEncoder(handle_unknown='ignore').fit(trainX)
trainX = encoder.transform(trainX)
testX = encoder.transform(testX)
task = 'regression'
n_iter = 50
init_stdev = 0.1
k = 8
learning_method = 'sgd'
learn_rate = {'0.001': 0.001, '0.005': 0.005, '0.05': 0.05, '0.1': 0.1, '0.5': 0.5}

results = pd.DataFrame()

for ele in learn_rate:
    start = time.time()
    fm = pywFM.FM(task=task, num_iter=n_iter, init_stdev=init_stdev, k2=k, learning_method=learning_method, learn_rate=learn_rate[ele])
    model = fm.run(trainX, trainY, testX, testY)
    spent_time = time.time() - start
    predictions = np.array(model.predictions)
    results.ix[ele, 'RMSE'] = np.mean((testY - predictions) ** 2) ** 0.5
    results.ix[ele, 'time'] = spent_time

results



2647


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


2670
2554
2593
2643


Unnamed: 0,RMSE,time
0.1,1.10486,2.762752
0.005,0.971745,2.960273
0.5,1.849443,2.775481
0.001,0.995492,3.012715
0.05,1.063344,2.979575


### bias regularization of SGD

In [59]:
# csv_file = "train_v2.csv"
# df_train = pd.read_csv(csv_file)
# csv_file = "test_v2.csv"
# df_test = pd.read_csv(csv_file)

# ratings = df_train['rating'].values
# df_train = df_train.drop('rating', axis=1)

# trainX = df_train
# testX = df_test
# trainY = ratings
# testY = [0] * len(df_test)

trainX, testX, trainY, testY = load_problems.load_problem_movielens_100k(all_features=False)

predictions = []

encoder = OneHotEncoder(handle_unknown='ignore').fit(trainX)
trainX = encoder.transform(trainX)
testX = encoder.transform(testX)
task = 'regression'
n_iter = 50
init_stdev = 0.1
k = 8
learning_method = 'sgd'
learn_rate = 0.005
bias_regularization = {'0': 0, '10': 10, '50': 50, '100': 100}

results = pd.DataFrame()

for ele in bias_regularization:
    start = time.time()
    fm = pywFM.FM(task=task, num_iter=n_iter, init_stdev=init_stdev, k2=k, learning_method=learning_method, learn_rate=learn_rate, r0_regularization=bias_regularization[ele])
    model = fm.run(trainX, trainY, testX, testY)
    spent_time = time.time() - start
    predictions = np.array(model.predictions)
    results.ix[ele, 'RMSE'] = np.mean((testY - predictions) ** 2) ** 0.5
    results.ix[ele, 'time'] = spent_time

results


2663


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


2677
2673
2673


Unnamed: 0,RMSE,time
0,0.971011,2.991727
100,0.967701,2.820408
50,0.965741,3.257063
10,0.967208,2.804631


### one way regularization of SGD

In [62]:
trainX, testX, trainY, testY = load_problems.load_problem_movielens_100k(all_features=False)

predictions = []

encoder = OneHotEncoder(handle_unknown='ignore').fit(trainX)
trainX = encoder.transform(trainX)
testX = encoder.transform(testX)
task = 'regression'
n_iter = 50
init_stdev = 0.1
k = 8
learning_method = 'sgd'
learn_rate = 0.005
bias_regularization = 100
one_way_regularization = {'0': 0, '10': 10, '50': 50, '100': 100}

results = pd.DataFrame()

for ele in one_way_regularization:
    start = time.time()
    fm = pywFM.FM(task=task, num_iter=n_iter, init_stdev=init_stdev, k2=k, learning_method=learning_method, learn_rate=learn_rate, r0_regularization=bias_regularization,  r1_regularization=one_way_regularization[ele])
    model = fm.run(trainX, trainY, testX, testY)
    spent_time = time.time() - start
    predictions = np.array(model.predictions)
    results.ix[ele, 'RMSE'] = np.mean((testY - predictions) ** 2) ** 0.5
    results.ix[ele, 'time'] = spent_time

results

2673


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


2661
2658
2667


Unnamed: 0,RMSE,time
0,0.968792,3.038099
100,0.96883,3.292176
50,0.966954,2.863245
10,0.966263,2.876794


### two way regularization of SGD

In [63]:
trainX, testX, trainY, testY = load_problems.load_problem_movielens_100k(all_features=False)

predictions = []

encoder = OneHotEncoder(handle_unknown='ignore').fit(trainX)
trainX = encoder.transform(trainX)
testX = encoder.transform(testX)
task = 'regression'
n_iter = 50
init_stdev = 0.1
k = 8
learning_method = 'sgd'
learn_rate = 0.005
bias_regularization = 100
one_way_regularization = 0
two_way_regularization = {'0': 0, '10': 10, '50': 50, '100': 100}

results = pd.DataFrame()

for ele in two_way_regularization:
    start = time.time()
    fm = pywFM.FM(task=task, num_iter=n_iter, init_stdev=init_stdev, k2=k, learning_method=learning_method, learn_rate=learn_rate, r0_regularization=bias_regularization,  r1_regularization=one_way_regularization, r2_regularization=two_way_regularization[ele])
    model = fm.run(trainX, trainY, testX, testY)
    spent_time = time.time() - start
    predictions = np.array(model.predictions)
    results.ix[ele, 'RMSE'] = np.mean((testY - predictions) ** 2) ** 0.5
    results.ix[ele, 'time'] = spent_time

results

2671


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


2676
2667
2669


Unnamed: 0,RMSE,time
0,0.975059,3.073827
100,0.946672,3.121189
50,0.946672,3.271239
10,0.946672,3.172698


### bias regularization of ALS

In [64]:
trainX, testX, trainY, testY = load_problems.load_problem_movielens_100k(all_features=False)

predictions = []

encoder = OneHotEncoder(handle_unknown='ignore').fit(trainX)
trainX = encoder.transform(trainX)
testX = encoder.transform(testX)
task = 'regression'
n_iter = 50
init_stdev = 0.1
k = 8
learning_method = 'als'
bias_regularization = {'0': 0, '10': 10, '50': 50, '100': 100}

results = pd.DataFrame()

for ele in bias_regularization:
    start = time.time()
    fm = pywFM.FM(task=task, num_iter=n_iter, init_stdev=init_stdev, k2=k, learning_method=learning_method, r0_regularization=bias_regularization[ele])
    model = fm.run(trainX, trainY, testX, testY)
    spent_time = time.time() - start
    predictions = np.array(model.predictions)
    results.ix[ele, 'RMSE'] = np.mean((testY - predictions) ** 2) ** 0.5
    results.ix[ele, 'time'] = spent_time

results



5417


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


5415
5422
5416


Unnamed: 0,RMSE,time
0,1.130843,3.777833
100,1.121292,4.572016
50,1.123972,4.217432
10,1.12133,3.703119


### one way regularization of ALS

In [65]:
trainX, testX, trainY, testY = load_problems.load_problem_movielens_100k(all_features=False)

predictions = []

encoder = OneHotEncoder(handle_unknown='ignore').fit(trainX)
trainX = encoder.transform(trainX)
testX = encoder.transform(testX)
task = 'regression'
n_iter = 50
init_stdev = 0.1
k = 8
learning_method = 'als'
bias_regularization = 100
one_way_regularization = {'0': 0, '10': 10, '50': 50, '100': 100}

results = pd.DataFrame()

for ele in one_way_regularization:
    start = time.time()
    fm = pywFM.FM(task=task, num_iter=n_iter, init_stdev=init_stdev, k2=k, learning_method=learning_method,  r0_regularization=bias_regularization,  r1_regularization=one_way_regularization[ele])
    model = fm.run(trainX, trainY, testX, testY)
    spent_time = time.time() - start
    predictions = np.array(model.predictions)
    results.ix[ele, 'RMSE'] = np.mean((testY - predictions) ** 2) ** 0.5
    results.ix[ele, 'time'] = spent_time

results




5399


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


5512
5454
5462


Unnamed: 0,RMSE,time
0,1.118749,3.411542
100,1.093325,3.396946
50,1.094243,3.466309
10,1.098897,4.194343


### two way regularization of ALS

In [67]:
trainX, testX, trainY, testY = load_problems.load_problem_movielens_100k(all_features=False)

predictions = []

encoder = OneHotEncoder(handle_unknown='ignore').fit(trainX)
trainX = encoder.transform(trainX)
testX = encoder.transform(testX)
task = 'regression'
n_iter = 50
init_stdev = 0.1
k = 8
learning_method = 'als'
bias_regularization = 100
one_way_regularization = 100
two_way_regularization = {'0': 0, '10': 10, '50': 50, '100': 100}

results = pd.DataFrame()

for ele in two_way_regularization:
    start = time.time()
    fm = pywFM.FM(task=task, num_iter=n_iter, init_stdev=init_stdev, k2=k, learning_method=learning_method,  r0_regularization=bias_regularization,  r1_regularization=one_way_regularization, r2_regularization=two_way_regularization[ele])
    model = fm.run(trainX, trainY, testX, testY)
    spent_time = time.time() - start
    predictions = np.array(model.predictions)
    results.ix[ele, 'RMSE'] = np.mean((testY - predictions) ** 2) ** 0.5
    results.ix[ele, 'time'] = spent_time

results





5510


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


6267
5859
6088


Unnamed: 0,RMSE,time
0,1.078348,3.826981
100,1.519506,4.364019
50,1.266685,3.850825
10,0.96315,3.914718


In [11]:
csv_file = "train_v2.csv"
df_train = pd.read_csv(csv_file)
csv_file = "test_v2.csv"
df_test = pd.read_csv(csv_file)

ratings = df_train['rating'].values
df_train = df_train.drop('rating', axis=1)

trainX = df_train
testX = df_test
trainY = ratings
testY = [0] * len(df_test)

encoder = OneHotEncoder(handle_unknown='ignore').fit(trainX)
trainX = encoder.transform(trainX)
testX = encoder.transform(testX)

In [12]:
# std_dev = 0.1, num_iter=100, k=100
fm = pywFM.FM(task='regression', num_iter=100, init_stdev=0.1, k2=100)
model = fm.run(trainX, trainY, testX, testY)
predictions_mcmc = model.predictions

KeyboardInterrupt: 

In [14]:
n_iter = 100
init_stdev = 0.1
k = 8
learning_method = 'sgd'
task = 'regression'
learn_rate = 0.005
bias_regularization = 100
one_way_regularization = 0
two_way_regularization =10
fm = pywFM.FM(task=task, num_iter=n_iter, init_stdev=init_stdev, k2=k, learning_method=learning_method, learn_rate=learn_rate, r0_regularization=bias_regularization,  r1_regularization=one_way_regularization, r2_regularization=two_way_regularization)
model = fm.run(trainX, trainY, testX, testY)
predictions_sgd = model.predictions

5098


In [None]:
n_iter = 100
init_stdev = 0.1
k = 8
learning_method = 'als'
bias_regularization = 100
one_way_regularization = 100
two_way_regularization = 10
fm = pywFM.FM(task=task, num_iter=n_iter, init_stdev=init_stdev, k2=k, learning_method=learning_method, r0_regularization=bias_regularization,  r1_regularization=one_way_regularization, r2_regularization=two_way_regularization)
model = fm.run(trainX, trainY, testX, testY)
predictions_als = model.predictions

In [36]:
r = csv.reader(open('sampleSubmission.csv')) # Here your csv file
lines = list(r)
for ele in range(1, len(lines)):
    lines[ele][1] =predictions[ele - 1]
output_df = pd.DataFrame(lines)
output_df.to_csv('pywFM.csv', sep=',', encoding='utf-8')