# Scikit-learn API for xLearn

## 1. FM

In [1]:
import numpy as np
import xlearn as xl
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# Load the wine dataset and turn it into a binary problem:
# samples with target == 1 are the positive class, all others negative.
wine_data = load_wine()
X, y = wine_data['data'], wine_data['target'] == 1

# Hold out 20% of the samples for validation (fixed seed for repeatability).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Standardize the features: the scaler is fitted on the training split only
# and the same transformation is then applied to the validation split.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

In [3]:
# FM hyper-parameters, keyed by the keyword they are passed as:
#   task       - binary classification
#   init       - model scale: 0.1
#   epoch      - epoch number: 10 (auto early-stop)
#   k          - number of latent factors: 4
#   lr         - learning rate: 0.1
#   reg_lambda - regularization lambda: 0.01
#   opt        - optimization method: sgd
#   metric     - evaluation metric: accuracy
fm_params = dict(
    task='binary',
    init=0.1,
    epoch=10,
    k=4,
    lr=0.1,
    reg_lambda=0.01,
    opt='sgd',
    metric='acc',
)
fm_model = xl.FMModel(**fm_params)

# Train on the training split, evaluating on the validation split
fm_model.fit(X_train, y_train, eval_set=[X_val, y_val])

# Print the learned model weights
print(fm_model.weights)

# Generate predictions for the validation split
y_pred = fm_model.predict(X_val)

(array([-0.75535  , -2.25687  , -1.07267  , -1.25076  ,  0.866634 ,
       -0.417836 , -0.142378 ,  0.0926271,  0.0146115,  0.551051 ,
       -2.19609  ,  1.16581  ,  0.456869 , -2.04465  ]), array([[ 5.52329e-05,  7.32132e-05,  7.76984e-03,  9.70645e-03],
       [ 1.49440e-02,  5.27875e-03,  8.63731e-03,  7.81847e-03],
       [ 4.01652e-03,  9.82996e-03,  1.90508e-03,  7.42801e-03],
       [ 7.65485e-03,  1.13894e-02,  7.19751e-03,  7.48228e-03],
       [ 1.06323e-02,  1.21860e-02,  9.20925e-03,  1.17253e-02],
       [ 3.27825e-03,  3.96365e-03,  1.09761e-02,  6.63039e-03],
       [ 1.12452e-02,  1.01429e-02,  1.56389e-03,  8.79840e-03],
       [ 1.20352e-02,  5.11132e-03,  1.17322e-02,  6.49371e-03],
       [ 3.58344e-03,  5.51422e-03,  4.32805e-03,  8.53096e-03],
       [ 9.44184e-03,  8.48235e-03,  8.43936e-03,  3.37314e-03],
       [ 5.91603e-03,  2.25785e-03,  6.22225e-03,  7.84393e-03],
       [ 4.85971e-03,  6.66072e-03, -5.69436e-04, -1.32506e-03],
       [ 3.44607e-03,  9.261

In [6]:
# Display the predictions produced by fm_model.predict(X_val) above
y_pred

array([0.0815506, 0.0852567, 0.822443 , 0.111794 , 0.47848  , 0.467585 ,
       0.0440372, 0.163691 , 0.914426 , 0.888276 , 0.121464 , 0.154142 ,
       0.0379011, 0.718097 , 0.0516162, 0.95089  , 0.0938677, 0.039652 ,
       0.519293 , 0.114238 , 0.691073 , 0.163509 , 0.189456 , 0.900392 ,
       0.582524 , 0.907366 , 0.811249 , 0.938447 , 0.557252 , 0.0663241,
       0.0684646, 0.0768457, 0.654635 , 0.028351 , 0.0831506, 0.341015 ])

## 2. FFM

In [10]:
import pandas as pd
import warnings
# Suppress all warnings for the rest of the session (no category filter is given).
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings("ignore")

In [11]:
# Set the working directory; the relative paths used below (e.g. ./raw/...)
# are resolved against it.
# The location is machine-specific, so it can be overridden through the
# XLEARN_DATA_DIR environment variable. The original author's path is kept as
# the fallback so existing setups behave exactly as before.
import os

default_path = os.environ.get(
    "XLEARN_DATA_DIR",
    "/Users/mayritaspring/Desktop/Github/Data/Recommender System_GBDT+FM",
)
os.chdir(default_path)

In [12]:
# Read the train and test samples from the ./raw folder
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('./raw/train_tiny.txt', './raw/test_tiny.txt')
)

In [13]:
# Columns 2-14 are treated as numerical features and columns 15-40 as
# categorical features (positional slices, applied to both data sets).

# --- Training data ---
num_col_tr = train_data.iloc[:, 2:15]
num_col_tr = num_col_tr.fillna(num_col_tr.mean())  # impute with column means
cat_col_tr = train_data.iloc[:, 15:41].fillna(0)   # missing category -> 0
all_col_tr = pd.concat([num_col_tr, cat_col_tr], axis=1)

# --- Testing data ---
# Numerical gaps are filled with the means of the (imputed) training columns.
num_col_te = test_data.iloc[:, 2:15].fillna(num_col_tr.mean())
cat_col_te = test_data.iloc[:, 15:41].fillna(0)
all_col_te = pd.concat([num_col_te, cat_col_te], axis=1)

In [14]:
# Based on Kaggle kernel by Scirpus
def convert_to_ffm(df,type,numerics,categories,features,catcodes=None):
    """Write ``df`` to ``<type>_ffm.txt`` in libffm format.

    Every output line has the form
    ``<label> <field>:<feature>:<value> <field>:<feature>:<value> ...``.

    * Numerical columns form a dummy field: field index and feature index are
      both the column's position, and the value is the cell value.
    * Categorical columns are encoded on the fly: each new (column, value)
      pair receives the next free integer code and is written with value 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'Label'`` column plus every column named in
        ``numerics`` and ``categories``.
    type : object
        Output prefix; the file written is ``str(type) + "_ffm.txt"``.
    numerics : list
        Names of the numerical columns.
    categories : list
        Names of the categorical columns.
    features : list
        Not used by the conversion; kept for backward compatibility.
    catcodes : dict, optional
        Mapping ``{column: {value: code}}`` that is updated in place. Pass the
        same dict to successive calls (e.g. train, then test) so that equal
        categories receive equal codes in every file. When omitted, each call
        starts from an empty encoding, as before.

    Returns
    -------
    None
    """
    # Field flags: 0 = numerical, 1 = categorical. The insertion order of this
    # dict defines the field index used in the output.
    catdict = {}
    for x in numerics:
        catdict[x] = 0
    for x in categories:
        catdict[x] = 1

    if catcodes is None:
        catcodes = {}
    # Continue numbering after the highest code handed out so far; with an
    # empty encoding this is len(numerics), exactly as in the original code.
    known_codes = [code for codes in catcodes.values() for code in codes.values()]
    currentcode = max([len(numerics)] + known_codes)

    with open(str(type) + "_ffm.txt", "w") as text_file:
        # Convert each row to one libffm line
        for r in range(df.shape[0]):
            datarow = df.iloc[r].to_dict()
            parts = [str(int(datarow['Label']))]
            for i, x in enumerate(catdict):
                if catdict[x] == 0:
                    # Numerical field: dummy field, raw value
                    parts.append(str(i) + ":" + str(i) + ":" + str(datarow[x]))
                else:
                    # Categorical field: assign a new code the first time a
                    # (column, value) pair is seen, then reuse it
                    field_codes = catcodes.setdefault(x, {})
                    if datarow[x] not in field_codes:
                        currentcode += 1
                        field_codes[datarow[x]] = currentcode
                    parts.append(str(i) + ":" + str(int(field_codes[datarow[x]])) + ":1")
            text_file.write(" ".join(parts) + "\n")

In [None]:
# convert_to_ffm() writes "<prefix>_ffm.txt". The FFM model below reads
# ./data/Train_ffm.txt and ./data/Test_ffm.txt, so the prefixes point into
# ./data (created on demand) instead of the current directory.
import os  # local import so this cell does not rely on an earlier cell's import

os.makedirs('./data', exist_ok=True)

# Prepend the label column to the training features and convert
train_data_Label = pd.concat([train_data.Label, all_col_tr], axis=1)
convert_to_ffm(train_data_Label, './data/Train',
               list(num_col_tr), list(cat_col_tr), list(all_col_tr))

# Same for the test data
test_data_Label = pd.concat([test_data.Label, all_col_te], axis=1)
convert_to_ffm(test_data_Label, './data/Test',
               list(num_col_te), list(cat_col_te), list(all_col_te))

In [7]:
import numpy as np
import xlearn as xl

In [15]:
# FFM hyper-parameters, keyed by the keyword they are passed as:
#   task       - binary classification
#   lr         - learning rate: 0.2
#   epoch      - epoch number: 10 (auto early-stop)
#   reg_lambda - regularization lambda: 0.002
#   metric     - evaluation metric: accuracy
# No `opt` argument is passed, so the optimization method is whatever xlearn
# uses by default (the original note said sgd -- TODO confirm).
ffm_params = dict(
    task='binary',
    lr=0.2,
    epoch=10,
    reg_lambda=0.002,
    metric='acc',
)
ffm_model = xl.FFMModel(**ffm_params)

# Train; the data sources are given directly as file paths
train_ffm_path = "./data/Train_ffm.txt"
test_ffm_path = "./data/Test_ffm.txt"
ffm_model.fit(train_ffm_path, eval_set=test_ffm_path)

# Print the learned model weights
print(ffm_model.weights)

# Generate predictions for the test file
y_pred = ffm_model.predict(test_ffm_path)


(array([-1.27798e+00,  1.72038e-01,  1.23415e-01, ..., -7.77940e-07,
       -7.77940e-07, -7.77940e-07]), array([[1.12387e-06, 4.25162e-03, 3.00676e-02, ..., 4.52292e-03,
        5.21515e-04, 4.47602e-03],
       [1.28179e-03, 2.09363e-03, 2.93630e-03, ..., 2.91898e-04,
        1.64320e-03, 3.05204e-03],
       [4.99799e-03, 1.25453e-03, 2.20640e-03, ..., 1.82993e-03,
        1.20575e-03, 3.27071e-03],
       ...,
       [4.23189e-02, 2.83115e-02, 1.64207e-02, ..., 7.66672e-03,
        2.45897e-02, 4.39561e-02],
       [5.07423e-03, 3.24885e-02, 3.33328e-02, ..., 1.43414e-02,
        6.79581e-03, 3.42888e-02],
       [4.36742e-02, 3.02836e-02, 2.32984e-02, ..., 6.17970e-03,
        8.37131e-03, 2.66418e-02]]))


## 3. LR

In [17]:
import numpy as np
import xlearn as xl
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [18]:
# Load the iris dataset and turn it into a binary problem:
# samples with target == 2 are the positive class, all others negative.
iris_data = load_iris()
X, y = iris_data['data'], iris_data['target'] == 2

# Hold out 30% of the samples for validation (fixed seed for repeatability).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=0
)

In [19]:
# LR hyper-parameters, keyed by the keyword they are passed as:
#   task       - binary classification
#   init       - model scale: 0.1
#   epoch      - epoch number: 10 (auto early-stop)
#   lr         - learning rate: 0.1
#   reg_lambda - regularization lambda: 1.0
#   opt        - optimization method: sgd
lr_params = dict(
    task='binary',
    init=0.1,
    epoch=10,
    lr=0.1,
    reg_lambda=1.0,
    opt='sgd',
)
linear_model = xl.LRModel(**lr_params)

# Train on the training split, evaluating on the validation split;
# lock-free training is switched off explicitly.
linear_model.fit(
    X_train,
    y_train,
    eval_set=[X_val, y_val],
    is_lock_free=False,
)

# Print the learned model weights
print(linear_model.weights)

# Generate predictions for the validation split
y_pred = linear_model.predict(X_val)

(array([-0.679804 , -0.0963338, -0.264284 ,  0.490402 ,  0.214548 ]), None)
