In [1]:
import json
import numpy as np
import pandas as pd
from app import dataset_word2vec, dataset_tfid
from sklearn.metrics import precision_score, recall_score
from app.datasets import dataset_ratings_user
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV, RidgeCV, BayesianRidge, Ridge, Lasso, ElasticNet, SGDRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error, precision_score, recall_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

In [2]:
from sklearn.ensemble import VotingRegressor
def evaluate(X, user, index, df_ratings):
    """Score every model on one user's predefined train/test split.

    Parameters
    ----------
    X : movie feature table handed to ``dataset_ratings_user``.
    user : user identifier; ``str(user)`` is the key looked up in ``index``.
    index : mapping ``str(user) -> {'train': labels, 'test': labels}`` whose
        label lists select rows of the per-user frame via ``.loc``.
    df_ratings : ratings table forwarded to ``dataset_ratings_user``.

    Returns
    -------
    Whatever ``train_predict`` returns for this user's split
    (a ``(precision, recall)`` pair).
    """
    # presumably one row per movie rated by `user`, carrying at least the
    # 'rating_user' and 'title' columns -- verify against dataset_ratings_user
    user_movies = dataset_ratings_user(X, df_ratings=df_ratings, user=user)

    split = index[str(user)]
    train_rows = user_movies.loc[split['train'], :]
    test_rows = user_movies.loc[split['test'], :]

    # Everything except the target and the title is used as a feature.
    non_feature_cols = ['rating_user', 'title']
    X_train = train_rows.drop(columns=non_feature_cols)
    y_train = train_rows['rating_user']
    X_test = test_rows.drop(columns=non_feature_cols)
    y_test = test_rows['rating_user']

    # Drop our reference to the full per-user frame before the model fitting.
    del user_movies
    return train_predict(X_train, y_train, X_test, y_test)




def _grid_search(estimator, param_grid, cv=5):
    """Build a ``GridSearchCV`` around ``estimator`` that works across sklearn versions.

    ``iid=False`` is forwarded only when the installed ``GridSearchCV`` still
    accepts it: the argument was deprecated in scikit-learn 0.22 and removed
    in 0.24, where passing it raises ``TypeError``.
    """
    import inspect  # local import keeps this cell self-contained

    extra = {}
    if 'iid' in inspect.signature(GridSearchCV.__init__).parameters:
        extra['iid'] = False
    return GridSearchCV(estimator, param_grid, cv=cv, **extra)


def train_predict(X_train, y_train, X_test, y_test, threshold=3):
    """Fit several regressors and score them as "liked / not liked" classifiers.

    Four grid-searched regressors (ridge, SVR, elastic net, gradient boosting)
    and a ``VotingRegressor`` over those four are fitted on the training data.
    Both the true test ratings and the predictions are binarised with
    ``>= threshold`` before precision and recall are computed.

    Parameters
    ----------
    X_train, y_train : training features and ratings.
    X_test, y_test : held-out features and ratings.
    threshold : rating at or above which an item counts as positive
        (default 3, the value previously hard-coded here).

    Returns
    -------
    (precision, recall) : two dicts keyed by model name
        ('ridge', 'svr', 'elastic', 'gbr', 'voting').
    """
    y_true = (y_test >= threshold) * 1

    alphas = {'alpha': [1e-3, 1e-2, 1e-1, 1]}
    ridge = _grid_search(Ridge(), alphas)
    svr = _grid_search(SVR(gamma='scale'), {'kernel': ('linear', 'rbf'), 'C': [1, 10]})
    elastic = _grid_search(ElasticNet(), alphas)
    # `loss` is left at its default (least squares): the old spelling 'ls' was
    # removed in scikit-learn 1.2, while the default means the same everywhere.
    gbr = _grid_search(
        GradientBoostingRegressor(learning_rate=0.1, max_depth=1, random_state=0),
        {'n_estimators': [50, 100, 150]},
    )
    voting = VotingRegressor(estimators=[('ridge', ridge), ('svr', svr),
                                         ('elastic', elastic), ('gbr', gbr)])

    models = [('ridge', ridge), ('svr', svr), ('elastic', elastic),
              ('gbr', gbr), ('voting', voting)]

    precision = {}
    recall = {}
    for name, model in models:
        fitted = model.fit(X_train, y_train)
        y_pred = (fitted.predict(X_test) >= threshold) * 1
        precision[name] = precision_score(y_true, y_pred)
        recall[name] = recall_score(y_true, y_pred)

    return precision, recall


def evaluate_method1(X, users, index, df_ratings):
    """Run ``evaluate`` for every user and tabulate the per-model scores.

    Parameters
    ----------
    X : feature table forwarded to ``evaluate``.
    users : iterable of user identifiers; each is converted with ``int()``
        before being passed to ``evaluate`` and the original values become
        the row labels of the returned frames.
    index : per-user train/test split forwarded to ``evaluate``.
    df_ratings : ratings table forwarded to ``evaluate``.

    Returns
    -------
    (df_precisions, df_recalls) : two DataFrames with one row per user and
        one column per model. Both are empty when ``users`` is empty.

    Prints one '.' per processed user (25 per line) as a progress indicator.
    """
    # Materialise once: `users` is iterated here and reused as the DataFrame
    # index below, which would fail for a one-shot iterator.
    users = list(users)

    precisions, recalls = [], []
    for position, user in enumerate(users, start=1):
        precision, recall = evaluate(X, int(user), index, df_ratings)
        precisions.append(precision)
        recalls.append(recall)

        print('.', end='')
        if position % 25 == 0:
            print()
    print()

    # No users: nothing to tabulate (and no first record to take columns from).
    if not precisions:
        return pd.DataFrame(), pd.DataFrame()

    df_precisions = pd.DataFrame(precisions, index=users, columns=list(precisions[0].keys()))
    df_recalls = pd.DataFrame(recalls, index=users, columns=list(recalls[0].keys()))
    return df_precisions, df_recalls

In [3]:
# Per-user train/test split, keyed by the user id as a string; each entry is
# later read through its 'train' and 'test' items.
index = {}
with open('./app/datasets/index_sample.txt') as index_file:
    index = json.loads(index_file.read())
# Number of users in the sample.
len(index)


25

In [4]:
import itertools
import warnings

# Ratings file: '::'-separated, no header row, so column names are supplied here.
# The multi-character delimiter requires the python parsing engine.
df_ratings = pd.read_table(
    './app/datasets/ml-1m/ratings.dat',
    delimiter='::',
    names=['userId', 'movieId', 'rating', 'timestamp'],
    engine='python',
)

# Silence all warnings for the remainder of the session.
warnings.filterwarnings('ignore')

# Every subset of the optional metadata features, from the empty set up to
# all of them, ordered by subset size.
all_feats = ['genres', 'rating', 'runtimes', 'year']
combinations = [
    list(subset)
    for size in range(len(all_feats) + 1)
    for subset in itertools.combinations(all_feats, size)
]

In [66]:
def evaluate_method_combinations(n_features, method, combinations):
    """Evaluate one text-embedding method over several metadata feature sets.

    For each entry of ``combinations`` the movie table is rebuilt with that
    feature subset, every user in the module-level ``index`` is evaluated via
    ``evaluate_method1``, and the per-model mean precision and recall over
    users are recorded as one row.

    Parameters
    ----------
    n_features : forwarded to the extractor as ``n_features``; with
        ``method == 'tfid'`` it is also used as ``n_words``.
    method : ``'tfid'`` selects ``dataset_tfid``; any other value selects
        ``dataset_word2vec`` with ``n_words=2000``.
    combinations : iterable of feature-name lists, evaluated in order.

    Returns
    -------
    (precision, recall) : DataFrames with one row per combination (in input
        order, default integer index) and one column per model. Both are
        empty when ``combinations`` is empty.

    Side effects: prints each combination, and writes both frames as CSV
    under ``app/recomendacoes/experiments/content_profile/``.

    Relies on the notebook-level globals ``index`` and ``df_ratings``.
    """
    if method == 'tfid':
        extract = dataset_tfid
        n_words = n_features
    else:
        extract = dataset_word2vec
        n_words = 2000

    # `new_feats` is threaded through successive extractor calls; presumably
    # it lets the extractor reuse the text features it already computed --
    # TODO confirm against dataset_word2vec / dataset_tfid.
    new_feats = None
    precision_rows, recall_rows = [], []
    for feats in combinations:
        print(feats)
        df_movies, new_feats = extract(feats, op='sum', n_features=n_features,
                                       n_words=n_words, new_feats=new_feats)
        df_precisions, df_recalls = evaluate_method1(df_movies, index.keys(), index, df_ratings)
        precision_rows.append(df_precisions.mean())
        recall_rows.append(df_recalls.mean())

    # Build each frame once instead of growing it with DataFrame.append,
    # which was removed in pandas 2.0 and copies the frame on every call.
    precision = pd.DataFrame(precision_rows).reset_index(drop=True)
    recall = pd.DataFrame(recall_rows).reset_index(drop=True)

    out_dir = 'app/recomendacoes/experiments/content_profile/'
    suffix = str(method) + '-' + str(n_features) + '.csv'
    precision.to_csv(out_dir + 'precision-' + suffix)
    recall.to_csv(out_dir + 'recall-' + suffix)
    return precision, recall

In [74]:
%%time
# Full experiment: 'w2v' is not 'tfid', so the word2vec extractor is used with
# n_features=300, once per feature subset in `combinations`.
# The saved output reports roughly 1h20 wall time for this cell.
precision, recall = evaluate_method_combinations(300, 'w2v', combinations)

[]
Time to build vocab: 0.02 mins
Time to train the model: 1.07 mins
Time to compute vectors: 0.66 mins
.........................

['genres']
.........................

['rating']
.........................

['runtimes']
.........................

['year']
.........................

['genres', 'rating']
.........................

['genres', 'runtimes']
.........................

['genres', 'year']
.........................

['rating', 'runtimes']
.........................

['rating', 'year']
.........................

['runtimes', 'year']
.........................

['genres', 'rating', 'runtimes']
.........................

['genres', 'rating', 'year']
.........................

['genres', 'runtimes', 'year']
.........................

['rating', 'runtimes', 'year']
.........................

['genres', 'rating', 'runtimes', 'year']
.........................

CPU times: user 1h 53min 44s, sys: 36min 39s, total: 2h 30min 24s
Wall time: 1h 19min 33s


In [75]:
# Best mean precision reached by each model over all feature combinations
# (column-wise maximum of the per-combination means).
precision.max()

ridge      0.921168
svr        0.918166
elastic    0.919601
gbr        0.915992
voting     0.918046
dtype: float64

In [68]:
# NOTE(review): same expression as the In [75] cell, but this cell's execution
# count (68) predates the In [74] run, so its saved output presumably comes from
# an earlier experiment -- TODO confirm, then re-run or remove.
precision.max()

ridge      0.923433
svr        0.918161
elastic    0.919652
gbr        0.919084
voting     0.920066
dtype: float64

In [69]:
# Best mean recall reached by each model over all feature combinations.
# NOTE(review): execution count 69 predates the In [74] run; the saved output
# presumably belongs to an earlier experiment -- TODO confirm.
recall.max()

ridge      0.996364
svr        0.998540
elastic    1.000000
gbr        0.972395
voting     0.998788
dtype: float64

In [55]:
# Row label of the best mean recall per model; rows follow the order of the
# combinations passed to evaluate_method_combinations.
# NOTE(review): execution count 55 is older than the run above -- the saved
# output presumably reflects a different `recall` frame; TODO confirm.
recall.idxmax()

ridge      3
svr        3
elastic    0
gbr        1
voting     0
dtype: int64

In [13]:
# Show the per-combination mean precision table.
# NOTE(review): the saved output (execution count 13) lists `lasso` and `sgd`
# columns, which train_predict only has as commented-out code, and has 10 rows
# -- it looks like a leftover from an earlier version; TODO re-run.
precision

Unnamed: 0,ridge,lasso,elastic,svr,sgd,gbr,voting
0,0.862299,,0.816252,0.86904,,0.862418,0.839133
1,0.878014,,0.8437,0.878829,,0.87751,0.878598
2,0.889934,,0.900159,0.889683,,0.893388,0.896694
3,0.863202,,0.818206,0.874337,,0.864257,0.861741
4,0.870162,,0.837775,0.861489,,0.871117,0.861513
5,0.894644,,0.892333,0.889625,,0.893965,0.897589
6,0.877977,,0.8437,0.879075,,0.875191,0.879723
7,0.881566,,0.851973,0.881984,,0.883094,0.885591
8,0.890708,,0.899945,0.891473,,0.89211,0.895902
9,0.890441,,0.899721,0.893727,,0.897245,0.89765


In [None]:
# NOTE(review): `p` is not defined anywhere in the visible notebook and this
# cell was never executed -- looks like an unfinished cell; confirm and remove.
p