In [14]:
import pandas as pd
import numpy as np
import pickle

from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import normalize
from sklearn.decomposition import NMF
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import train_test_split

In [15]:
csv_path = '../Data/'

df = pd.read_csv('../Pré-Processamento/pp_market.csv')
port1 = pd.read_csv(f'{csv_path}estaticos_portfolio1.csv', index_col='Unnamed: 0')
port2 = pd.read_csv(f'{csv_path}estaticos_portfolio2.csv', index_col='Unnamed: 0')
port3 = pd.read_csv(f'{csv_path}estaticos_portfolio3.csv', index_col='Unnamed: 0')

ids = df['id']
df.drop('id',1,inplace=True)

In [16]:
scaler = MaxAbsScaler()

df=pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
df.index = ids

## Train test split

In [17]:
port1_train, port1_test = train_test_split(port1, test_size = 0.3, random_state = 42)
port2_train, port2_test = train_test_split(port2, test_size = 0.3, random_state = 42)
port3_train, port3_test = train_test_split(port3, test_size = 0.3, random_state = 42)

### Cos Sim

In [18]:
count = 0
for treino,teste,label in [(port1_train,port1_test,'Portfólio 1'),(port2_train,port2_test,'Portfólio 2'),(port3_train,port3_test,'Portfólio 3')]:
    print(label)
    transpose = df.loc[port1_train.id].T
    for num in [1000,4000,20000,40000]:
        count = 0
        rank = df.dot(transpose).mean(axis=1).nlargest(num).index
        for i in port1_test.id:
            if i in rank:
                count = count + 1
        print(f'{num}: {count/len(teste)*100}')

Portfólio 1
1000: 0.5988023952095809
4000: 0.5988023952095809
20000: 10.179640718562874
40000: 17.365269461077844
Portfólio 2
1000: 0.5882352941176471
4000: 0.5882352941176471
20000: 10.0
40000: 17.058823529411764
Portfólio 3
1000: 1.25
4000: 1.25
20000: 21.25
40000: 36.25


## NMF

In [19]:
def ranking(port, nlargest, df):
    port_train = port['id']
    transpose = df.loc[port_train.values].T
    similarities = df.dot(transpose)
    similarities = similarities.mean(axis=1)
    return similarities.nlargest(nlargest)

In [20]:
%%time
nmf = NMF(65)
nmf_features = nmf.fit_transform(df)
norm_features = normalize(nmf_features)
df_nmf = pd.DataFrame(norm_features, ids)

Wall time: 10min 23s


In [23]:
for train,test, label in [(port1_train,port1_test,'Portfolio 1'),
                         (port2_train,port2_test,'Portfolio 2'),
                         (port3_train,port3_test,'Portfolio 3')]:
    print(f'{label}:')
    for j in [1000,4000,20000,40000]:
        rank = ranking(train, j, df_nmf).index
        count = 0
        for i in test.id:
            if i in rank:
                count = count + 1
        print(f'{j}: {count/len(test)}')

Portfolio 1:
1000: 0.0
4000: 0.0
20000: 0.059880239520958084
40000: 0.10179640718562874
Portfolio 2:
1000: 0.13529411764705881
4000: 0.4235294117647059
20000: 0.7529411764705882
40000: 0.888235294117647
Portfolio 3:
1000: 0.4
4000: 0.7125
20000: 0.9
40000: 0.925


### SVD

In [24]:
from sklearn.decomposition import TruncatedSVD
def ranking_svd(port, nlargest, df):
    port_train = port['id']
    transpose = df.loc[port_train.values].T
    similarities = df.dot(transpose)
    similarities = similarities.mean(axis=1)
    return similarities.nlargest(nlargest)

In [25]:
%%time
svd = TruncatedSVD(65)
nmf_features = svd.fit_transform(df)
norm_features = normalize(nmf_features)
df_svd = pd.DataFrame(norm_features, ids)
df_svd.to_csv('../Streamlit/df_svd.csv')

Wall time: 1min 13s


In [26]:
for train,test, label in [(port1_train,port1_test,'Portfolio 1'),
                         (port2_train,port2_test,'Portfolio 2'),
                         (port3_train,port3_test,'Portfolio 3')]:
    print(f'{label}:')
    for j in [1000,4000,20000,40000]:
        rank = ranking_svd(train,j,df_svd).index
        count = 0
        for i in test.id:
            if i in rank:
                count = count + 1
        print(f'{j}: {count/len(test)}')

Portfolio 1:
1000: 0.005988023952095809
4000: 0.023952095808383235
20000: 0.059880239520958084
40000: 0.1497005988023952
Portfolio 2:
1000: 0.09411764705882353
4000: 0.3352941176470588
20000: 0.7470588235294118
40000: 0.8352941176470589
Portfolio 3:
1000: 0.4375
4000: 0.725
20000: 0.9375
40000: 0.95
