In [51]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from keras.models import Model
from keras.layers import Input, Dense, Embedding, Concatenate, Flatten, BatchNormalization,LeakyReLU,Dropout,Average
from keras.losses import binary_crossentropy, mse, mae
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

mkt_url = 'raw/small_market.csv'
news_url = 'raw/small_newsdata.csv'

market_dtypes = {
    'assetCode': str,
    'assetName': str,
    'volume': float,
    'close': float,
    'open': float,
    'returnsClosePrevRaw1': float,
    'returnsOpenPrevRaw1': float,
    'returnsClosePrevMktres1': float,
    'returnsOpenPrevMktres1': float,
    'returnsClosePrevRaw10': float,
    'returnsOpenPrevRaw10': float,
    'returnsClosePrevMktres10': float,
    'returnsOpenPrevMktres10': float,
    'returnsOpenNextMktres10': float,
    'universe': float
}
market_date_cols = ['time']

news_dtypes = {
    'sourceId': str,
    'headline': str,
    'urgency': int,
    'takeSequence': int,
    'provider': str,
    'subjects': str,
    'audiences': str,
    'bodySize': int,
    'companyCount': int,
    'headlineTag': str,
    'marketCommentary': bool,
    'sentenceCount': int,
    'wordCount': int,
    'assetCodes': str,
    'assetName': str,
    'firstMentionSentence': int,
    'relevance': float,
    'sentimentClass': int,
    'sentimentNegative': float,
    'sentimentNeutral': float,
    'sentimentPositive': float,
    'sentimentWordCount': int,
    'noveltyCount12H': int,
    'noveltyCount24H': int,
    'noveltyCount3D': int,
    'noveltyCount5D': int,
    'noveltyCount7D': int,
    'volumeCounts12H': int,
    'volumeCounts24H': int,
    'volumeCounts3D': int,
    'volumeCounts5D': int,
    'volumeCounts7D': int}
news_date_cols = ['time', 'sourceTimestamp', 'firstCreated', ]

df_market_orig = pd.read_csv(mkt_url, dtype=market_dtypes, parse_dates=market_date_cols)
df_news_orig = pd.read_csv(news_url, dtype=news_dtypes, parse_dates=news_date_cols)

cat_cols = ['assetCode']
num_cols = ['volume', 'close', 'open', 'returnsClosePrevRaw1', 'returnsOpenPrevRaw1', 'returnsClosePrevMktres1',
                    'returnsOpenPrevMktres1', 'returnsClosePrevRaw10', 'returnsOpenPrevRaw10', 'returnsClosePrevMktres10',
                    'returnsOpenPrevMktres10']

In [52]:
train_indices, val_indices = train_test_split(df_market_orig.index.values,test_size=0.3)
train_indices.shape, val_indices.shape
df_market_train = df_market_orig.loc[train_indices]
df_market_val = df_market_orig.loc[val_indices]
# train_indices.shape, df_market_train.shape, val_indices.shape, df_market_val.shape

In [53]:
# df_market_orig.dtypes

In [54]:
def encode(encoder, x):
    len_encoder = len(encoder)
    try:
        ind = encoder[x]
    except KeyError:
        ind = len_encoder
    return ind

encoders = [{} for cat in cat_cols]

for i, cat in enumerate(cat_cols):
    print('encoding %s'%cat, end='\n')
    encoders[i] = {l:ind for ind, l in enumerate(df_market_train[cat].unique())}
    df_market_train[cat] = df_market_train[cat].apply(lambda x:encode(encoders[i],x))
    df_market_val[cat] = df_market_val[cat].apply(lambda x:encode(encoders[i],x))
    
embed_sizes = [len(encoder) + 1 for encoder in encoders]

encoding assetCode


In [55]:
from sklearn.preprocessing import StandardScaler
df_market_train[num_cols] = df_market_train[num_cols].fillna(0)
scaler = StandardScaler()
df_market_train[num_cols] = scaler.fit_transform(df_market_train[num_cols])
df_market_val[num_cols] = scaler.fit_transform(df_market_val[num_cols])

  updated_mean = (last_sum + new_sum) / updated_sample_count
  result = op(x, *args, **kwargs)


In [56]:
cat_inputs = []
for cat in cat_cols:
    cat_inputs.append(Input(shape=[1], name=cat))
    
cat_embs = []
for i,cat in enumerate(cat_cols):
    cat_embs.append(Embedding(embed_sizes[i], 10)(cat_inputs[i]))

cat_logits = Flatten()(cat_embs[0])
cat_logits = Dense(32)(cat_logits)
cat_logits = LeakyReLU(0.1)(cat_logits)
cat_logits = Dropout(0.5)(cat_logits)
num_input = Input(shape = (len(num_cols),), name='num')
num_logits = num_input
num_logits = BatchNormalization()(num_logits)
num_logits = Dense(32)(num_logits)
num_logits = LeakyReLU(0.1)(num_logits)
num_logits = Dropout(0.5)(num_logits)
all_logits = Concatenate()([num_logits, cat_logits])
logits = Dense(128, activation='relu')(all_logits)
logits = Dropout(0.5)(logits)
logits = Dense(64, activation='relu')(logits)
logits = Dropout(0.5)(logits)
out = Dense(1, activation='tanh')(logits)

model = Model(inputs = cat_inputs+[num_input],outputs=out)
model.compile(optimizer='adam', loss=mae)

In [68]:
# df_market_train.head()

df_market_train[num_cols].values.shape
df_market_train['returnsOpenNextMktres10'].apply(lambda x:0)

31    0
13    0
45    0
55    0
78    0
     ..
71    0
65    0
33    0
83    0
87    0
Name: returnsOpenNextMktres10, Length: 69, dtype: int64

In [69]:
def get_input(df):
    X_num = df[num_cols].values
    X = {'num':X_num}
    for cat in cat_cols:
        X[cat] = df[cat].values
    df['returnsOpenNextMktres10'] = df['returnsOpenNextMktres10'].apply(lambda x: 0 if x < -0.3 or x > 0.3 else x)
    df['returnsOpenNextMktres10'] = df['returnsOpenNextMktres10'].apply(lambda x: -1 if x < 0 else x)
    df['returnsOpenNextMktres10'] = df['returnsOpenNextMktres10'].apply(lambda x: 1 if x > 0 else x)
    y = df['returnsOpenNextMktres10'].values
    r = df['returnsOpenNextMktres10'].values
    u = df['universe']
    d = df['time'].dt.date
    return X,y,r,u,d
    
# r, u and d are used to calculate the scoring metric
X_train,y_train,r_train,u_train,d_train = get_input(df_market_train)
X_valid,y_valid,r_valid,u_valid,d_valid = get_input(df_market_val)

In [79]:
X_train['num'].shape
model.fit(X_train['num'], y_train)

ValueError: Error when checking model input: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 2 array(s), but instead got the following list of 1 arrays: [array([[ 1.14933317e-01, -2.03341265e-01, -2.13043470e-01,
         4.05538088e-01,  1.24931597e+00,  0.00000000e+00,
         0.00000000e+00, -1.04794978e+00, -1.07033907e+00,
         0.00000000e+0...

In [71]:
days = () # TODO
n_days = 0
for (market_obs_df, news_obs_df, predictions_template_df) in days:
    n_days += 1
    if n_days % 10 == 0:
        print(n_days,end='\n')
    # num features
    market_obs_df[num_cols] = market_obs_df[num_cols].fillna(0)
    market_obs_df[num_cols] = scaler.transform(market_obs_df[num_cols])
    X_num_test = market_obs_df[num_cols].values
    X_test = {'num':X_num_test}
    for i in range(len(cat_cols)):
        market_obs_df[cat_cols[i]+'_encoded'] = market_obs_df[cat_cols[i]].astype(str).apply(lambda x: encode(encoders[i], x))
        X_test[cat_cols[i]] = market_obs_df[cat_cols[i]+'_encoded']
    market_prediction = model.predict(X_test)
    preds = pd.DataFrame({'assetCode':market_obs_df['assetCode'], 'confidence':market_prediction.reshape(-1)})
    predictions_template_df = predictions_template_df.merge(preds, how='left').drop('confidenceValue',axis=1).fillna(0).rename(columns={'confidence':'confidenceValue'})
    env.predict(predictions_template_df)
env.write_submission_file()
print('Done!')


NameError: name 'env' is not defined