In [22]:
import numpy as np 
import dask as dk
import pandas as pd
import ast 
import pickle
#import lightgbm as lgb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import chain

%matplotlib inline

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import keras
from keras.models import Sequential
from keras.layers import Dense
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns 
import gc
gc.enable()
sns.set()
sns.set_context(None)
plt.rcParams['figure.figsize'] = [25, 10]

In [23]:
# Will reduce data load for code test
toy = False
(market_train_df_cleaned, news_train_df_cleaned) = pd.read_pickle('./market_news_v2.p')

In [24]:
market_train_df_cleaned.shape, news_train_df_cleaned.shape

((4072956, 16), (9328750, 24))

In [25]:
# We will reduce the number of samples for memory reasons
if toy:
    market_train_df_cleaned = market_train_df_cleaned.tail(100000)
    news_train_df_cleaned = news_train_df_cleaned.tail(300000)
else:
    market_train_df_cleaned = market_train_df_cleaned.tail(3000000)
    news_train_df_cleaned = news_train_df_cleaned.tail(6000000)

In [26]:
news_cols_agg = {
    'urgency': ['min', 'count'],
    'relevance': ['min', 'max', 'mean', 'std'],
    'sentimentNegative': ['min', 'max', 'mean', 'std'],
    'sentimentNeutral': ['min', 'max', 'mean', 'std'],
    'sentimentPositive': ['min', 'max', 'mean', 'std'],
    'noveltyCount12H': ['min', 'max', 'mean', 'std'],
    'noveltyCount24H': ['min', 'max', 'mean', 'std'],
    'noveltyCount3D': ['min', 'max', 'mean', 'std'],
    'noveltyCount5D': ['min', 'max', 'mean', 'std'],
    'noveltyCount7D': ['min', 'max', 'mean', 'std'],
    'volumeCounts12H': ['min', 'max', 'mean', 'std'],
    'volumeCounts24H': ['min', 'max', 'mean', 'std'],
    'volumeCounts3D': ['min', 'max', 'mean', 'std'],
    'volumeCounts5D': ['min', 'max', 'mean', 'std'],
    'volumeCounts7D': ['min', 'max', 'mean', 'std']
}

In [28]:

def join_market_news(market_train_df, news_train_df):
    # Fix asset codes (str -> list)
    news_train_df['assetCodes'] = news_train_df['assetCodes'].str.findall("'([\w\./]+)'")    
    # Expand assetCodes
    assetCodes_expanded = list(chain(*news_train_df['assetCodes']))
    assetCodes_index = news_train_df.index.repeat( news_train_df['assetCodes'].apply(len) )
    assert len(assetCodes_index) == len(assetCodes_expanded)
    df_assetCodes = pd.DataFrame({'level_0': assetCodes_index, 'assetCode': assetCodes_expanded})
    # Create expandaded news (will repeat every assetCodes' row)
    news_cols = ['time', 'assetCodes'] + sorted(news_cols_agg.keys())
    news_train_df_expanded = pd.merge(df_assetCodes, news_train_df[news_cols], left_on='level_0', right_index=True, suffixes=(['','_old']))
    # Free memory
    del news_train_df, df_assetCodes
    # Aggregate numerical news features
    news_train_df_aggregated = news_train_df_expanded.groupby(['time', 'assetCode']).agg(news_cols_agg)
    # Free memory
    del news_train_df_expanded
    # Convert to float32 to save memory
    news_train_df_aggregated = news_train_df_aggregated.apply(np.float32)
    # Flat columns
    news_train_df_aggregated.columns = ['_'.join(col).strip() for col in news_train_df_aggregated.columns.values]
    # Join with train
    market_train_df = market_train_df.join(news_train_df_aggregated, on=['time', 'assetCode'])
    # Free memory
    del news_train_df_aggregated
    return market_train_df

In [30]:
def get_xy(market_train_df, news_train_df, le=None):
    x, le = get_x(market_train_df, news_train_df)
    y = market_train_df['returnsOpenNextMktres10'].clip(-1, 1)
    return x, y, le


def label_encode(series, min_count):
    vc = series.value_counts()
    le = {c:i for i, c in enumerate(vc.index[vc >= min_count])}
    return le


def get_x(market_train_df, news_train_df, le=None):
    # Split date into before and after 22h (the time used in train data)
    # E.g: 2007-03-07 23:26:39+00:00 -> 2007-03-08 00:00:00+00:00 (next day)
    #      2009-02-25 21:00:50+00:00 -> 2009-02-25 00:00:00+00:00 (current day)
    news_train_df['time'] = (news_train_df['time'] - np.timedelta64(22,'h')).dt.ceil('1D')

    # Round time of market_train_df to 0h of curret day
    market_train_df['time'] = market_train_df['time'].dt.floor('1D')

    # Join market and news
    x = join_market_news(market_train_df, news_train_df)
    
    # If not label-encoder... encode assetCode
    if le is None:
        le_assetCode = label_encode(x['assetCode'], min_count=10)
        le_assetName = label_encode(x['assetName'], min_count=5)
    else:
        # 'unpack' label encoders
        le_assetCode, le_assetName = le
        
    x['assetCode'] = x['assetCode'].map(le_assetCode).fillna(-1).astype(int)
    x['assetName'] = x['assetName'].map(le_assetName).fillna(-1).astype(int)
    
    try:
        x.drop(columns=['returnsOpenNextMktres10'], inplace=True)
    except:
        pass
    try:
        x.drop(columns=['universe'], inplace=True)
    except:
        pass
    x['dayofweek'], x['month'] = x.time.dt.dayofweek, x.time.dt.month
    x.drop(columns='time', inplace=True)
#    x.fillna(-1000,inplace=True)
    
    return x, (le_assetCode, le_assetName)

In [31]:
%%time

# This will take some time...
X, y, le = get_xy(market_train_df_cleaned, news_train_df_cleaned)

CPU times: user 29.3 s, sys: 16.3 s, total: 45.6 s
Wall time: 4min 28s


In [35]:
X.shape, y.shape

((3000000, 73), (3000000,))

In [37]:
# Save universe data for latter use
universe = market_train_df_cleaned['universe']
time = market_train_df_cleaned['time']

# Free memory
del market_train_df_cleaned, news_train_df_cleaned

In [38]:
X_ = X

In [39]:
X = X_

In [43]:
X.shape

(3000000, 73)

In [41]:
n_train = int(X.shape[0] * 0.8)
X_train, y_train = X.iloc[:n_train], y.iloc[:n_train]
X_valid, y_valid = X.iloc[n_train:], y.iloc[n_train:]

In [42]:
# For valid data, keep only those with universe > 0. This will help calculate the metric
u_valid = (universe.iloc[n_train:] > 0)
t_valid = time.iloc[n_train:]

X_valid = X_valid[u_valid]
y_valid = y_valid[u_valid]
t_valid = t_valid[u_valid]
del u_valid

In [None]:
##################################################################################3

In [None]:
# Creat lgb datasets
train_cols = X.columns.tolist()
categorical_cols = [] # ['assetCode', 'assetName', 'dayofweek', 'month']

In [None]:
# Note: y data is expected to be a pandas Series, as we will use its group_by function in `sigma_score`
dtrain = lgb.Dataset(X_train.values, y_train, feature_name=train_cols, categorical_feature=categorical_cols, free_raw_data=False)
dvalid = lgb.Dataset(X_valid.values, y_valid, feature_name=train_cols, categorical_feature=categorical_cols, free_raw_data=False)

In [None]:
# We will 'inject' an extra parameter in order to have access to df_valid['time'] inside sigma_score without globals
dvalid.params = {
    'extra_time': t_valid.factorize()[0]
}

In [None]:
lgb_params = dict(
    objective = 'regression_l1',
    learning_rate = 0.1,
    num_leaves = 127,
    max_depth = -1,
#     min_data_in_leaf = 1000,
#     min_sum_hessian_in_leaf = 10,
    bagging_fraction = 0.75,
    bagging_freq = 2,
    feature_fraction = 0.5,
    lambda_l1 = 0.0,
    lambda_l2 = 1.0,
    metric = 'None', # This will ignore the loss objetive and use sigma_score instead,
    seed = 42 # Change for better luck! :)
)

def sigma_score(preds, valid_data):
    df_time = valid_data.params['extra_time']
    labels = valid_data.get_label()
    
#    assert len(labels) == len(df_time)

    x_t = preds * labels #  * df_valid['universe'] -> Here we take out the 'universe' term because we already keep only those equals to 1.
    
    # Here we take advantage of the fact that `labels` (used to calculate `x_t`)
    # is a pd.Series and call `group_by`
    x_t_sum = x_t.groupby(df_time).sum()
    score = x_t_sum.mean() / x_t_sum.std()

    return 'sigma_score', score, True

evals_result = {}
m = lgb.train(lgb_params, dtrain, num_boost_round=1000, valid_sets=(dvalid,), valid_names=('valid',), verbose_eval=25,
              early_stopping_rounds=100, feval=sigma_score, evals_result=evals_result)


df_result = pd.DataFrame(evals_result['valid'])

In [None]:
ax = df_result.plot(figsize=(12, 8))
ax.scatter(df_result['sigma_score'].idxmax(), df_result['sigma_score'].max(), marker='+', color='red')

In [None]:
num_boost_round, valid_score = df_result['sigma_score'].idxmax()+1, df_result['sigma_score'].max()
print(lgb_params)
print(f'Best score was {valid_score:.5f} on round {num_boost_round}')

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(14, 14))
lgb.plot_importance(m, ax=ax[0])
lgb.plot_importance(m, ax=ax[1], importance_type='gain')
fig.tight_layout()

In [None]:
# Train full model
dtrain_full = lgb.Dataset(X, y, feature_name=train_cols, categorical_feature=categorical_cols)
model = lgb.train(lgb_params, dtrain, num_boost_round=num_boost_round)

In [None]:
def make_predictions(predictions_template_df, market_obs_df, news_obs_df, le):
    x, _ = get_x(market_obs_df, news_obs_df, le)
    predictions_template_df.confidenceValue = np.clip(model.predict(x), -1, 1)

In [None]:
days = env.get_prediction_days()

for (market_obs_df, news_obs_df, predictions_template_df) in days:
    make_predictions(predictions_template_df, market_obs_df, news_obs_df, le)
    env.predict(predictions_template_df)
print('Done!')

In [7]:
def get_all_stock_by_news_assetName(assetName,stocks=market_train_df_cleaned):
    # select all news releated to wal-mart
    News  = news_train_df_cleaned['assetName'].str.contains(assetName)
    # to get unique value from catigory
    CodesAsString = news_train_df_cleaned[News]['assetCodes'].unique()
    # convert value to object 
    Codes = ast.literal_eval(CodesAsString[0])
#     print(Codes)
    # get all assets has the same news assetcodes or any part 
    MarketStock = stocks["assetCode"].isin(Codes)
    del News, CodesAsString, Codes
    return stocks[MarketStock]
 
def get_all_stock_by_news_time(News, period ):
    """
    news : news item 
    period : the time range to get data in future and in the past 
    """
    NewsTime  = News['time']
    if(period < 0):
        # past
        time_limit   = pd.date_range(end  =NewsTime , periods=10, normalize = True )[0]
        stocks_by_period = market_train_df_cleaned[(market_train_df_cleaned['time'] > time_limit) &(market_train_df_cleaned['time'] <= NewsTime)]
    else:
        # future 
        time_limit = pd.date_range(start=NewsTime , periods=10, normalize = True )[-1]       
        stocks_by_period = market_train_df_cleaned[(market_train_df_cleaned['time'] > NewsTime) &( market_train_df_cleaned['time'] <= time_limit)]

    # select all news releated to News assetName
    stocks = get_all_stock_by_news_assetName(News['assetName'], stocks_by_period)
    del NewsTime, time_limit, stocks_by_period
    return stocks

In [5]:
market_train_df_cleaned.head(5)
news_train_df_cleaned.head(5)

Unnamed: 0,time,headline,urgency,takeSequence,subjects,audiences,headlineTag,assetCodes,assetName,relevance,...,noveltyCount12H,noveltyCount24H,noveltyCount3D,noveltyCount5D,noveltyCount7D,volumeCounts12H,volumeCounts24H,volumeCounts3D,volumeCounts5D,volumeCounts7D
0,2007-01-01 04:29:32+00:00,China's Daqing pumps 43.41 mln tonnes of oil i...,3,1,"{'ENR', 'ASIA', 'CN', 'NGS', 'EMRG', 'RTRS', '...","{'Z', 'O', 'OIL'}",,"{'0857.HK', '0857.F', '0857.DE', 'PTR.N'}",PetroChina Co Ltd,0.235702,...,0,0,0,0,0,0,0,3,6,7
1,2007-01-01 07:03:35+00:00,"FEATURE-In kidnapping, finesse works best",3,1,"{'FEA', 'CA', 'LATAM', 'MX', 'INS', 'ASIA', 'I...","{'PGE', 'PCO', 'G', 'ESN', 'MD', 'PCU', 'DNP',...",FEATURE,{'STA.N'},Travelers Companies Inc,0.447214,...,1,1,1,1,1,1,1,3,3,3
2,2007-01-01 11:29:56+00:00,PRESS DIGEST - Wall Street Journal - Jan 1,3,1,"{'RET', 'ENR', 'ID', 'BG', 'US', 'PRESS', 'IQ'...","{'T', 'DNP', 'PSC', 'U', 'D', 'M', 'RNP', 'PTD...",PRESS DIGEST,"{'WMT.DE', 'WMT.N'}",Wal-Mart Stores Inc,0.377964,...,0,0,0,0,0,0,0,5,11,17
3,2007-01-01 12:08:37+00:00,PRESS DIGEST - New York Times - Jan 1,3,1,"{'FUND', 'FIN', 'CA', 'SFWR', 'INS', 'PUB', 'B...","{'T', 'DNP', 'PSC', 'U', 'D', 'M', 'RNP', 'PTD...",PRESS DIGEST,"{'GOOG.O', 'GOOG.OQ', 'GOOGa.DE'}",Google Inc,0.149071,...,0,0,0,0,0,0,0,5,13,15
4,2007-01-01 12:08:37+00:00,PRESS DIGEST - New York Times - Jan 1,3,1,"{'FUND', 'FIN', 'CA', 'SFWR', 'INS', 'PUB', 'B...","{'T', 'DNP', 'PSC', 'U', 'D', 'M', 'RNP', 'PTD...",PRESS DIGEST,{'XMSR.O'},XM Satellite Radio Holdings Inc,0.149071,...,0,0,0,0,0,0,0,0,0,0


In [3]:
#Market (Done)

#LabelEncode AssetName 
labelencoder_market_assetname = LabelEncoder()
market_train_df_cleaned.iloc[:, 2] = labelencoder_market_assetname.fit_transform(market_train_df_cleaned.iloc[:, 2])

#LabelEncode AssetCode 
labelencoder_market_assetcode = LabelEncoder()
market_train_df_cleaned.iloc[:, 1] = labelencoder_market_assetcode.fit_transform(market_train_df_cleaned.iloc[:, 1])


#Eliminate Unused Attributes 
# we have to remove 'open','close', 'universe'
market_train_df_usedCols = market_train_df_cleaned[['time',	'assetCode',	'assetName',	'volume','open','close',
                          'returnsClosePrevRaw1',	'returnsOpenPrevRaw1',
                          'returnsClosePrevRaw10',	'returnsClosePrevMktres10',
                          'returnsOpenPrevMktres10',	'returnsOpenNextMktres10',	'universe']]

#market_train_df_usedCols.drops

# free memmory 
#del market_train_df_cleaned

#change time format
market_train_df_usedCols['time'] = pd.to_datetime(market_train_df_usedCols.time)
market_train_df_usedCols['time'] = market_train_df_usedCols['time'].dt.strftime('%Y-%m-%d')

#to decode the target label
#ex: list(labelencoder_news_assetname.inverse_transform([6330]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [5]:
#News (Done)

#LabelEncode AssetName 
labelencoder_news_assetname = LabelEncoder()
news_train_df_cleaned.iloc[:, 8] = labelencoder_news_assetname.fit_transform(news_train_df_cleaned.iloc[:, 8])

#LabelEncode AssetCode 
labelencoder_news_assetcode = LabelEncoder()
news_train_df_cleaned.iloc[:, 7] = labelencoder_news_assetcode.fit_transform(news_train_df_cleaned.iloc[:, 7])

#LabelEncode Subjects 
labelencoder_news_subjects = LabelEncoder()
news_train_df_cleaned.iloc[:, 4] = labelencoder_news_subjects.fit_transform(news_train_df_cleaned.iloc[:, 4])

#LabelEncode Audiences 
labelencoder_news_audiences = LabelEncoder()
news_train_df_cleaned.iloc[:, 5] = labelencoder_news_audiences.fit_transform(news_train_df_cleaned.iloc[:, 5])

#Eliminate Unused Attributes 
news_train_df_usedCols = news_train_df_cleaned[['time',	'urgency',	'subjects'
                      ,	'audiences',		'assetCodes',	'assetName'
                      ,	'relevance',	'sentimentClass',	'sentimentNegative',	
                      'sentimentNeutral',	'sentimentPositive','noveltyCount12H','noveltyCount24H',
                      'noveltyCount3D',	'noveltyCount5D',	'noveltyCount7D',	'volumeCounts12H',
                      'volumeCounts24H',	'volumeCounts3D',	'volumeCounts5D',	'volumeCounts7D','volumeCounts7D']]

# free Memmory 
#del news_train_df_cleaned

#change time format
news_train_df_usedCols['time'] =pd.to_datetime(news_train_df_usedCols.time)
news_train_df_usedCols['time'] = news_train_df_usedCols['time'].dt.strftime('%Y-%m-%d')


#Get Number of Unique items in attribute
#ex: news_train_df_usedCols['subjects'].unique()

#to decode the target label
#ex: list(labelencoder_news_assetname.inverse_transform([6330]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [4]:
market_train_df_usedCols.to_csv("market.csv", sep='\t')

In [6]:
news_train_df_usedCols.to_csv("news.csv", sep='\t')

In [11]:
merged_dataframes_bydate = dk.dataframe.merge(news_train_df_usedCols, market_train_df_usedCols, left_on='time', right_on='time' )

AttributeError: module 'dask' has no attribute 'dataframe'

In [12]:
import dask.dataframe as dd

ImportError: cannot import name 'is_datetime64tz_dtype'

In [None]:
#Input: n , Output: 1
n_input = #no. of attributes
lr = 0.001
epochs = 1000
batch_size = 32


#Step 01: Spliting data (trainset, testset)
#x: all IV attributes after merging 
#y: DV (returnsOpenNextMktres10)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

#Step 02:Feature Scaling
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

#Step 03: Model
model = Sequential()

model.add(Dense(30, input_dim=n_input, activation='relu'))
model.add(Dense(40, activation='relu'))
model.add(Dense(60, activation='relu'))
model.add(Dense(40, activation='relu'))
model.add(Dense(30, activation='relu'))
model.add(Dense(1, activation='linear'))

#Step 04: Compile and Save
model.compile(loss='mse', optimizer=Adam (lr=lr))
model.fit(X, y, epochs=1000, verbose=0)
model.save("model.h5")

#Step 05: Prediction 
ynew = model.predict(Xnew)


In [None]:
print("Total Number of rows:", market_train_df_usedCols.shape[0] )
market_train_df_usedCols.isnull().sum()

In [69]:
import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt
import gc
from sklearn.preprocessing import LabelEncoder

In [70]:
gc.enable()

In [71]:
news_train_df_cleaned = pd.read_csv("./datasets/news_sample.csv")
market_train_df_cleaned = pd.read_csv("./datasets/marketdata_sample.csv")

In [72]:
news_train_df_cleaned['time'] =pd.to_datetime(news_train_df_cleaned.time)
news_train_df_cleaned['time'] = news_train_df_cleaned['time'].dt.strftime('%Y-%m-%d')

market_train_df_cleaned['time'] =pd.to_datetime(market_train_df_cleaned.time)
market_train_df_cleaned['time'] = market_train_df_cleaned['time'].dt.strftime('%Y-%m-%d')

In [73]:
#Market
market_train_df_cleaned.head(5)

Unnamed: 0,time,assetCode,assetName,universe,volume,close,open,returnsClosePrevRaw1,returnsOpenPrevRaw1,returnsClosePrevMktres1,returnsOpenPrevMktres1,returnsClosePrevRaw10,returnsOpenPrevRaw10,returnsClosePrevMktres10,returnsOpenPrevMktres10,returnsOpenNextMktres10
0,2007-02-01,A.N,Agilent Technologies Inc,1.0,2606900.0,32.19,32.17,0.005938,0.005312,,,-0.00186,0.000622,,,0.034672
1,2007-02-01,AAI.N,AirTran Holdings Inc,0.0,2051600.0,11.12,11.08,0.004517,-0.007168,,,-0.078708,-0.088066,,,0.027803
2,2007-02-01,AAP.N,Advance Auto Parts Inc,1.0,1164800.0,37.51,37.99,-0.011594,0.025648,,,0.014332,0.045405,,,0.024433
3,2007-02-01,AAPL.O,Apple Inc,1.0,23747329.0,84.74,86.23,-0.011548,0.016324,,,-0.048613,-0.037182,,,-0.007425
4,2007-02-01,ABB.N,ABB Ltd,1.0,1208600.0,18.02,18.01,0.011791,0.025043,,,0.012929,0.020397,,,-0.017994


In [74]:
news_train_df_cleaned.head(5)

Unnamed: 0,time,sourceTimestamp,firstCreated,sourceId,headline,urgency,takeSequence,provider,subjects,audiences,...,noveltyCount12H,noveltyCount24H,noveltyCount3D,noveltyCount5D,noveltyCount7D,volumeCounts12H,volumeCounts24H,volumeCounts3D,volumeCounts5D,volumeCounts7D
0,2007-01-01,2007-01-01 04:29:32+00:00,2007-01-01 04:29:32+00:00,e58c6279551b85cf,China's Daqing pumps 43.41 mln tonnes of oil i...,3,1,RTRS,"{'CRU', 'CN', 'RTRS', 'ENR', 'LEN', 'EMRG', 'N...","{'O', 'Z', 'OIL'}",...,0,0,0,0,0,0,0,3,6,7
1,2007-01-01,2007-01-01 07:03:34+00:00,2007-01-01 07:03:34+00:00,5a31c4327427f63f,"FEATURE-In kidnapping, finesse works best",3,1,RTRS,"{'BD', 'INS', 'LATAM', 'CA', 'US', 'MX', 'IL',...","{'PGE', 'PCU', 'PCO', 'DNP', 'MD', 'E', 'G', '...",...,1,1,1,1,1,1,1,3,3,3
2,2007-01-01,2007-01-01 11:29:56+00:00,2007-01-01 11:29:56+00:00,1cefd27a40fabdfe,PRESS DIGEST - Wall Street Journal - Jan 1,3,1,RTRS,"{'IQ', 'RO', 'US', 'ID', 'RET', 'RTRS', 'ENR',...","{'M', 'PMF', 'DNP', 'PTD', 'T', 'E', 'D', 'PSC...",...,0,0,0,0,0,0,0,5,11,17
3,2007-01-01,2007-01-01 12:08:37+00:00,2007-01-01 12:08:37+00:00,23768af19dc69992,PRESS DIGEST - New York Times - Jan 1,3,1,RTRS,"{'PUB', 'BUS', 'INS', 'CA', 'ENT', 'US', 'FIN'...","{'M', 'PMF', 'DNP', 'PTD', 'T', 'E', 'D', 'PSC...",...,0,0,0,0,0,0,0,5,13,15
4,2007-01-01,2007-01-01 12:08:37+00:00,2007-01-01 12:08:37+00:00,23768af19dc69992,PRESS DIGEST - New York Times - Jan 1,3,1,RTRS,"{'PUB', 'BUS', 'INS', 'CA', 'ENT', 'US', 'FIN'...","{'M', 'PMF', 'DNP', 'PTD', 'T', 'E', 'D', 'PSC...",...,0,0,0,0,0,0,0,0,0,0


In [77]:
#Market (Done)

#LabelEncode AssetName 
labelencoder_market_assetname = LabelEncoder()
market_train_df_cleaned.iloc[:, 2] = labelencoder_market_assetname.fit_transform(market_train_df_cleaned.iloc[:, 2])

#LabelEncode AssetCode 
labelencoder_market_assetcode = LabelEncoder()
market_train_df_cleaned.iloc[:, 1] = labelencoder_market_assetcode.fit_transform(market_train_df_cleaned.iloc[:, 1])


#Eliminate Unused Attributes 
market_train_df_usedCols = market_train_df_cleaned [['time','assetCode','assetName','volume','open','close',
                          'returnsClosePrevRaw1','returnsOpenPrevRaw1',
                          'returnsClosePrevRaw10','returnsClosePrevMktres10',
                          'returnsOpenPrevMktres10','returnsOpenNextMktres10','universe']]


#change time format
market_train_df_usedCols['time'] =pd.to_datetime(market_train_df_usedCols.time)
market_train_df_usedCols['time'] = market_train_df_usedCols['time'].dt.strftime('%Y-%m-%d')


market_train_df_usedCols.head(5)

#to decode the target label
#ex: list(labelencoder_news_assetname.inverse_transform([6330]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,time,assetCode,assetName,volume,open,close,returnsClosePrevRaw1,returnsOpenPrevRaw1,returnsClosePrevRaw10,returnsClosePrevMktres10,returnsOpenPrevMktres10,returnsOpenNextMktres10,universe
0,2007-02-01,0,28,2606900.0,32.17,32.19,0.005938,0.005312,-0.00186,,,0.034672,1.0
1,2007-02-01,1,31,2051600.0,11.08,11.12,0.004517,-0.007168,-0.078708,,,0.027803,0.0
2,2007-02-01,2,18,1164800.0,37.99,37.51,-0.011594,0.025648,0.014332,,,0.024433,1.0
3,2007-02-01,3,74,23747329.0,86.23,84.74,-0.011548,0.016324,-0.048613,,,-0.007425,1.0
4,2007-02-01,4,3,1208600.0,18.01,18.02,0.011791,0.025043,0.012929,,,-0.017994,1.0


In [91]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(market_train_df_usedCols.iloc[:,1:9]
                                                    , market_train_df_usedCols.iloc[:,11], test_size = 0.2, random_state = 0)

In [92]:
#PCA
from sklearn.decomposition import PCA
pca = PCA(n_components = None)

In [93]:
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
explained_variance = pca.explained_variance_ratio_

In [97]:
explained_variance

array([1.00000000e+00, 2.88756771e-11, 2.68013671e-11, 2.35394495e-11,
       4.11406523e-15, 1.16211253e-16, 1.59337557e-17, 3.05099821e-18])

In [79]:
#News (Done)

#LabelEncode AssetName 
labelencoder_news_assetname = LabelEncoder()
news_train_df_cleaned.iloc[:, 8] = labelencoder_news_assetname.fit_transform(news_train_df_cleaned.iloc[:, 8])

#LabelEncode AssetCode 
labelencoder_news_assetcode = LabelEncoder()
news_train_df_cleaned.iloc[:, 7] = labelencoder_news_assetcode.fit_transform(news_train_df_cleaned.iloc[:, 7])

#LabelEncode Subjects 
labelencoder_news_subjects = LabelEncoder()
news_train_df_cleaned.iloc[:, 4] = labelencoder_news_subjects.fit_transform(news_train_df_cleaned.iloc[:, 4])

#LabelEncode Audiences 
labelencoder_news_audiences = LabelEncoder()
news_train_df_cleaned.iloc[:, 5] = labelencoder_news_audiences.fit_transform(news_train_df_cleaned.iloc[:, 5])

#Eliminate Unused Attributes 
news_train_df_usedCols = news_train_df_cleaned[['time','urgency','subjects'
                      ,'audiences','assetCodes','assetName'
                      ,'relevance','sentimentClass','sentimentNegative',
                      'sentimentNeutral','sentimentPositive','volumeCounts7D']]

#change time format
news_train_df_usedCols['time'] =pd.to_datetime(news_train_df_usedCols.time)
news_train_df_usedCols['time'] = news_train_df_usedCols['time'].dt.strftime('%Y-%m-%d')

news_train_df_cleaned.head(5)

#Get Number of Unique items in attribute
#ex: news_train_df_usedCols['subjects'].unique()

#to decode the target label
#ex: list(labelencoder_news_assetname.inverse_transform([6330]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,time,sourceTimestamp,firstCreated,sourceId,headline,urgency,takeSequence,provider,subjects,audiences,...,noveltyCount12H,noveltyCount24H,noveltyCount3D,noveltyCount5D,noveltyCount7D,volumeCounts12H,volumeCounts24H,volumeCounts3D,volumeCounts5D,volumeCounts7D
0,2007-01-01,2007-01-01 04:29:32+00:00,2007-01-01 04:29:32+00:00,e58c6279551b85cf,5,1,1,3,10,"{'O', 'Z', 'OIL'}",...,0,0,0,0,0,0,0,3,6,7
1,2007-01-01,2007-01-01 07:03:34+00:00,2007-01-01 07:03:34+00:00,5a31c4327427f63f,8,1,1,3,6,"{'PGE', 'PCU', 'PCO', 'DNP', 'MD', 'E', 'G', '...",...,1,1,1,1,1,1,1,3,3,3
2,2007-01-01,2007-01-01 11:29:56+00:00,2007-01-01 11:29:56+00:00,1cefd27a40fabdfe,22,1,1,3,23,"{'M', 'PMF', 'DNP', 'PTD', 'T', 'E', 'D', 'PSC...",...,0,0,0,0,0,0,0,5,11,17
3,2007-01-01,2007-01-01 12:08:37+00:00,2007-01-01 12:08:37+00:00,23768af19dc69992,21,1,1,3,36,"{'M', 'PMF', 'DNP', 'PTD', 'T', 'E', 'D', 'PSC...",...,0,0,0,0,0,0,0,5,13,15
4,2007-01-01,2007-01-01 12:08:37+00:00,2007-01-01 12:08:37+00:00,23768af19dc69992,21,1,1,3,36,"{'M', 'PMF', 'DNP', 'PTD', 'T', 'E', 'D', 'PSC...",...,0,0,0,0,0,0,0,0,0,0


In [13]:
#table 1
raw_data = {
        'SameDay': ['1', '1', '1', '1'],
        'news': ["n0", "n1", "n2", "n3"]}
df_n = pd.DataFrame(raw_data, columns = [ 'SameDay','news'])
df_n

Unnamed: 0,SameDay,news
0,1,n0
1,1,n1
2,1,n2
3,1,n3


In [14]:
raw_data = {
        'SameDay': ['1', '1', '1', '1'],
        'stock': ["st0", "st1", "st2", "st3"]}
df_m = pd.DataFrame(raw_data, columns = [ 'SameDay','stock'])
df_m

Unnamed: 0,SameDay,stock
0,1,st0
1,1,st1
2,1,st2
3,1,st3


In [15]:
newdf = pd.merge(df_n, df_m,left_on='SameDay', right_on='SameDay')

In [16]:
newdf

Unnamed: 0,SameDay,news,stock
0,1,n0,st0
1,1,n0,st1
2,1,n0,st2
3,1,n0,st3
4,1,n1,st0
5,1,n1,st1
6,1,n1,st2
7,1,n1,st3
8,1,n2,st0
9,1,n2,st1


In [68]:
labelencoder_X_1 = LabelEncoder()
newdf.iloc[:, 2] = labelencoder_X_1.fit_transform(newdf.iloc[:, 2])
newdf

  if diff:


['st2', 'st2', 'st1']

In [40]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [55]:
# Importing the dataset
dataset = pd.read_csv('./datasets/Churn_Modelling.csv')
X = dataset.iloc[:, 3:13].values
y = dataset.iloc[:, 13].values

In [56]:
X[0]

array([619, 'France', 'Female', 42, 2, 0.0, 1, 1, 1, 101348.88],
      dtype=object)

In [None]:
# Encoding categorical data
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])

In [60]:
X[0:60, 1]

array([0, 2, 0, 0, 2, 2, 0, 1, 0, 0, 0, 2, 0, 0, 2, 1, 1, 2, 2, 0, 0, 2,
       2, 0, 0, 0, 1, 0, 1, 0, 2, 0, 1, 2, 2, 0, 2, 2, 0, 1, 2, 0, 0, 0,
       2, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 2, 0], dtype=object)

In [42]:
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])
X[:, 2]

array([0, 0, 0, ..., 0, 1, 0], dtype=object)

In [45]:
onehotencoder = OneHotEncoder(categorical_features = [1])
X = onehotencoder.fit_transform(X).toarray()
X = X[:, 1:]

array([0.0000000e+00, 0.0000000e+00, 6.1900000e+02, 0.0000000e+00,
       4.2000000e+01, 2.0000000e+00, 0.0000000e+00, 1.0000000e+00,
       1.0000000e+00, 1.0000000e+00, 1.0134888e+05])

In [None]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Part 2 - Now let's make the ANN!

# Importing the Keras libraries and packages
import keras
from keras.models import Sequential
from keras.layers import Dense

# Initialising the ANN
classifier = Sequential()

# Adding the input layer and the first hidden layer
classifier.add(Dense(output_dim = 6, init = 'uniform', activation = 'relu', input_dim = 11))

# Adding the second hidden layer
classifier.add(Dense(output_dim = 6, init = 'uniform', activation = 'relu'))

# Adding the output layer
classifier.add(Dense(output_dim = 1, init = 'uniform', activation = 'sigmoid'))

# Compiling the ANN
classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

# Fitting the ANN to the Training set
classifier.fit(X_train, y_train, batch_size = 10, nb_epoch = 100)

# Part 3 - Making the predictions and evaluating the model

# Predicting the Test set results
y_pred = classifier.predict(X_test)
y_pred = (y_pred > 0.5)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)