In [2]:
import librosa
import Signal_Analysis.features.signal as sig
import numpy as np
import math
import os
import pickle
import re
import soundfile as sf
from googlesearch import search
import yfinance as yf
from dateutil import parser
import datetime
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [3]:
def find_nth(haystack, needle, n):
    start = haystack.find(needle)
    while start >= 0 and n > 1:
        start = haystack.find(needle, start+len(needle))
        n -= 1
    return start

In [None]:
#compute dict of {date:[{company:{feature:value}}]}
data_master = {}
num_calls = 0
for root, dirs, files in os.walk("ReleasedDataset_mp3"):
    for directory in dirs:
        try:
            if (directory == 'Audio'):
                continue
            company_name = directory[:directory.index('_')]
            date = directory[directory.index('_')+1:]
            num_calls += 1
            print (num_calls)

            audio_list = []
            sr = 0
            #create feature map
            for root, dirs, files in os.walk("ReleasedDataset_mp3/"+directory+'/Audio'):
                

                files_sorted = {}
                for file in files:
                    first_val = file[file.index('_')+1:find_nth(file, '_', 2)]
                    second_val = file[find_nth(file, '_', 2)+1: find_nth(file, '.', 2)]
                    first_val = int(re.sub("[^0-9]", "", first_val))
                    second_val = int(re.sub("[^0-9]", "", second_val))
                    file_index = (first_val, second_val)
                    files_sorted[file_index] = file

                for file_key in sorted(files_sorted.keys()):
                    file = files_sorted[file_key]
                    audio_path = "ReleasedDataset_mp3/"+directory+'/Audio/' + file
                    signal, sr = librosa.load(audio_path)
                    audio_list.append(signal)
            audio_list = np.concatenate(audio_list)
            sf.write('audio_list.wav', audio_list, sr)
            data, sr = sf.read('audio_list.wav')

            audio_features = compute_audio_features(data, sr)
            company_dict = {company_name:audio_features}




            if (date not in data_master.keys()):
                data_master[date] = [company_dict]
            else:
                data_master[date].append(company_dict)
            with open('data_master.pickle', 'wb') as handle:
                pickle.dump(data_master, handle, protocol=pickle.HIGHEST_PROTOCOL)
        except:
            continue

In [None]:
data_master

In [4]:
import json
with open('master_features.txt', 'r') as file:
    data_master = json.load(file)

In [5]:
data_master

{'20170803': [{'Becton Dickinson': {'minimum_pitch': 216.75435,
    'maximum_pitch': 3595.1587,
    'median_f0': [[103.16901408450703]],
    'number_of_pulses': 407,
    'jitter_local': 0.050096107778580694,
    'jitter_local_absolute': 0.00042484750853803553,
    'jitter_rap': 0.026665996818046236,
    'jitter_ppq5': 0.028850062408851376,
    'jitter_ddp': 0.0799979904541387,
    'max_db': 58.66719853886681,
    'min_db': 20.09671816966296,
    'mean_db': 35.013663136723174,
    'HNR': 6.097161801484113,
    'VADER_neg': 0.021,
    'VADER_neu': 0.844,
    'VADER_pos': 0.135,
    'VADER_compound': 0.9999,
    'finBERT_pos': 0.5864349007606506,
    'finBERT_neg': -0.3480337858200073,
    'finBERT_neu': -0.7337027192115784}},
  {'Ball Corp': {'minimum_pitch': 154.0751,
    'maximum_pitch': 3318.2476,
    'median_f0': [[117.5133689839572]],
    'number_of_pulses': 755,
    'jitter_local': 0.05932506437639114,
    'jitter_local_absolute': 0.0004182898411798934,
    'jitter_rap': 0.03077454

In [6]:
def compute_audio_features(audio_list, sr):
    features_dict = {}
    
    F_0 = sig.get_F_0(signal, sr)
    jitter= sig.get_Jitter(signal, sr)
    pulses = sig.get_Pulses(signal, sr)

    #Pitch analysis
    pitches, magnitudes = librosa.core.piptrack(y=signal, sr=sr)
    pitches_to_use = []
    for t in range (magnitudes.shape[1]):
        index = magnitudes[:, t].argmax()
        pitch = pitches[index, t]
        pitches_to_use.append(pitch)

    minimum_pitch = np.amin(pitches_to_use)
    features_dict ['minimum_pitch'] = minimum_pitch

    maximum_pitch = np.amax(pitches_to_use)
    features_dict ['maximum_pitch'] = maximum_pitch

    median_F0 = F_0
    features_dict ['median_f0'] = [median_F0]

    number_of_pulses = len(pulses)
    features_dict ['number_of_pulses'] = number_of_pulses
        

    #Voice analysis
        
    jitter_local = jitter['local']
    features_dict ['jitter_local'] = jitter_local
        
    jitter_local_absolute = jitter['local, absolute']
    features_dict ['jitter_local_absolute'] = jitter_local_absolute
        
    jitter_rap = jitter['rap']
    features_dict ['jitter_rap'] = jitter_rap
        
    jitter_ppq5 = jitter['ppq5']
    features_dict ['jitter_ppq5'] = jitter_ppq5
        
    jitter_ddp = jitter['ddp']
    features_dict ['jitter_ddp'] = jitter_ddp
      

    #intensity analysis
    numchunks = 100
    chunks = np.array_split(signal, numchunks)
    dbs = [-20*math.log10( math.sqrt(np.mean(chunk**2)) ) for chunk in chunks]

    mean_db = 0
    count = 0
    min_db = float('inf')
    max_db = float('-inf')
    for db in dbs:
        count += 1
        mean_db += db
        if (db > max_db):
            max_db = db
        elif (db < min_db):
            min_db = db
    mean_db = mean_db/count
    features_dict ['max_db'] = max_db
    features_dict ['min_db'] = min_db
    
    features_dict ['mean_db'] = mean_db

    #harmonicity analysis
    HNR = sig.get_HNR (signal, sr)
    features_dict ['HNR'] = HNR
    
#     features_dict['median_f0'] = np.median(features_dict['median_f0'])
    #features_dict['mean_db'] = np.mean(features_dict['mean_db'])
    return features_dict

In [13]:
#get tickers
#compute dict of {date:[{company:{feature:value}}]}
company_names_to_tickers = {}
num_calls = 0
for root, dirs, files in os.walk("ReleasedDataset_mp3"):
    for directory in dirs:
        try:
            if (directory == 'Audio'):
                continue
            company_name = directory[:directory.index('_')]
            date = directory[directory.index('_')+1:]
            ticker = name_convert(company_name)
            company_names_to_tickers [company_name] = ticker
        except:
            continue

In [14]:
company_names_to_tickers

{'Becton Dickinson': 'BDX',
 'CIGNA Corp.': 'CI',
 'The Bank of New York Mellon Corp.': 'bk',
 'JPMorgan Chase & Co.': 'JPM',
 'Martin Marietta Materials': 'MLM',
 'Advanced Micro Devices Inc': 'AMD',
 'Xcel Energy Inc': 'XEL',
 'Electronic Arts': 'EA',
 'NiSource Inc.': 'NI',
 'Salesforce.com': 'CRM',
 'Lilly (Eli) & Co.': 'LLY',
 'Synopsys Inc.': 'SNPS',
 'Illumina Inc': 'ILMN',
 'Cboe Global Markets': 'CBOE',
 'Gilead Sciences': 'GILD',
 'Polo Ralph Lauren Corp.': 'RL',
 'Waste Management Inc.': 'WM',
 'Emerson Electric Company': 'EMR',
 'Avery Dennison Corp': 'AVY',
 'News Corp. Class A': 'NWS',
 'Ecolab Inc.': 'ECL',
 'Alliance Data Systems': 'ADS',
 'Alaska Air Group Inc': 'ALK',
 'CenturyLink Inc': 'LUMN',
 'Intuit Inc.': 'INTU',
 'Kraft Heinz Co': 'KHC',
 'Comcast Corp.': 'CMCSA',
 'Illinois Tool Works': 'ITW',
 'Fortive Corp': 'FTV',
 'PG&E Corp.': 'PG',
 'Molson Coors Brewing Company': 'TAP',
 'Mattel Inc.': 'MAT',
 'Caterpillar Inc.': 'CAT',
 'Sealed Air': 'SEE',
 'Tiffany &

In [12]:
def name_convert(self):

    searchval = 'yahoo finance '+self
    link = []
    #limits to the first link
    for url in search(searchval, lang='en', num_results=1):
        link.append(url)

    link = str(link[0])
    link=link.split("/")
    if link[-1]=='':
        ticker=link[-2]
    else:
        x=link[-1].split('=')
        ticker=x[-1]

    return(ticker)

In [15]:
#manually change some
company_names_to_tickers['Alliance Data Systems'] = 'ADS'
company_names_to_tickers['Tiffany & Co.'] = 'CELG'
company_names_to_tickers['SunTrust Banks'] = 'STI'
company_names_to_tickers['General Growth Properties Inc.'] = 'GGP'
company_names_to_tickers['AT&T Inc.'] = 'T'
company_names_to_tickers['Noble Energy Inc'] = 'NBLX'
company_names_to_tickers.pop('.ipynb', None)

In [16]:
len(company_names_to_tickers.keys())

274

In [17]:
ticker_string = ''
for company_name in company_names_to_tickers.keys():
    ticker = company_names_to_tickers[company_name]
    ticker_string += ticker + ' '
ticker_string = ticker_string[:len(ticker_string)-1]
price_data = yf.download(ticker_string, start="2017-01-01", end="2017-12-31")

[*********************100%***********************]  275 of 275 completed

11 Failed downloads:
- GGP: No data found for this date range, symbol may be delisted
- CELG: No data found, symbol may be delisted
- +: No data found, symbol may be delisted
- STI: No data found, symbol may be delisted
- KORS: No data found for this date range, symbol may be delisted
- RHI.F: No data found, symbol may be delisted
- CBS: No data found, symbol may be delisted
- HISTORY: No data found, symbol may be delisted
- AGN: No data found, symbol may be delisted
- XL: Data doesn't exist for startDate = 1483257600, endDate = 1514707200
- UTX: No data found, symbol may be delisted


In [18]:
#drop holidays
holidays = ['January 2, 2017', 'January 16, 2017', 'February 20, 2017', 'April 14, 2017', 'May 29, 2017', 
            'July 3, 2017', 'July 4, 2017', 'September 4, 2017', 'November 23, 2017', 'November 24, 2017', 
            'December 25, 2017']
parsed_holidays = [parser.parse(holiday) for holiday in holidays]   
drop_indices = []
for index in price_data.index:
    if (index in parsed_holidays):
        drop_indices.append(index)

In [19]:
price_data = price_data.drop(index = drop_indices)
price_data = price_data.dropna(axis = 1, how = 'all')

In [23]:
look_ahead_periods = [1, 3, 7, 14, 21, 28]
returns_data_master = {}
for date in list(data_master.keys()):
    try:
        returns_data_master[date] = []
        
        parsed_date = parser.parse(date)
        buy_date_index = price_data.index.get_loc(parsed_date) + 1
        buy_date = price_data.index[buy_date_index]
        company_list = data_master[date]
        for entry in company_list:
            company_name = list(entry.keys())[0]
            
            ticker = company_names_to_tickers[company_name]

            buy_price = price_data['Open'][ticker][buy_date]
            sell_prices = []

            for look_ahead in look_ahead_periods:
                sell_date_index = buy_date_index + look_ahead
                sell_date = price_data.index[sell_date_index]
                sell_price = price_data['Open'][ticker][sell_date]
                sell_prices.append(sell_price)

            returns = []
            for sell_price in sell_prices:
                returns.append((sell_price- buy_price)/buy_price)

            returns_data_master[date].append({company_name:returns})
    except:
        continue
    
            
            
    

In [24]:
combined_data = {}
for date in sorted(returns_data_master.keys()):
    company_list = returns_data_master[date]
    combined_data[date] = []
    for entry in company_list:
        company_name = list(entry.keys())[0]
        company_return = entry[company_name]
        features_list = data_master[date]
        for feature_entry in features_list:
            feature_company_name = list(feature_entry.keys())[0]
            if (feature_company_name == company_name):
                features = list(feature_entry[feature_company_name].values())
                combined_data[date].append([features, company_return])

In [25]:
for key in combined_data.keys():
    entry = combined_data[key]
    for pair in entry:
        features = pair[0]
        features[2] = features[2][0][0]

In [26]:
df_final = pd.DataFrame(columns = ['minimum_pitch','maximum_pitch','median_f0', 'number_of_pulses', 'jitter_local', 
                            'jitter_local_absolute', 'jitter_rap', 'jitter_ppq5', 'jitter_ddp', 'max_db', 
                            'min_db', 'mean_db', 'HNR', 'VADER_neg','VADER_neu','VADER_pos','VADER_compound',
                            'finBERT_pos', 'finBERT_neg','finBERT_neu','1_day_ret','3_day_ret', '7_day_ret', '14_day_ret', 
                            '21_day_ret', '28_day_ret'])

In [27]:
df_final = []
for date in sorted(combined_data.keys()):
    entry_list = combined_data[date]
    for entry in entry_list:
        features = entry[0]
        rets = entry[1]
        row = list(np.concatenate([features, rets]))
        df_final.append(row)

In [28]:
df_final = pd.DataFrame(df_final, columns = ['minimum_pitch','maximum_pitch','median_f0', 'number_of_pulses', 'jitter_local', 
                            'jitter_local_absolute', 'jitter_rap', 'jitter_ppq5', 'jitter_ddp', 'max_db', 
                            'min_db', 'mean_db', 'HNR', 'VADER_neg','VADER_neu','VADER_pos','VADER_compound',
                            'finBERT_pos', 'finBERT_neg','finBERT_neu','1_day_ret','3_day_ret', '7_day_ret', '14_day_ret', 
                            '21_day_ret', '28_day_ret'])

In [None]:
target_periods = ['1_day_ret','3_day_ret', '7_day_ret', '14_day_ret', 
                            '21_day_ret', '28_day_ret']
for target_period in target_periods:
    df_final[target_period] = df_final[target_period].apply(lambda x: x >0)

In [29]:
df_final = df_final.dropna(axis = 0, how = 'any')

In [32]:
pd.to_pickle(df_final, "df_final.pkl")

In [None]:
#correlation visualization
sns.heatmap(df_final.corr())

In [None]:
#random forest
train_idx = int(df_final.shape[0]*.8)
train_data = df_final[:train_idx]
test_data = df_final[train_idx:]

In [None]:
#eval baseline
features = train_data[['jitter_local_absolute']]

feature_list = list(features.columns)
features = np.array(features)

target_periods = ['1_day_ret','3_day_ret', '7_day_ret', '14_day_ret', 
                            '21_day_ret', '28_day_ret']

test_features =  test_data[['jitter_local_absolute']]

test_features = np.array(test_features)

for target_period in target_periods:
    print ('target period: ')
    print (target_period)
    print ()
    #fit
    
    target = np.array(train_data[target_period])
    
    rf = RandomForestClassifier(n_estimators = 1000, random_state = 42)
    rf.fit(features, target)
    
    #predict on test
    test_target = np.array(test_data[target_period])
    predictions = rf.predict(test_features)
    
    f1 = f1_score(predictions, test_target)
    accuracy = accuracy_score (predictions, test_target)
    print ('f1: ')
    print (f1)
    print()
    
    print ('accuracy: ')
    print (accuracy)
    print()
    

In [None]:
#eval full model
features = train_data[['minimum_pitch','maximum_pitch','median_f0', 'number_of_pulses', 'jitter_local', 
                            'jitter_local_absolute', 'jitter_rap', 'jitter_ppq5', 'jitter_ddp', 'max_db', 
                            'min_db', 'mean_db', 'HNR']]

feature_list = list(features.columns)
features = np.array(features)

target_periods = ['1_day_ret','3_day_ret', '7_day_ret', '14_day_ret', 
                            '21_day_ret', '28_day_ret']

test_features =  test_data[['minimum_pitch','maximum_pitch','median_f0', 'number_of_pulses', 'jitter_local', 
                            'jitter_local_absolute', 'jitter_rap', 'jitter_ppq5', 'jitter_ddp', 'max_db', 
                            'min_db', 'mean_db', 'HNR']]

test_features = np.array(test_features)

for target_period in target_periods:
    print ('target period: ')
    print (target_period)
    print ()
    #fit
    
    target = np.array(train_data[target_period])
    
    rf = RandomForestClassifier(n_estimators = 1000, random_state = 42)
    rf.fit(features, target)
    
    #predict on test
    test_target = np.array(test_data[target_period])
    predictions = rf.predict(test_features)
    
    f1 = f1_score(predictions, test_target)
    accuracy = accuracy_score (predictions, test_target)
    print ('f1: ')
    print (f1)
    print()
    
    print ('accuracy: ')
    print (accuracy)
    print()
    
    print ('feature importances: ')
    #feature importances
    importances = rf.feature_importances_
    plt.rcParams["figure.figsize"] = (40, 20)
    plt.bar(feature_list, importances)
    plt.show()
    
    