In [16]:
import pandas
from dateutil.relativedelta import relativedelta
from pandas.tseries.offsets import DateOffset

In [17]:
def read_csv_data(file_path):
    return pandas.read_csv(file_path)

def merge_data(left_data, right_data, left_key, right_key):
    return left_data.merge(right_data[[right_key, 'Symbol']], left_on=left_key, right_on=right_key)

def drop_column(data, column_name):
    return data.drop([column_name], axis=1)

def convert_to_month_period(data, column_name):
    data[column_name] = pandas.to_datetime(data[column_name]).dt.to_period('M')
    return data

def get_stock_price_at_time(data, date, stock):
    for i in range(len(data)):
        if data['Date'][i] == date:
            return data[stock][i]
    return None

def get_stock_price_after_months(data, date, months, stock):
    for i in range(len(data)):
        if data['Date'][i] - months == date:
            return data[stock][i]
    return None

def calculate_price_change_percent(current_price, previous_price):
    return (previous_price - current_price) / current_price

def get_prices(snpFT, stockPrices):
    at_time_stock_price = []
    one_month_stock_price = []
    three_month_stock_price = []
    six_month_stock_price = []
    for i in range(len(snpFT)):
        date = snpFT['Date'][i]
        stock = snpFT['Symbol'][i]
        at_time_price = get_stock_price_at_time(stockPrices, date, stock)
        one_month_price = get_stock_price_after_months(stockPrices, date, 1, stock)
        three_month_price = get_stock_price_after_months(stockPrices, date, 3, stock)
        six_month_price = get_stock_price_after_months(stockPrices, date, 6, stock)
        at_time_stock_price.append(at_time_price)
        one_month_stock_price.append(one_month_price)
        three_month_stock_price.append(three_month_price)
        six_month_stock_price.append(six_month_price)
    snpFT['Stock Price'] = at_time_stock_price
    snpFT['1 Month SP'] = one_month_stock_price
    snpFT['3 Month SP'] = three_month_stock_price
    snpFT['6 Month SP'] = six_month_stock_price
    snpFT['1 Month SP%'] = (snpFT['1 Month SP'] - snpFT['Stock Price']) / snpFT['Stock Price']
    snpFT['3 Month SP%'] = (snpFT['3 Month SP'] - snpFT['Stock Price']) / snpFT['Stock Price']
    snpFT['6 Month SP%'] = (snpFT['6 Month SP'] - snpFT['Stock Price']) / snpFT['Stock Price']
    return snpFT

def article_type_classifier(snpFT):
    articleType = []
    for i in range(len(snpFT)):
        pos = snpFT['Pos'][i]
        neg = snpFT['Neg'][i]
        neutr = snpFT['Neutr'][i]
        if pos > 0.80:
            articleType.append('Positive') 
        elif neg > 0.80:
            articleType.append('Negative') 
        elif neutr > 0.80:
            articleType.append('Neutral') 
        else:
            articleType.append('Uncertain')
    snpFT['Article Type'] = articleType
    return snpFT

def average_percentage_change(snpFT):
    avg_per_change = []
    for i in range(len(snpFT)):
        onemoth = snpFT['1 Month SP%'][i]
        threemonth = snpFT['3 Month SP%'][i]
        sixmonth = snpFT['6 Month SP%'][i]
        if str(sixmonth) != 'nan':
            avg_per_change.append((onemoth)+(threemonth)+(sixmonth)/3)
        elif str(threemonth) != 'nan':
            avg_per_change.append((onemoth)+threemonth/2)
        else:
            avg_per_change.append(onemoth)
    snpFT['Average % Change'] = avg_per_change
    return snpFT

def outcome_classifier(snpFT):
    outcome = []
    for i in range(len(snpFT)):
        avg = snpFT['Average % Change'][i]
        if avg > 0.025:
            outcome.append('Positive')
        elif avg < -0.025:
            outcome.append('Negative')
        else:
            outcome.append('Neutral')
    snpFT['Outcome'] = outcome
    return snpFT

def accuracy_checker(snpFT):
    check = []
    for i in range(len(snpFT)):
        if (snpFT['Article Type'][i] == 'Uncertain') | (snpFT['Article Type'][i] == 'Neutral'): 
            check.append('Uncertain')
        elif snpFT['Article Type'][i] == snpFT['Outcome'][i]:
            check.append('Correct')
        else:
            check.append('Incorrect')
    snpFT['Check'] = check
    return snpFT

def extract_author_outcome(snpFT):
    historicalPerformance = []
    outcomes = []
    for i in range(len(snpFT)):
        authors = snpFT['Authors'][i]
        authors = authors.replace('[', '').replace(']', '').replace('"', '').replace('\'', '')
        authors = list(authors.split(','))
        check = snpFT['Check'][i]
        for author in list(authors):
            historicalPerformance.append(author.strip())
            outcomes.append(check)
    return pandas.DataFrame({'Author': historicalPerformance, 'Outcome': outcomes})

def create_author_profiles(profiles):
    correct = profiles[profiles['Outcome'] == 'Correct']
    incorrect = profiles[profiles['Outcome'] == 'Incorrect']
    correct = correct.groupby('Author').count().sort_values(by='Outcome', ascending=False)
    incorrect = incorrect.groupby('Author').count().sort_values(by='Outcome', ascending=False)
    profiles = correct.merge(incorrect, on='Author', how='outer').fillna(0).sort_values(by='Outcome_x', ascending=False).reset_index()
    profiles = profiles.rename(columns={'Outcome_x': 'Correct', 'Outcome_y': 'Incorrect'})
    profiles['Count'] = profiles['Correct'] + profiles['Incorrect']
    profiles['Correct%'] = profiles['Correct'] / profiles['Count']
    profiles = profiles[profiles['Count'] >= 10].sort_values(by='Correct%', ascending=False).head(50)
    return profiles

In [18]:
stockPrices = read_csv_data('/Users/benschlagman/Desktop/UCL Year 3/Final Year Project/SNP/SNP 500  - Sheet2.csv')
snpStocks = read_csv_data('/Users/benschlagman/Desktop/UCL Year 3/Final Year Project/SNP/SNP Companies.csv')
snpFT = read_csv_data('/Users/benschlagman/Desktop/UCL Year 3/Final Year Project/snp_nlp2.csv')
snpFT = merge_data(snpFT, snpStocks[['Name', 'Symbol']], "Stock", "Name")
snpFT = drop_column(snpFT, 'Name')
snpFT = snpFT[['Date','Link',  'Title + Subtitle',  'Stock',  'Symbol',  'Authors',  'Pos',  'Neg', 'Neutr']]
snpFT = convert_to_month_period(snpFT, 'Date')
snpStocks = convert_to_month_period(stockPrices, 'Date')
snpFT = get_prices(snpFT, stockPrices)
snpFT = article_type_classifier(snpFT)
snpFT = average_percentage_change(snpFT)
snpFT = outcome_classifier(snpFT)
snpFT = accuracy_checker(snpFT)
profiles = extract_author_outcome(snpFT)
profiles = create_author_profiles(profiles)
profiles.to_csv('profiles.csv')
snpFT.to_csv('MLDatabase.csv')

In [19]:
snpFT

Unnamed: 0,Date,Link,Title + Subtitle,Stock,Symbol,Authors,Pos,Neg,Neutr,Stock Price,1 Month SP,3 Month SP,6 Month SP,1 Month SP%,3 Month SP%,6 Month SP%,Article Type,Average % Change,Outcome,Check
0,2022-11,https://www.ft.com/content/e9098953-e031-4bc5-...,Were we wrong about big tech? Taking growth fo...,Apple Inc.,AAPL,['Robert Armstrong'],0.050251,0.058437,0.891312,155.25,145.96,,,-0.059839,,,Neutral,-0.059839,Negative,Uncertain
1,2022-11,https://www.ft.com/content/2e9cd061-99c3-4638-...,Apple warns of iPhone shipment delays in wake ...,Apple Inc.,AAPL,"['Ryan McMorrow', 'Nian Liu', 'Patrick McGee',...",0.010275,0.965538,0.024187,155.25,145.96,,,-0.059839,,,Negative,-0.059839,Negative,Correct
2,2022-11,https://www.ft.com/content/cc3b4a5a-af35-41d5-...,"China’s closed-loop crisis: ‘I’m human, not a ...",Apple Inc.,AAPL,"['Edward White', 'Qianer Liu']",0.044839,0.773122,0.182038,155.25,145.96,,,-0.059839,,,Uncertain,-0.059839,Negative,Uncertain
3,2022-10,https://www.ft.com/content/7df7443c-226b-455a-...,Workers flee Covid restrictions at China’s lar...,Apple Inc.,AAPL,"['Gloria Li', 'Ryan McMorrow']",0.029888,0.909527,0.060584,138.21,155.25,,,0.123291,,,Negative,0.123291,Positive,Incorrect
4,2022-10,https://www.ft.com/content/fa6bec83-058f-4991-...,Apple says it is facing ‘significant’ headwind...,Apple Inc.,AAPL,['Patrick McGee'],0.023468,0.954813,0.021720,138.21,155.25,,,0.123291,,,Negative,0.123291,Positive,Incorrect
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6885,2016-07,https://www.ft.com/content/b7296c10-4ac0-11e6-...,"From rock stars to roadies, all change for Wal...",Netflix Inc.,NFLX,['Nicole Bullock'],0.036862,0.029603,0.933535,95.00,91.23,98.00,124.96,-0.039684,0.031579,0.315368,Neutral,0.097018,Positive,Uncertain
6886,2016-07,https://www.ft.com/content/bf22baf6-3fa4-11e6-...,Wall Street enjoys best week since late-2015 a...,Netflix Inc.,NFLX,['Gregory Meyer'],0.885330,0.041873,0.072797,95.00,91.23,98.00,124.96,-0.039684,0.031579,0.315368,Positive,0.097018,Positive,Correct
6887,2016-06,https://www.ft.com/content/bc52a558-36de-11e6-...,The office is dead! Long live the office! Tech...,Netflix Inc.,NFLX,['Alison Maitland'],0.039384,0.179470,0.781146,101.50,95.00,97.81,117.52,-0.064039,-0.036355,0.157833,Uncertain,-0.047783,Negative,Uncertain
6888,2016-06,https://www.ft.com/content/17856f62-360f-11e6-...,Pay transparency is the last taboo in business...,Netflix Inc.,NFLX,['Andrew Hill'],0.016312,0.760250,0.223438,101.50,95.00,97.81,117.52,-0.064039,-0.036355,0.157833,Uncertain,-0.047783,Negative,Uncertain
