In [1]:
import pandas as pd
import datetime
from dateutil import parser
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pytz
import requests
import json
import pprint
import os
import dateutil
import time

In [2]:
# historical data
import calendar

def next_weekday(d, weekday):
    """Return the first date on or after ``d`` that falls on ``weekday``.

    Args:
        d: a ``datetime.date``/``datetime.datetime`` (anything with ``.weekday()``).
        weekday: target weekday index as used by ``date.weekday()`` (Monday == 0).

    Returns:
        ``d`` shifted forward by 0-6 days; ``d`` itself when it already is
        the requested weekday.
    """
    gap = weekday - d.weekday()
    # A negative gap means the target weekday is already behind us this week,
    # so jump to its occurrence in the following week.
    shift = gap + 7 if gap < 0 else gap
    return d + datetime.timedelta(days=shift)

def bucketed(df, start_on="Sunday"):
    """Count tweets in consecutive 7-day buckets that start on ``start_on``.

    NOTE: ``df`` is modified in place - a ``day_of_week`` column is added and
    ``created_at`` is replaced by plain calendar dates.

    Args:
        df: frame with a ``created_at`` column parseable by ``pd.to_datetime``
            and a ``text`` column (used for counting rows per day).
        start_on: English weekday name on which the first bucket starts.

    Returns:
        A frame with a single ``count`` column, one row per 7-day bucket. The
        final bucket is always dropped because it may cover fewer than 7 days.
    """
    stamps = pd.to_datetime(df['created_at'])
    df['day_of_week'] = stamps.dt.day_name()
    df['created_at'] = stamps.dt.date

    # The first bucket begins on the first `start_on` weekday at or after the
    # earliest tweet; days before it are ignored.
    start_idx = list(calendar.day_name).index(start_on)
    first_day = next_weekday(df["created_at"].min(), start_idx)
    last_day = df["created_at"].max()

    per_day = df.groupby('created_at').agg('count')["text"]
    all_days = pd.date_range(first_day, last_day, freq='D')
    # Re-index onto every calendar day so days without tweets count as 0.
    daily = pd.DataFrame({ "count": per_day}, index=all_days).fillna(0)

    weekly = daily.resample('7D').sum()
    trailing_bucket = weekly.tail(1).index
    return weekly.drop(trailing_bucket)

In [3]:
# Account balance handed to recommendation_buy() when sizing positions.
ACCOUNT_BALANCE = 102
# Not referenced by any code visible in this notebook.
N_TWEETS = 0

# Twitter API bearer token used for the Authorization header.
# Read from the TWITTER_BEARER_TOKEN environment variable so the secret is not
# stored in the notebook; an empty string is used when the variable is unset
# (requests to the Twitter API will then be rejected as unauthorized).
# The token that used to be committed here should be revoked and re-issued.
BEARER_TOKEN = os.environ.get("TWITTER_BEARER_TOKEN", "")

## value functions
def expected_value(potential_win, chance_win, potential_loss, chance_loss):
    """Return the expected profit of a bet.

    Computed as the probability-weighted win minus the probability-weighted
    loss: ``potential_win * chance_win - potential_loss * chance_loss``.
    """
    weighted_gain = potential_win * chance_win
    weighted_loss = potential_loss * chance_loss
    return weighted_gain - weighted_loss

def allocation(account_balance, expected_value):
    """Return the amount of the balance to commit to a trade.

    The fraction committed is ``expected_value * 5 / 10``, capped at 3% of
    the balance.

    Args:
        account_balance: total balance available.
        expected_value: expected value of the trade (per share).

    Returns:
        ``account_balance`` multiplied by the (capped) fraction.
    """
    # Idea kept from the original, not implemented: scale the allocation by a
    # risk coefficient, risk_coef = 1 - (1 / (proba * 100)).
    fraction = ( expected_value * 5 ) / 10
    if fraction > .03:
        fraction = .03
    return account_balance * fraction

def recommended_shares(account_balance, expected_value, price_per_share):
    """Return how many shares the allocated amount buys at ``price_per_share``.

    The result is a (possibly fractional) share count: the amount returned by
    ``allocation`` divided by the share price.
    """
    budget = allocation(account_balance, expected_value)
    return budget / price_per_share

def to_proba(buckets, categories=None):
    """Return the relative frequency of each distinct value in ``buckets``.

    Args:
        buckets: a pandas Series of observed values.
        categories: currently unused. The original sketch intended it to hold
            (range, label) pairs such as ``(range(0, 3), "0-2")`` for grouping
            values into ranges, but no grouping is implemented.

    Returns:
        A Series indexed by distinct value whose entries sum to 1.
    """
    occurrences = buckets.value_counts()
    return occurrences / occurrences.sum()

## portfolio management
def shares_bought(c, yes_or_no, positions):
    """Return the number of shares already held for a contract side.

    Args:
        c: contract/category key into ``positions``.
        yes_or_no: side key (``"yes"`` or ``"no"``).
        positions: mapping ``{category: {side: [(price, quantity), ...]}}``.

    Returns:
        The sum of the quantities (second tuple element) of every matching
        position, or 0 when the category or side is not present.
    """
    if c not in positions or yes_or_no not in positions[c]:
        return 0
    return sum(entry[1] for entry in positions[c][yes_or_no])

def recommendation_buy(contract, yes_or_no, account_balance, expected_value, price_per_share, positions):
    """Print a BUY recommendation when more shares should be held than are held.

    The target share count comes from ``recommended_shares``; shares already
    held for this contract side (per ``shares_bought``) are subtracted and the
    remainder is rounded to a whole number. Nothing is printed when that
    remainder is not positive.
    """
    target = recommended_shares(account_balance, expected_value, price_per_share)
    already_held = shares_bought(contract, yes_or_no, positions)
    shares = int(round(target - already_held))
    if shares <= 0:
        return
    message = "BUY {yn} shares for contract {n}: {shares} shares @{price} (EV: {ev}, TOTAL: {t})"
    print(message.format(n=contract,shares=shares, price=price_per_share, ev=expected_value, yn=yes_or_no.upper(), t=shares*price_per_share))

def recommendation_sell(contract, yes_or_no, expected_value, price_per_share, n_shares, bought_at):
    """Print a SELL recommendation for one held position.

    Args:
        contract: contract/category label.
        yes_or_no: side of the position (``"yes"`` or ``"no"``).
        expected_value: expected value of selling, as computed by the caller.
        price_per_share: price at which the shares would be sold.
        n_shares: number of shares in the position.
        bought_at: price the position was bought at.
    """
    template = "SELL {yn} shares for contract {n}_{bought_at}_{n_shares}: ALL shares @{price} (EV: {ev}, TOTAL: {t})"
    proceeds = n_shares*price_per_share
    print(template.format(
        n=contract,
        price=price_per_share,
        ev=expected_value,
        yn=yes_or_no.upper(),
        t=proceeds,
        bought_at=bought_at,
        n_shares=n_shares,
    ))

## market evaluation
def fetch_market_data(market_id, timeout=10):
    """Fetch the current market data for one PredictIt market.

    Args:
        market_id: numeric PredictIt market id.
        timeout: seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled server.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if no response arrives within ``timeout`` seconds.
    """
    url = "https://www.predictit.org/api/marketdata/markets/{id}".format(id=market_id)
    r = requests.get(url=url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to decode an error page as JSON.
    r.raise_for_status()
    return r.json()

In [4]:
def get_twitter_user_timeline(screen_name, max_id=None, since_id=None, timeout=10):
    """Fetch one page (up to 200 tweets) of a user's timeline from the Twitter API.

    Args:
        screen_name: Twitter handle whose timeline is requested.
        max_id: if given, sent as the ``max_id`` request parameter.
        since_id: if given, sent as the ``since_id`` request parameter.
        timeout: seconds to wait for the server before giving up.

    Returns:
        A DataFrame built from the ``id``, ``created_at`` and ``text`` fields
        of each returned tweet. When the API returns no tweets the frame is
        empty and has no columns.

    Raises:
        requests.HTTPError: if the API answers with an error status (for
            example an invalid token); previously the error payload reached
            the list comprehension and failed with an unrelated TypeError.
        requests.Timeout: if no response arrives within ``timeout`` seconds.
    """
    import io  # local import: only needed to wrap the JSON text for pandas

    url = "https://api.twitter.com/1.1/statuses/user_timeline.json"
    headers = { "Authorization": "Bearer {t}".format(t=BEARER_TOKEN)}
    params = {
        "count": "200",
        "trim_user": "true",
        "screen_name": screen_name
    }
    if max_id:
        params["max_id"] = max_id
    if since_id:
        params["since_id"] = since_id

    r = requests.get(url=url, headers=headers, params=params, timeout=timeout)
    r.raise_for_status()
    raw = r.json()
    # Round-trip through JSON so pandas applies its read_json type conversion
    # to the selected fields, exactly as before.
    transformed = json.dumps([ { "id": tweet["id"], "created_at": tweet["created_at"], "text": tweet["text"] } for tweet in raw])
    # Passing a literal JSON string to read_json is deprecated in recent
    # pandas; a file-like wrapper works on every version.
    return pd.read_json(io.StringIO(transformed), orient="records")

def get_recent_tweets(screen_name, from_date=None):
    """Return the latest page of a user's tweets with US/Eastern timestamps.

    Args:
        screen_name: Twitter handle to query.
        from_date: optional timezone-aware datetime; when given, only tweets
            created strictly after it are kept.

    Returns:
        The timeline frame with ``created_at`` converted to US/Eastern. If the
        timeline came back without a ``created_at`` column (no tweets
        returned) the frame is returned unchanged instead of raising KeyError.
    """
    df = get_twitter_user_timeline(screen_name)
    if "created_at" not in df.columns:
        # Nothing to convert or filter.
        return df

    created = df["created_at"]
    # Timestamps are treated as UTC. Only attach the zone when pandas parsed
    # them as naive values; tz_localize raises on already-aware data.
    if created.dt.tz is None:
        created = created.dt.tz_localize('UTC')
    df["created_at"] = created.dt.tz_convert('US/Eastern')

    if from_date:
        df = df[df["created_at"] > from_date]
    return df
    
# the twitter api returns different results for the same request...
def _get_twitter_history(screen_name, max_id=None):
    """Page backwards through a user's timeline until the API returns nothing.

    Each page is requested with ``max_id`` set to one less than the id in the
    last row of the previous page. The size of every page is printed as a
    progress indicator.

    Args:
        screen_name: Twitter handle to query.
        max_id: optional id to start paging from.

    Returns:
        All fetched pages concatenated in fetch order; an empty frame with
        ``id``, ``created_at`` and ``text`` columns when nothing was fetched.
    """
    history = pd.DataFrame(columns=["id","created_at", "text"])
    while True:
        page = get_twitter_user_timeline(screen_name, max_id)
        page_size = len(page.index)
        print(page_size)
        if page_size == 0:
            break
        if history.empty:
            history = page
        else:
            history = pd.concat([history, page], axis=0)
        # Continue just below the last tweet of this page.
        max_id = page.iloc[-1]["id"] - 1
    return history

def get_twitter_history(screen_name, cache=True):
    """Download a user's tweet history and append it to ``data/tweets/<name>.csv``.

    Args:
        screen_name: Twitter handle; also used as the CSV file name.
        cache: when True and the CSV already exists, paging resumes below the
            id found in the file's last row instead of starting from the
            newest tweet.

    Returns:
        None. The fetched tweets are appended to the CSV file.

    Note:
        The file is opened in append mode even when ``cache`` is False, so a
        full re-fetch into an existing file adds duplicate rows.
    """
    out_dir = "data/tweets"
    fname = "data/tweets/{sn}.csv".format(sn=screen_name)
    file_exists = os.path.isfile(fname)

    max_id = None
    if cache and file_exists:
        cached = pd.read_csv(fname)
        # A header-only file has no last row to resume from.
        if len(cached) > 0:
            max_id = int(cached.tail(1).iloc[0]["id"]) - 1

    df = _get_twitter_history(screen_name, max_id)

    # makedirs also creates the missing parent "data" directory, which
    # os.mkdir("data/tweets") could not.
    os.makedirs(out_dir, exist_ok=True)
    if len(df) > 0:
        # Only write the header for a new file; repeating it on every append
        # inserted header lines in the middle of the data.
        df.to_csv(fname, mode='a', header=not file_exists)

def fetch_full_trump_tweet_history(rnge, cache=True):
    """Download yearly tweet archives and save them as one CSV.

    For every year in ``rnge`` a JSON archive is read (2019 from
    trumptwitterarchive.com, other years from the CloudFront mirror), pausing
    one second between downloads. The combined frame overwrites
    ``data/tweets/@realDonaldTrump.csv``.

    Args:
        rnge: iterable of years to download.
        cache: accepted for interface compatibility; not used.

    Returns:
        None. When ``rnge`` yields no years nothing is downloaded or written
        (previously this case crashed on ``len(None)``).
    """
    fname = "data/tweets/@realDonaldTrump.csv"
    frames = []
    for year in rnge:
        if year == 2019:
            url = "http://www.trumptwitterarchive.com/data/realdonaldtrump/2019.json"
        else:
            url = "http://d5nxcu7vtzvay.cloudfront.net/data/realdonaldtrump/{y}.json".format(y=str(year))
        frames.append(pd.read_json(url))
        time.sleep(1)

    if not frames:
        return

    # Concatenate once instead of growing the frame inside the loop.
    df = pd.concat(frames)

    # makedirs also creates the missing parent "data" directory.
    os.makedirs("data/tweets", exist_ok=True)
    if len(df) > 0:
        df.to_csv(fname, mode='w')

#"homieng6@gmail.com"
#"??"
#"@homiesaccount"
#"nY7VUVqcxJ4vmcX"
#"AAAAAAAAAAAAAAAAAAAAAAXT9gAAAAAAoITLBCf%2B2K7BMSqakqcbsHUSLrk%3DLz95o8CkkhjOTthpcyEEg6BdNav0zphRcrEYdeG4GXXV3Qkft"

In [5]:
def plot_tweet_distributions_per_day(source_df):
    """Plot how each bucket's share of tweets changes with the bucket start day.

    For every weekday (Monday..Sunday, indices 0-6) the tweets are re-bucketed
    with ``bucketed(..., start_on=<weekday>)`` and each bucket's count is
    divided by the total count. One line is then drawn per distinct
    ``n_tweets`` value, with the weekday index on the x axis and the share on
    the y axis.

    Args:
        source_df: tweet frame accepted by ``bucketed``. It is passed to
            ``bucketed`` seven times, which modifies it in place.
    """
    df = pd.DataFrame(columns=["proba","day"])
    df.index.name = "n_tweets"
    for x in range(0,7,1):
        weekday = calendar.day_name[x]
        b = bucketed(source_df, start_on=weekday)
        # share of all counted tweets that falls into each bucket
        proba = b['count']/b['count'].sum()
        _df = pd.DataFrame({ "proba": proba.values, "day": x }, index=proba.index)
        df = pd.concat([df, _df])
        # NOTE(review): this copies the index returned by bucketed() - which
        # looks like the bucket start dates, not tweet counts - into
        # "n_tweets"; confirm that is the intended grouping key.
        df["n_tweets"] = df.index

    fig, ax = plt.subplots()
    # one line per distinct n_tweets value, all drawn on the same axes
    for key, _grp in df.groupby(['n_tweets']):
        grp = _grp.sort_values(by="day", ascending=False)
        ax = grp.plot(ax=ax, kind='line', x="day", y='proba', label=str(grp["n_tweets"].iloc[0]))

    plt.legend(loc='best')
    plt.show()
    
#_df = pd.read_csv('./data/fake_news_tweets.csv')
#plot_tweet_distributions_per_day(_df)

In [6]:
def show_twitter_market_research(csv_path):
    """Show three exploratory plots for a tweet history CSV.

    1. Tweets per week (line plot of the 7-day buckets).
    2. Histogram of the weekly counts, with one bin per distinct count.
    3. Bar chart of the number of tweets per weekday, Monday to Sunday.

    Args:
        csv_path: path of a CSV with the columns required by ``bucketed``.
    """
    tweets = pd.read_csv(csv_path)

    # 1) number of tweets per week
    weekly = bucketed(tweets)
    weekly.plot(title="Tweets per Week")
    plt.show()

    # 2) distribution of tweets per week
    n_bins = weekly["count"].value_counts().size
    weekly["count"].plot(kind="hist",bins=n_bins, title="Tweets per Week Distribution")
    plt.show()

    # 3) frequency of tweets per weekday
    tweets['day_of_week'] = pd.to_datetime(tweets['created_at']).dt.day_name()
    ordered_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    per_weekday = tweets['day_of_week'].value_counts().reindex(ordered_days)
    per_weekday.plot(kind='bar', title="Tweets per Calendar Day")
    plt.show()

In [7]:
def to_range_str(range):
    """Format a ``range`` as an inclusive ``"start-last"`` label.

    Example: ``range(0, 25)`` becomes ``"0-24"``.
    """
    first = range.start
    last = range.stop-1
    return "{}-{}".format(first, last)

def append_count(series, count, category_range):
    """Return ``series`` extended by one entry for ``category_range``.

    Args:
        series: existing Series of counts indexed by range label.
        count: value to add.
        category_range: ``range`` whose ``"start-last"`` label (see
            ``to_range_str``) becomes the index of the new entry.

    Returns:
        A new Series; ``series`` itself is not modified.
    """
    entry = pd.Series([ count ], index=[ to_range_str(category_range) ])
    # Series.append was removed in pandas 2.0; pd.concat is its documented
    # replacement and is available in older versions as well.
    return pd.concat([series, entry])

# Takes a dataframe with tweet counts bucketed per n days (a "count" column).
# Returns a series that gives, for each category, the number of buckets whose count
# would land the final total in that category once the curr_n tweets already seen are added.
# This answers: what is the probability that we end in a category, given that we have already seen curr_n tweets?
def count_adjusted(df, categories, curr_n):
    """Count historical buckets per category, given ``curr_n`` tweets so far.

    Every category range is shifted down by ``curr_n`` (and clipped at 0); a
    historical bucket is counted for the category when its ``count`` lies in
    the shifted range. A category whose upper end is already exceeded gets 0.

    Args:
        df: frame with a ``count`` column, one row per historical bucket.
        categories: iterable of ``range`` objects (``stop`` exclusive).
        curr_n: number of tweets already observed in the current period.

    Returns:
        A float Series indexed by the ``"start-last"`` label of each category,
        in the order the categories were given.
    """
    labels = []
    counts = []
    for rnge in categories:
        lowest = max(rnge.start-curr_n, 0)
        highest = max(rnge.stop-curr_n, 0) - 1  # inclusive upper bound
        # When the category is no longer reachable, highest is -1 and the
        # interval is empty, so nothing matches.
        in_range = df["count"].between(lowest, highest)
        labels.append(to_range_str(rnge))
        counts.append(int(in_range.sum()))
    # Build the result in one step: Series.append was removed in pandas 2.0,
    # and an explicit dtype avoids the dtype-less empty Series the loop used
    # to start from.
    return pd.Series(counts, index=labels, dtype="float64")

In [8]:
def eval_twitter_market(market, path, positions, show_market_research=False):
    """Evaluate one tweet-count market and print trade recommendations.

    Steps, in order:
      1. Fetch the market's contracts and derive the counting window as the
         7 days ending at the first contract's ``dateEnd``.
      2. Count the tweets already posted in that window (optionally only those
         whose text matches ``market["filter"]``).
      3. From the historical CSV at ``path``, keep only tweets posted on the
         weekdays still remaining, bucket them, and turn the bucket counts
         into a probability per contract category (see ``count_adjusted``).
      4. For each contract, print a BUY/SELL recommendation for every trade
         variation with positive expected value.
      5. Print the profit/loss of the current positions for each possible
         outcome (see ``outcomes``).

    Args:
        market: dict with ``id``, ``twitter_handle``, ``contract_map`` (list
            of ``(contract_id_str, range)`` pairs) and optionally ``filter``.
        path: path to the CSV holding the account's historical tweets.
        positions: mapping ``{category_label: {"yes"/"no": [(price, qty), ...]}}``.
        show_market_research: when True, show the exploratory plots first.

    Returns:
        None; all results are printed.
    """
    if show_market_research:
        show_twitter_market_research(path)
        
    data = fetch_market_data(market["id"])
    contracts = data["contracts"]
    print(data["shortName"])
    
    # The counting window is taken to be the 7 days before the first contract's end date.
    end_date_str = contracts[0]["dateEnd"]
    end_date = parser.parse(end_date_str)
    start_date = end_date - datetime.timedelta(days=7)
    n_days = days_left(end_date)
    print("Days left:", n_days)
    
    # Tweets already posted since the window opened (window start interpreted as US/Eastern).
    timezone = pytz.timezone("US/Eastern")
    from_date = timezone.localize(start_date)
    recent = get_recent_tweets(market["twitter_handle"], from_date=from_date)
    if ( "filter" in market.keys() ):
        # the filter is applied as a case-insensitive pattern on the tweet text
        recent = recent[recent["text"].str.contains(market["filter"],case=False)]
        #n_matching_tweets = len(recent[recent["text"].str.contains("fake news|fakenews",case=False)])
    n_matching_tweets = len(recent)
    print("Matching tweets:", n_matching_tweets)

    df = pd.read_csv(path)
    
    df['day_of_week'] = pd.to_datetime(df['created_at']).dt.day_name()
    weekdays = calendar.day_name
    timezone = pytz.timezone("US/Eastern")
    # NOTE(review): localize() labels the machine's local wall-clock time as
    # US/Eastern without converting it - presumably the notebook is run on a
    # machine in that timezone; confirm.
    from_date = timezone.localize(datetime.datetime.now())
    # Two consecutive weeks of weekday names, so a slice starting today can wrap past Sunday.
    circular_weekdays = np.tile(weekdays, 2)
    idx = np.where(circular_weekdays == from_date.strftime("%A"))[0][0]
    weekdays_left = circular_weekdays[idx:idx+n_days]

    contract_map = market["contract_map"]
    
    # Only history from the weekdays still to come is used to estimate the remaining tweets.
    df = df[df["day_of_week"].isin(weekdays_left)]   
    b=bucketed(df, start_on=weekdays[idx])
    c=count_adjusted(b, [x[1] for x in contract_map], n_matching_tweets )
    proba = c/c.sum()
    print("Category probabilities:")
    pprint.pprint(proba)
    
    # From here on `c` is reused as the loop variable for a contract dict.
    for c in contracts: 
        #print("Contract", c["name"])
        # label of the range mapped to this contract id; IndexError if the id is not in contract_map
        category = [to_range_str(x[1]) for x in contract_map if x[0] == str(c["id"])][0]
        expected_values = eval_trade_variations(c, proba, category, positions)
        #print(json.dumps(expected_values, indent=4))
        for k, v in expected_values.items():
            ev = v[0]
            price = v[1]
            if ev > 0:
                yes_or_no = "yes" if "yes" in k else "no"
                if "buy" in k:
                    recommendation_buy(category, yes_or_no, ACCOUNT_BALANCE, ev, price, positions)
                else:
                    # is a sell
                    # sell keys look like "sell_<side>_<strike price>_<quantity>"
                    p = k.split('_')
                    bought_at = float(p[2])
                    quantity = float(p[3])
                    recommendation_sell(category, yes_or_no, ev, price, quantity, bought_at)
                    
    outcomes(positions, [to_range_str(c[1]) for c in market["contract_map"]])

def eval_trade_variations(contract, proba, category, positions):
    """Compute the expected value of every trade available on one contract.

    Args:
        contract: contract dict with ``bestBuyYesCost``, ``bestBuyNoCost``,
            ``bestSellYesCost`` and ``bestSellNoCost`` prices (falsy when no
            offer exists).
        proba: mapping from category label to the probability that the
            contract resolves "yes".
        category: label of this contract's category.
        positions: mapping ``{category: {"yes"/"no": [(price, qty), ...]}}``
            of currently held positions.

    Returns:
        Dict mapping a trade key to ``(expected_value, price)``. Keys are
        ``"buy_yes"``, ``"buy_no"`` and, per held position,
        ``"sell_<side>_<strike price>_<quantity>"``. The EV of a sell is the
        realised profit minus the expected value of continuing to hold.

    Side effects:
        Prints a notice when buying both sides costs less than 1 in total.
    """
    proba_yes = proba[category]

    buy_yes = contract["bestBuyYesCost"]
    buy_no = contract["bestBuyNoCost"]
    best_bid = {
        "yes": contract["bestSellYesCost"],
        "no": contract["bestSellNoCost"],
    }

    if buy_yes and buy_no and 1 - buy_yes - buy_no > 0:
        print("Arbitrage opportunity BUY:", category, "contract, ", buy_yes, buy_no)

    d = {}

    if buy_yes:
        d["buy_yes"] = (expected_value(1-buy_yes, proba_yes, buy_yes, 1-proba_yes), buy_yes)

    if buy_no:
        d["buy_no"] = (expected_value(1-buy_no, 1-proba_yes, buy_no, proba_yes), buy_no)

    held = positions[category] if category in positions else {}

    # (side, key prefix, chance the side pays out, chance it does not)
    sides = (
        ("yes", "sell_yes_", proba_yes, 1-proba_yes),
        ("no", "sell_no_", 1-proba_yes, proba_yes),
    )
    for side, prefix, chance_win, chance_loss in sides:
        if side not in held:
            continue
        # If there are no buyers on the market, evaluate a sell at 99 cents so
        # we can still decide whether listing the shares is worthwhile.
        sell_price = best_bid[side] or .99
        for pos in held[side]:
            strike_price = pos[0]
            quantity = pos[1]
            hold_ev = expected_value(1-strike_price, chance_win, strike_price, chance_loss)
            ev = (sell_price - strike_price) - hold_ev
            d[prefix+str(strike_price)+"_"+str(quantity)] = (ev, sell_price)
    return d

def days_left(end_date):
    """Return the whole number of days remaining until ``end_date`` (at least 1).

    Args:
        end_date: ``datetime.datetime`` at which the 7-day period ends. Both
            naive and timezone-aware values are accepted; "now" is taken in
            the same timezone (local time for naive values). Previously a
            timezone-aware value raised TypeError.

    Returns:
        The remaining time rounded to the nearest day, never less than 1.
    """
    # datetime.now(None) is the naive local time, so naive inputs behave as before.
    now = datetime.datetime.now(end_date.tzinfo)
    start_date = end_date - datetime.timedelta(days=7)
    elapsed_hours = (now - start_date).total_seconds()/3600
    remaining_days = ((7*24) - elapsed_hours)/24
    return max(round(remaining_days),1)

def outcomes(positions, categories):
    """Print the total profit/loss of all positions for each possible outcome.

    For every category ``c`` it is assumed that ``c`` is the winning category:
    "yes" shares in ``c`` and "no" shares in any other category pay out
    ``(1 - price) * quantity``; all other shares lose ``price * quantity``.

    Args:
        positions: mapping ``{category: {"yes"/"no": [(price, qty), ...]}}``.
        categories: iterable of category labels to evaluate.
    """
    for c in categories:
        total = 0
        for pp in positions:
            held = positions[pp]
            is_winner = (pp == c)
            if "yes" in held:
                for x in held["yes"]:
                    if is_winner:
                        total += (1 - x[0])*x[1]
                    else:
                        total -= x[0]*x[1]
            if "no" in held:
                for x in held["no"]:
                    if is_winner:
                        total -= x[0]*x[1]
                    else:
                        total += (1-x[0])*x[1]
        print(c, total)

In [9]:
# TODO: scale EV by risk for the final quantity recommendations (to reduce volatility)
# TODO: take the expected tweets per day of week into account, given that some people don't tweet much on weekends
# TODO: graph of tweet density per time of day, per day

In [10]:
# Markets to evaluate. Each entry is a dict with:
#   "id":             market id passed to fetch_market_data()
#   "twitter_handle": account whose tweets are counted; also names the history CSV
#   "contract_map":   list of (contract id as string, range of tweet counts) pairs;
#                     the range's stop is exclusive, so range(0, 25) is the "0-24" contract
#   "positions":      currently held shares, keyed by the range label produced by
#                     to_range_str(); each side ("yes"/"no") lists (price, quantity) tuples
# eval_twitter_market() also reads an optional "filter" key; none of the entries below set it.
markets = [
    { 
        "id": 5410, 
        "twitter_handle": "@vp", 
        "contract_map": [ ("15001", range(0, 25)), ("15004",range(25, 30)), ("15006", range(30, 35)), ("15002", range(35, 40)), ("15007",range(40, 45)), ("15005", range(45, 50)), ("15003",range(50, 100))],
        "positions":{ 
            "0-24": {
                "yes": [(.05, 22), (.02,30)]
            },
            "25-29": {
                "no": [(.88, 1)]
            },
            "30-34": {
                "no": [(.83, 1), (.80, 3)]
            },
            "35-39": {
                "no": [(.82, 2), (.80,2)]
            },
            "40-44": {
                "no": [(.81, 4)]
            },
            "45-49": {
                "yes": [(.23,12), (.22,2), (.21,1), (.17,3), (.16,1)]
            },
            "50-99": {
                "yes": [(.22, 14),(.20, 1), (.23,5), (.22,40)]
            }
        }
    },
    { 
        "id": 5407, 
        "twitter_handle": "@whitehouse", 
        "contract_map": [ ("14983", range(0, 80)), ("14985",range(80, 85)), ("14984", range(85, 90)), ("14986", range(90, 95)), ("14987",range(95, 100)), ("14988", range(100, 105)), ("14989",range(105, 300))],
        "positions": { 
            "0-79": {
                "yes": [(.16, 19),(.1,12), (.06,20), (.05,10), (.04,16)]
            },
            "80-84": {
                "yes": [(.07, 25), (.08,13), (.07, 6), (.03,20)]
            },
            "85-89": {
                "yes": [(.09, 15), (.08, 23)]
            },
            "90-94": {
                "yes": [(.10, 18)]
            },
            "95-99": {
                "yes": [(.10, 20), (.08, 18), (.07,6)]
            },
            #"100-104": {
            #    "no": [(.86, 4)]
            #},
            "105-299": {
                "no": [(.74, 4), (.6,1), (.41, 2), (.39,1),(.24,4), (.13, 12)]
            }
        }
    },
    {
        "id": 5404, 
        "twitter_handle": "@realDonaldTrump", 
        "contract_map": [ ("14968", range(0, 60)), ("14963",range(60, 65)), ("14967", range(65, 70)), ("14965", range(70, 75)), ("14964",range(75, 80)), ("14966", range(80, 85)), ("14962",range(85, 200))],
        "positions": {
            #"0-59": {
            #    "yes": [(.12, 11)]
            #},
            #"60-64": {
            #    "yes": [(.11, 12)]
            #},
            #"70-74": {
            #    "yes": [(.04, 20)]
            #},
            #"80-84": {
            #    "no": [(.69, 4)]
            #},
            "85-199": {
                "no": [(.74, 4),(.64,1),(.54,1), (.34, 3)]
            }
        }
    },
    { 
        "id": 5411, 
        "twitter_handle": "@potus", 
        "contract_map": [ ("15008", range(0, 35)), ("15010",range(35, 40)), ("15011", range(40, 45)), ("15012", range(45, 50)), ("15013",range(50, 55)), ("15009", range(55, 60)), ("15014",range(60, 200))],
        "positions": {
            #"40-44": {
            #    "no": [(.94, 5)]
            #},
            #"45-49": {
            #    "no": [(.85, 3),(.86,1)]
            #},
            "50-54": {
                "no": [(.86, 3), (.83,1)]
            },
            "55-59": {
                "no": [(.86, 4)]
            },
            "60-199": {
                "yes": [(.56, 2), (.53,3), (.50,1),(.41,1)]
            }
        }
    }
]

def eval_markets(show_market_research=False):
    """Evaluate every market in ``markets`` and print a separator after each.

    Args:
        show_market_research: forwarded to ``eval_twitter_market``; when True
            the exploratory plots are shown for each market.
    """
    for market in markets:
        # The tweet history of each account lives in data/tweets/<handle>.csv.
        history_csv = "data/tweets/{handle}.csv".format(handle=market["twitter_handle"])
        held_positions = market["positions"]
        eval_twitter_market(market, history_csv, held_positions, show_market_research)
        print("----------------------------------------\n\n")


In [11]:
# Evaluate all configured markets without the exploratory plots; the printed report follows.
eval_markets(show_market_research=False)

@vp tweets noon 4/5 - noon 4/12?
Days left: 3
Matching tweets: 16
Category probabilities:
0-24     0.057143
25-29    0.100000
30-34    0.128571
35-39    0.171429
40-44    0.100000
45-49    0.285714
50-99    0.157143
dtype: float64
BUY YES shares for contract 45-49: 3 shares @0.14 (EV: 0.14571428571428569, TOTAL: 0.42000000000000004)
0-24 35.400000000000006
25-29 -17.6
30-34 -20.6
35-39 -20.6
40-44 -20.6
45-49 2.4000000000000004
50-99 43.400000000000006
----------------------------------------


@whitehouse tweets 4/4 - 4/11?
Days left: 2
Matching tweets: 82
Category probabilities:
0-79       0.000000
80-84      0.000000
85-89      0.000000
90-94      0.069767
95-99      0.162791
100-104    0.186047
105-299    0.581395
dtype: float64
BUY YES shares for contract 100-104: 5 shares @0.17 (EV: 0.016046511627906962, TOTAL: 0.8500000000000001)
BUY YES shares for contract 95-99: 7 shares @0.06 (EV: 0.1027906976744186, TOTAL: 0.42)
SELL YES shares for contract 0-79_0.16_19.0: ALL shares @0.99 (