# Twitter Stocker
## Using the Twitter API to implement sentiment analysis on different sets of stocks

## Section 1) Getting the Tweepy API working for Python

* Import the dependencies for the rest of the project here; this section will be kept up-to-date
* Create a twitter account (out of scope) and through the developer section, get a set of `OAuth` credentials
* Instantiate the api and make some test searches
* Define some helper functions

#### 1) Import the necessary libraries 

In [24]:
import pandas as pd
import numpy as np

In [25]:
import tweepy
from tweepy import OAuthHandler, Stream, StreamListener
tweepy.__version__

'3.5.0'

In [26]:
import datetime
import pandas_datareader.data as pdr
import pytz

In [84]:
import nltk
import re

In [28]:
nltk.__version__

'3.2.4'

In [29]:
import configparser

In [30]:
config = configparser.ConfigParser()

In [31]:
config.read('config/keys.txt')

['config/keys.txt']

#### 2) Authenticate User: ideally, you should use your own login credentials

In [32]:
consumer_key = config['DEFAULT']['consumer_key']
consumer_secret = config['DEFAULT']['consumer_secret']
access_token = config['DEFAULT']['access_token']
access_secret = config['DEFAULT']['access_secret']

In [33]:
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

# plug into the matrix
api = tweepy.API(auth)

#### 3) Test the tweepy api with Microsoft

In [34]:
test_stock = 'Microsoft'
test_ticker = 'MSFT'

In [35]:
search_results = api.search(q=test_stock, count=10, lang='en')

In [36]:
for s in search_results[:3]:
    print(s.created_at)
    print(s.text)

2019-04-03 00:32:02
@SavinTheBees Microsoft came out and basically said there are other consoles that people prefer over Xbox, so they’… https://t.co/gFP3EuZ3IT
2019-04-03 00:32:01
Microsoft stops selling ebooks and will refund customers for previous purchases
2019-04-03 00:31:58
@NatashaVianna @CharlottePigg Save your allowance and buy IBM, Microsoft, Apple, Amazon, and Google stocks.


#### 4) Define helper functions:

* One to get the corpus from a tweet
* One to perform a twitter search with a string and collect 100 tweets before a given date

In [37]:
def get_corpus(status):
    """
    Given a tweepy.models.Status object, returns its text (the corpus) as a str object
    
    :param status: tweepy.models.Status
    :returns: corpus, str
    :raises TypeError: if status is not a tweepy.models.Status instance
    """
    # Guard clause first so the happy path is the last, unindented statement.
    if not isinstance(status, tweepy.models.Status):
        # The message names the type that is actually checked (tweepy.models.Status).
        raise TypeError(
            "Input not of type tweepy.models.Status, got {}".format(type(status).__name__)
        )
    return status.text

In [76]:
def collect_tweets(query, 
                   limit=1000,
                   dt=None,
                   tz='US/Eastern'):
    """
    Search twitter for `query` and return up to 100 English tweets created before `dt`.

    :param query: str, search string handed to ``api.search``
    :param limit: int, maximum number of statuses pulled from the search cursor
    :param dt: datetime.datetime or None, the cutoff. A naive value is interpreted
        in `tz`; an aware value is used as-is. None (the default) means "now",
        evaluated at call time rather than once when the function is defined
    :param tz: str, pytz timezone name used to interpret a naive `dt`
    :returns: list of tweepy.models.Status with at most 100 entries
    :raises TypeError: if `query` is not a str
    """
    if not isinstance(query, str):
        raise TypeError("query must be a str, got {}".format(type(query).__name__))

    wanted = 100

    local_tz = pytz.timezone(tz)
    if dt is None:
        cutoff = datetime.datetime.now(local_tz)
    elif dt.tzinfo is None:
        cutoff = local_tz.localize(dt)
    else:
        cutoff = dt

    valid_results = []
    # NOTE(review): rpp looks like a legacy alias of count; kept so the request is unchanged.
    for s in tweepy.Cursor(api.search, q=query, rpp=10, count=100, lang='en').items(limit):
        created = s.created_at
        if created.tzinfo is None:
            # Naive timestamps from the search API are UTC (the sample output in this
            # notebook is 4h ahead of local Eastern time), so attach UTC, not `tz`.
            created = pytz.utc.localize(created)

        if created < cutoff:
            valid_results.append(s)
            if len(valid_results) >= wanted:
                # Only the first 100 matches are returned; stop paging to spare the rate limit.
                break

    if len(valid_results) < wanted:
        print("WARN: Less than 100 results, you should be using expanded_search()")

    return valid_results[:wanted]
    

In [40]:
tweets = collect_tweets('GRANITE')

In [41]:
len(tweets)

100

In [42]:
tweets[0].text

'Man, planks have destroyed my elbows. One would think I was doing them on granite. Sigh'

## Section 2: Determining the Winners and Losers for a given day

* Define the scope of the problem. Here, I have chosen the NASDAQ index and have pulled a .csv file of the companies
* Define functions to find the winners and losers. Winners and losers are ranked by their relative change over the day, diff = (price_close - price_open) / price_open

In [43]:
datetime.datetime.now()

datetime.datetime(2019, 4, 2, 20, 32, 28, 421016)

#### 1) Read in all companies on the NASDAQ, I have pre-populated a list of the companies

In [44]:
# Forward slash works on every platform and avoids the invalid "\c" escape sequence.
ticker_df = pd.read_csv('files/companylist.csv')

In [45]:
ticker_df.columns

Index(['Symbol', 'Name', 'LastSale', 'MarketCap', 'IPOyear', 'Sector',
       'industry', 'Summary Quote', 'Unnamed: 8'],
      dtype='object')

In [55]:
ticker_df.head()

Unnamed: 0,Symbol,Name,LastSale,MarketCap,IPOyear,Sector,industry,Summary Quote,Unnamed: 8
0,YI,"111, Inc.",6.51,$530.85M,2018.0,Health Care,Medical/Nursing Services,https://www.nasdaq.com/symbol/yi,
1,PIH,"1347 Property Insurance Holdings, Inc.",5.2899,$31.81M,2014.0,Finance,Property-Casualty Insurers,https://www.nasdaq.com/symbol/pih,
2,PIHPP,"1347 Property Insurance Holdings, Inc.",24.5,$17.15M,,Finance,Property-Casualty Insurers,https://www.nasdaq.com/symbol/pihpp,
3,TURN,180 Degree Capital Corp.,1.86,$57.89M,,Finance,Finance/Investors Services,https://www.nasdaq.com/symbol/turn,
4,FLWS,"1-800 FLOWERS.COM, Inc.",18.34,$1.18B,1999.0,Consumer Services,Other Specialty Stores,https://www.nasdaq.com/symbol/flws,


In [46]:
tickers = ticker_df['Symbol']
tickers.head()

0       YI
1      PIH
2    PIHPP
3     TURN
4     FLWS
Name: Symbol, dtype: object

In [47]:
# useful datetime variable
start_dt = datetime.datetime(2019, 3, 27)
end_dt = start_dt + datetime.timedelta(1)

#### 3) define three helper functions: 

* One to compute the stock's gain or loss for a given day
* One to compile a dictionary of all (or some) of the NASDAQ stocks' gains/losses for a given day.
By default, the limit is set at 50 out of the 3500 or so NASDAQ companies. Simply set limit=None to
scan the entire NASDAQ index
* One to identify the winners and losers given the dictionary and return a winners' dictionary and a losers'
dictionary based on the differential

In [48]:
def _get_diff(df):
    assert(df.shape[0] == 1)
    return float(df['close'] - df['open']) / float(df['open'])

In [49]:
def get_all_diffs(tickers, 
                  limit=50, 
                  dt=None):
    """
    Build a {ticker: diff} dictionary for the first `limit` tickers on day `dt`.

    Each ticker's prices are fetched with ``pdr.DataReader(ticker, 'iex', dt, dt)`` and
    reduced to a single number by ``_get_diff``. Tickers that fail at either step are
    reported on stdout and left out of the result.

    :param tickers: sliceable sequence of ticker symbols (e.g. a pandas Series)
    :param limit: int or None, number of tickers to scan; None (or 0) scans all of them
    :param dt: datetime.datetime or None, the day to query. None (the default) means
        "24 hours before now", evaluated at call time rather than at definition time
    :returns: dict mapping ticker -> diff
    """
    if dt is None:
        dt = datetime.datetime.now() - datetime.timedelta(1)

    if limit and limit < len(tickers):
        _tickers = tickers[:limit]
    else:
        _tickers = tickers

    _diffs = dict()

    for ticker in _tickers:
        # Reset per ticker so a failed download cannot report the previous ticker's frame.
        _df = None
        try:
            _df = pdr.DataReader(ticker, 'iex', dt, dt)
            _diffs[ticker] = _get_diff(_df)
        except Exception as e:
            # Best effort: one bad ticker must not abort the whole scan.
            shape = None if _df is None else _df.shape
            print(ticker, shape, repr(e))

    return _diffs

In [50]:
def find_gainers_and_losers(diff_dict, n=3):
    """
    Split a {ticker: diff} dictionary into its `n` best and `n` worst performers.

    Entries whose diff is missing (None/NaN) are ignored. When fewer than 2 * n valid
    entries exist, the two returned dictionaries overlap.

    :param diff_dict: dict mapping ticker -> diff
    :param n: int, how many gainers and how many losers to return (default 3)
    :returns: (gainers, losers), two dicts mapping ticker -> diff
    :raises ValueError: if `n` is negative
    """
    if n < 0:
        raise ValueError("n must be non-negative, got {}".format(n))

    # A float Series keeps the diffs numeric (no object-dtype comparison) and the
    # tickers as the index, so no transpose / rename / set_index dance is needed.
    ranked = pd.Series(diff_dict, dtype=float).dropna()
    ranked = ranked.sort_values(ascending=False, kind='mergesort')

    # head/tail (unlike iloc[-n:]) return an empty selection for n == 0.
    winners = ranked.head(n)
    losers = ranked.tail(n)

    return winners.to_dict(), losers.to_dict()
    

In [51]:
diffs = get_all_diffs(tickers, 200, start_dt)

JFKKU (0, 5)
ADILW (0, 5)
ALACR (0, 5)
ALACU (0, 5)
ALACW (0, 5)
ALIT (1, 5)
ALGRR (0, 5)
ALGRU (0, 5)
SMCP (0, 5)
AMCIU (0, 5)
AMCIW (0, 5)


In [52]:
gainers, losers = find_gainers_and_losers(diffs)

In [53]:
gainers

{'ABEOW': 0.08098591549295775,
 'AMRS': 0.20618556701030924,
 'AMRWW': 0.8095238095238093}

In [54]:
losers

{'AGFSW': -0.1666666666666666,
 'AKAO': -0.0758823529411765,
 'AMRB': -0.0825057295645531}

## Section 3: Combining the previous sections

Now that we have achieved the basic abilities to use the tweepy API and the pandas data reader, we want to abstract
their functionality into a more structured piece of software with clear inputs and outputs and robust parameter handling

#### We need to define a function that will take in the corpus and return a DataFrame with all the desired fields

In [80]:

    
def assemble_corpus(tweet_list):
    """
    Placeholder: not implemented yet; the body is empty, so it returns None.

    TODO: per the notebook text above this cell, the intent is to take the collected
    tweets and return a DataFrame with the desired fields. Which fields is not
    specified anywhere visible here.

    :param tweet_list: presumably the list of tweets produced by the search helpers
        (currently unused)
    """
    pass
    

#### We can define the expanded_search() function here to ensure we have 100 tweets per company

In [77]:
def expanded_search(ticker, df):
    """
    Collect up to 100 tweets for a company by searching its symbol, name and sector in turn.

    The row of `df` whose 'Symbol' equals `ticker` supplies the search terms. Each term
    is passed to ``collect_tweets`` and the results are pooled until 100 are available.

    :param ticker: str, ticker symbol to look up in `df`
    :param df: pandas.DataFrame with 'Symbol', 'Name' and 'Sector' columns
    :returns: list with at most 100 tweets; shorter (possibly empty) when all terms
        together yield fewer, never None
    :raises IndexError: if `ticker` does not occur in df['Symbol']
    """
    wanted = 100

    matches = df.loc[df['Symbol'] == ticker, ['Symbol', 'Name', 'Sector']]
    if matches.empty:
        raise IndexError("ticker {!r} not found in the 'Symbol' column".format(ticker))
    alt_terms = matches.iloc[0].tolist()

    valid_results = []
    for term in alt_terms:
        if not isinstance(term, str):
            # Missing Name/Sector cells come through as NaN and cannot be searched.
            continue

        valid_results.extend(collect_tweets(term))

        if len(valid_results) >= wanted:
            break

    # Always return a list, including when the terms ran out before reaching 100.
    return valid_results[:wanted]

In [78]:
tweet_list = expanded_search('AMRB', ticker_df)
type(tweet_list)

WARN: Less than 100 results, you should be using expanded_search()
WARN: Less than 100 results, you should be using expanded_search()


list

In [79]:
len(tweet_list)

100

In [81]:
texts = [t.text for t in tweet_list]

In [83]:
x = texts[0]
x

'$0.25 EPS Expected for American River Bankshares $AMRB  https://t.co/mI1NOxJiNt'

In [167]:
class PreProcessor:
    """
    Turns a raw tweet string into a list of cleaned, lower-cased alphabetic tokens.

    ``process_text`` is the entry point. The remaining static helpers are the
    individual steps; ``_stem``, ``_remove_nonwords`` and ``_remove_stopwords`` are
    available but are not part of the default pipeline.
    """
    
    def __init__(self):
        # Result of the most recent process_text() call (None until it is first called).
        self._text = None
    
    def process_text(self, text):
        """
        Run the pipeline: strip URLs -> tokenize -> lemmatize -> lowercase -> drop non-alphabetic tokens.

        :param text: str, raw tweet text
        :returns: list of str tokens (also stored on ``self._text``)
        """
        # NOTE(review): lemmatizing happens before lowercasing; if the lemmatizer is
        # case-sensitive, capitalised words may pass through unlemmatized. Confirm the order.
        self._text = self._remove_https_tag(text)
        self._text = self._tokenize(self._text)
        self._text = self._lemmatize(self._text)
        
        self._text = self._lower(self._text)
        self._text = self._remove_symbols(self._text)
        return self._text
        
    
    @staticmethod
    def _remove_https_tag(raw):
        """Remove every http:// or https:// URL from `raw` and strip surrounding whitespace."""
        # \S+ takes the whole URL, including query strings and hyphens.
        return re.sub(r'https?://\S+', '', raw).strip()
    
    @staticmethod
    def _tokenize(raw):
        """Split `raw` into tokens with ``nltk.word_tokenize``."""
        return nltk.word_tokenize(raw)
    
    @staticmethod
    def _stem(raw_tokens):
        """Return the Porter stem of every token."""
        porter = nltk.PorterStemmer()
        return [porter.stem(t) for t in raw_tokens]
    
    @staticmethod
    def _lemmatize(raw_tokens):
        """Return the WordNet lemma of every token."""
        wnl = nltk.WordNetLemmatizer()
        return [wnl.lemmatize(t) for t in raw_tokens]
    
    @staticmethod
    def _lower(raw_tokens):
        """Lower-case every token."""
        return [t.lower() for t in raw_tokens]
    
    @staticmethod
    def _remove_nonwords(raw_tokens):
        """Keep only tokens found in the nltk 'en' word list (exact, case-sensitive match)."""
        # Load the word list once per call and use a set for O(1) membership tests.
        vocabulary = set(nltk.corpus.words.words('en'))
        return [t for t in raw_tokens if t in vocabulary]
    
    @staticmethod
    def _remove_symbols(raw_tokens):
        """Keep only purely alphabetic tokens (drops numbers, punctuation and mixed tokens)."""
        # isalpha() already rejects anything starting with a digit, so a separate
        # numeric-pattern filter would be redundant.
        return [t for t in raw_tokens if t.isalpha()]

    @staticmethod
    def _remove_stopwords(raw_tokens):
        """Drop tokens that appear in the nltk English stopword list."""
        # Load the stopword list once per call instead of once per token.
        stopwords = set(nltk.corpus.stopwords.words('english'))
        return [t for t in raw_tokens if t not in stopwords]
    

In [163]:
preprocessor = PreProcessor()

In [164]:
tokens = preprocessor.process_text(x)

In [168]:
x

'$0.25 EPS Expected for American River Bankshares $AMRB  https://t.co/mI1NOxJiNt'

True