# Big Example

Now you're going to use all your new skills to try and predict the stock market.

Note that this is highly dubious due to the lack of data. (Note to self, improve this!)

But anyway, it's worth going through the motions, because later on we've got a proper example.

Below I'm going to prepare the data for you.

In [None]:
from pandas_datareader import data
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn import metrics
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np

In [None]:
# Load the news items. The file has no header row; its first column becomes the index.
news = pd.read_csv("data/news.csv", index_col=0, header=None)
# The single remaining column holds the news text.
news.columns = ["text"]
# Parse the index into timestamps so it can be matched against dated price rows.
news.index = pd.to_datetime(news.index)
# Earliest and latest news timestamps; used further down as the price-download window.
start_date, end_date = (min(news.index), max(news.index))

In [None]:
# Ticker symbol: used both to filter the news text and to request prices.
stock = "AAPL"
# Data-source name handed to pandas_datareader's DataReader.
source = "yahoo"

In [None]:
# Keep only the news rows whose text mentions the ticker.
# regex=False: the ticker is matched as a literal substring, so symbols that
#   contain regex metacharacters (e.g. "BRK.B") behave as expected.
# na=False: rows with missing text are excluded instead of producing a NaN
#   mask, which cannot be used for boolean indexing.
apple_news = news[news["text"].str.contains(stock, regex=False, na=False)]

In [None]:
# NBVAL_SKIP
# IF YOU GET AN ERROR HERE, JUST RUN IT AGAIN. RATE LIMITING.
# Request data for `stock` from `source`, bounded by the dates of the first and
# last news items (the time-of-day part of each timestamp is dropped).
apple_stock = data.DataReader(stock, source, start_date.date(), end_date.date())

In [None]:
# NBVAL_SKIP
c = apple_stock['Close']
o = apple_stock['Open']
# Open-to-close move expressed as a fraction of the open (0.01 == 1%).
# Sign convention: positive when the close is BELOW the open (for a positive open).
pl_ratio = (o - c)/o

In [None]:
# NBVAL_SKIP
# Plot the daily open-to-close ratio and mark every news item on the zero line.
plt.figure(figsize=(15, 7))
pl_ratio.plot(label=stock)
# One green "x" per news item, all drawn at y=0 so they sit on the ratio's baseline.
plt.scatter(apple_news.index, [0] * len(apple_news.index), marker="x", color="g", label="News")
# NOTE(review): pl_ratio is a fraction, not a percentage, although the label says "(%)".
plt.ylabel("Open-Close Profit/Loss (%)")
plt.legend()
plt.show()

In [None]:
# NBVAL_SKIP
# Boolean label per price row: True when the open-to-close move, in either
# direction, is larger than 0.01 of the open.
target = abs(pl_ratio) > 0.01

In [None]:
# NBVAL_SKIP
# Pair each news text with the label of the price row for the same calendar date.
X = []
y = []
for d, t in zip(apple_news.index.date, apple_news["text"]):
    try: 
        # Look the label up by the date rendered as a string.
        val = target[str(d)]
        y.append(val)
        X.append(t)
    except KeyError:
        # No price row for this date (presumably a non-trading day): drop the news item.
        continue

# Wrap the texts in a Series so the `.str` accessor is available to the cleaning functions.
X = pd.Series(X)

## Your turn

So that's the data prepared. Now it's your turn.

In [None]:
# NBVAL_SKIP
def clean(X):
    '''
    Normalise a Series of raw text before it is vectorised.

    Steps, in order:
      1. lower-case everything;
      2. collapse doubled backslashes;
      3. turn quotes, underscores, hyphens and newline / tab / non-breaking
         space markers (real characters or their escaped spelling) into
         single spaces;
      4. decode the remaining escape sequences and drop every non-ASCII
         character;
      5. expand common English contractions;
      6. replace swear-word obfuscations, emoticons, e-mail addresses,
         @-mentions and URLs with the placeholder tokens _SW, _BS, _S, _BF,
         _F, _EM, _AT and _U.

    X is a pandas Series of strings; a new Series of cleaned strings is
    returned and the input is not modified.

    Note: step 4 decodes with "unicode_escape", so a malformed escape
    sequence in the text (for example a trailing backslash) raises
    UnicodeDecodeError.
    '''
    def swap(series, old, new):
        # Plain substring replacement.
        return series.str.replace(old, new, regex=False)

    def sub(series, pattern, repl):
        # Regular-expression replacement. The flag is spelled out because
        # pandas >= 2.0 treats the pattern as a literal string by default,
        # which would silently disable every rule below.
        return series.str.replace(pattern, repl, regex=True)

    # Lowercase everything
    X = X.str.lower()

    # Get rid of those duplicate backslashes
    X = sub(X, r'\\\\', r'\\')

    # Turn separators and whitespace markers into spaces
    for char in ('"', '_', '-'):
        X = swap(X, char, ' ')
    for pattern in (
        r'\n', r'\\n',       # newline: the real character, then its escaped text
        r'\t', r'\\t',       # tab: the real character, then its escaped text
        r"\\xa0", r"\\xc2",  # escaped non-breaking-space bytes
    ):
        X = sub(X, pattern, ' ')
    X = sub(X, ' +', ' ')    # collapse runs of spaces

    # Ditch all other unicode
    X = X.str.decode("unicode_escape").str.encode('ascii', 'ignore').str.decode("utf-8")

    # Expand contractions
    for old, new in (
        ("won't", "will not"),
        ("can't", "can not"),
        ("don't", "do not"),
        ("i'm", "i am"),
    ):
        X = swap(X, old, new)
    # Only the stand-alone word "im"; a plain substring match would turn
    # "important" into "i amportant".
    X = sub(X, r" im\b", " i am")
    for old, new in (
        ("ain't", "is not"),
        ("'ll", " will"),
        ("'t", " not"),
        ("'ve", " have"),
        ("'re", " are"),
        ("'d", " would"),
    ):
        X = swap(X, old, new)

    # Create tokens of interest.
    # Hyphens were already turned into spaces above, so the optional "-" nose
    # in the emoticon patterns never matches at this point.
    X = sub(X, r"([#%&\*\$]{2,})(\w*)", r"_SW")  # Swearword obfuscations
    X = sub(X, r" [8x;:=]-?(?:\)|\}|\]|>){2,}", " _BS")  # Big smileys
    X = sub(X, r" (?:[;:=]-?[\)\}\]d>])|(?:<3)", " _S")  # Smileys
    # Big sad faces go before single sad faces, otherwise the single-face rule
    # eats the first mouth character and the big one can never match.
    X = sub(X, r" [x:=]-?(?:\(|\[|\||\\|/|\{|<){2,}", " _BF")  # Big sad faces
    X = sub(X, r" [x:=]-?[\(\[\|\\/\{<]", " _F")  # Sad faces
    # E-mail goes before the @-mention rule, otherwise "bob@example.com"
    # becomes "bob_AT.com" and is never recognised as an address.
    X = sub(X, r"[\w\-][\w\-\.]+@[\w\-][\w\-\.]+[a-zA-Z]{1,4}", r"_EM")  # Email
    X = sub(X, r"(@[a-z]+)", r"_AT")  # Directed at someone
    X = sub(X, r"\w+:\/\/\S+", r"_U")  # URL

    return X
# Normalise the raw texts first; the later passes use lower-case patterns only.
X = clean(X)

def stemming(X):
    '''
    Apply a crude, rule-based suffix stripper to a Series of lower-case text.

    Each rule fires on a suffix that is followed by a space or the end of the
    string. The suffix rules are deliberately naive (for example "this"
    becomes "thi "). The two pronoun rules are anchored on a word boundary so
    that words merely ending in "our" (such as "hour" or "four") are left
    alone.

    X is a pandas Series of strings; a new Series is returned.
    '''
    rules = (
        (r"ies( |$)", "y "),       # Plurals: "parties" -> "party"
        (r"s( |$)", " "),          # Plurals
        (r"ing( |$)", " "),        # Present participles / gerunds
        (r"ed( |$)", " "),         # Past tense
        (r"\byour( |$)", "you "),  # Personal
        (r"\bour( |$)", "us "),    # Personal
    )
    for pattern, replacement in rules:
        # regex=True is explicit: pandas >= 2.0 would otherwise treat these
        # patterns as literal text and the function would do nothing.
        X = X.str.replace(pattern, replacement, regex=True)
    return X

# Strip common suffixes; runs after clean() because the rules are lower-case only.
X = stemming(X)

def stopwords(X):
    '''
    Remove a small fixed list of English stop words from a Series of text.

    A stop word is removed only when it is a whole word followed by a space
    or the end of the string; the word and that separator are replaced by a
    single space. The leading word boundary keeps longer words intact
    ("that", "nation" and "for" are not truncated).

    X is a pandas Series of lower-case strings; a new Series is returned.
    '''
    words = ("the", "and", "of", "on", "or", "to", "in", "at")
    pattern = r"\b(?:" + "|".join(words) + r")( |$)"
    # regex=True is explicit: pandas >= 2.0 would otherwise treat the pattern
    # as literal text and nothing would ever be removed.
    return X.str.replace(pattern, " ", regex=True)

# Drop the fixed stop-word list from the stemmed texts.
X = stopwords(X)

def custom(X):
    '''
    Blank out the company-specific words "aapl", "apple" and "inc".

    Every item was selected because it mentions the ticker, so these words
    carry no signal. Each whole-word occurrence is replaced by a single
    space; matching on word boundaries keeps unrelated words such as
    "increase", "since" or "include" intact.

    X is a pandas Series of lower-case strings; a new Series is returned.
    '''
    return X.str.replace(r"\b(?:aapl|apple|inc)\b", " ", regex=True)

# Remove the company-name words that appear in (almost) every selected item.
X = custom(X)

In [None]:
# NBVAL_SKIP
# Hold out 30% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
# Exercise placeholder: build the vectoriser here. It must offer
# fit_transform(), transform() and get_feature_names(), which are called below.
tfidf_vect = # !!!!! YOUR CODE HERE !!!!!
# Fit on the training split only, then apply the fitted vectoriser to the test split.
X_train_counts = tfidf_vect.fit_transform(X_train)
X_test_counts = tfidf_vect.transform(X_test)
print(X_train_counts.shape)
# NOTE(review): newer scikit-learn releases name this get_feature_names_out() -
# confirm against the installed version.
print(tfidf_vect.get_feature_names())

In [None]:
# NBVAL_SKIP
# Exercise placeholder: build the classifier here. It must offer fit(),
# predict() and predict_proba(), which are called in this and later cells.
clf = # !!!!! YOUR CODE HERE !!!!!
clf.fit(X_train_counts, y_train)
# Cross-validate on the training split only; the held-out test split is not touched here.
scores = cross_val_score(clf, X_train_counts, y_train)
# Last expression of the cell, so the notebook displays the mean score.
scores.mean()           

In [None]:
# NBVAL_SKIP
# Score the held-out split; column 1 of predict_proba is used as the ranking
# score for the ROC curve.
y_proba = clf.predict_proba(X_test_counts)
fpr, tpr, _ = metrics.roc_curve(y_test, y_proba[:,1])

plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve')
# Diagonal reference line: what a random classifier would achieve.
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--', label='Random')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC of Random Forest on Balanced TF-IDF data')
plt.legend(loc="lower right")
plt.show()

In [None]:
# NBVAL_SKIP
def lit_confusion_matrix(y_true, y_pred):
    '''
    Return the binary confusion matrix laid out as [[tp, fp], [fn, tn]].

    sklearn reports the matrix as [[tn, fp], [fn, tp]]; this helper moves the
    true positives to the top-left corner, the layout used for plotting a
    profit curve. Unpacking fails with ValueError unless the matrix is 2x2.
    '''
    # Flattened row by row, the sklearn matrix reads tn, fp, fn, tp.
    tn, fp, fn, tp = metrics.confusion_matrix(y_true, y_pred).ravel()
    return np.array([[tp, fp], [fn, tn]])

print(lit_confusion_matrix(y_test, clf.predict(X_test_counts)))

### Tasks

1. Clean the data
2. Train a TF-IDF vectoriser
3. Create a classifier
4. Generate a cross-validated accuracy
5. Plot the ROC curve (and see how little data we have!)

Who can get the highest accuracy? (No kudos for this one though. Because of the small amount of data and the complexity of the model, people will randomly get high scores just by chance.)