In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import seaborn as sns

### Data Import using Pandas

In [29]:
df = pd.read_csv('lecture_data.csv')

In [32]:
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,2006-01-03,10.34,10.68,10.32,10.68,201853036
1,2006-01-04,10.73,10.85,10.64,10.71,155225609
2,2006-01-05,10.69,10.7,10.54,10.63,112396081
3,2006-01-06,10.75,10.96,10.65,10.9,176139334
4,2006-01-09,10.96,11.03,10.82,10.86,168861224


### Feature Engineering: Unpacking the Date Column

In [4]:
# Unpacks the date column to multiple features
from datetime import datetime

def process_dates(df, column='Date', dform='%Y-%m-%d', year=True, month=True, 
                  day=True, weekday=True, inplace=False, drop_date=False):
    """Split a column of date strings into numeric calendar features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the date strings in ``column``.
    column : str
        Name of the column to parse.
    dform : str
        ``datetime.strptime`` format of the date strings.
    year, month, day, weekday : bool
        Select which features to extract. ``weekday`` follows
        ``datetime.weekday()``, i.e. Monday is 0.
    inplace : bool
        If True, the features are written to ``df`` as new columns (named
        'year', 'month', 'day', 'weekday') and None is returned.
    drop_date : bool
        If True, ``column`` is removed from ``df`` in place. This happens
        regardless of ``inplace``.

    Returns
    -------
    dict or None
        ``{feature_name: list_of_ints}`` containing only the requested
        features, or None when ``inplace`` is True.

    Raises
    ------
    ValueError, TypeError
        Propagated from ``datetime.strptime`` when a value cannot be parsed.
    """
    # Iterate over the values themselves instead of looking up dates[i]:
    # dates[i] is a *label* lookup and fails on any index that is not 0..n-1.
    parsed = [datetime.strptime(value, dform) for value in df[column]]

    # (requested?, extractor) per feature. Building the dict from the
    # requested features only replaces the old "fill everything, then delete
    # the empty lists while iterating" step, which raised
    # "RuntimeError: dictionary changed size during iteration" as soon as
    # any feature flag was False.
    extractors = {
        'year': (year, lambda d: d.year),
        'month': (month, lambda d: d.month),
        'day': (day, lambda d: d.day),
        'weekday': (weekday, lambda d: d.weekday()),
    }
    date_dict = {
        name: [extract(d) for d in parsed]
        for name, (wanted, extract) in extractors.items()
        if wanted
    }

    if drop_date:
        df.drop(column, inplace=True, axis=1)

    if inplace:
        for key, values in date_dict.items():
            df[key] = values
        return None

    return date_dict

In [26]:
process_dates(df, inplace=True, drop_date=True)

In [27]:
df.head()

Unnamed: 0,Open,High,Low,Close,Volume,year,month,day,weekday
0,10.34,10.68,10.32,10.68,201853036,2006,1,3,1
1,10.73,10.85,10.64,10.71,155225609,2006,1,4,2
2,10.69,10.7,10.54,10.63,112396081,2006,1,5,3
3,10.75,10.96,10.65,10.9,176139334,2006,1,6,4
4,10.96,11.03,10.82,10.86,168861224,2006,1,9,0


### Scaling 

In [7]:
from sklearn.preprocessing import StandardScaler

In [8]:
# Standardise every column of df: fit() learns the per-column statistics,
# transform() applies them and returns a NumPy array.
# NOTE(review): the scaler is fitted on all rows of df, i.e. before the
# train/test split further down — confirm that this is intended.
scaler = StandardScaler()
scaler.fit(df)
snp = scaler.transform(df)

In [9]:
# Returns a NumPy array!
type(snp)

numpy.ndarray

In [10]:
sdf = pd.DataFrame(snp, columns=df.columns)

In [11]:
sdf

Unnamed: 0,Open,High,Low,Close,Volume,year,month,day,weekday
0,-1.220492,-1.218115,-1.214590,-1.212307,0.694305,-1.593657,-1.620300,-1.457731,-0.730916
1,-1.211731,-1.214320,-1.207354,-1.211633,0.236451,-1.593657,-1.620300,-1.343360,-0.016101
2,-1.212630,-1.217668,-1.209616,-1.213430,-0.184110,-1.593657,-1.620300,-1.228989,0.698715
3,-1.211282,-1.211865,-1.207128,-1.207366,0.441811,-1.593657,-1.620300,-1.114618,1.413531
4,-1.206565,-1.210303,-1.203284,-1.208264,0.370344,-1.593657,-1.620300,-0.771504,-1.445732
...,...,...,...,...,...,...,...,...,...
3014,2.471086,2.458761,2.497915,2.478091,-1.127232,1.593369,1.592151,0.715322,1.413531
3015,2.383929,2.370599,2.388923,2.378381,-0.961911,1.593369,1.592151,1.172807,-0.730916
3016,2.368205,2.355199,2.389602,2.379055,-1.076674,1.593369,1.592151,1.287178,-0.016101
3017,2.388422,2.379081,2.407013,2.389834,-1.125948,1.593369,1.592151,1.401550,0.698715


### Converting our data to something usable in non-series models

In [12]:
def create_series(df, col, out='Target', inplace=False):
    """Add a next-step target column for supervised learning on a series.

    Row ``i`` of the result gets ``out`` = value of ``col`` in row ``i + 1``.
    The last row has no successor and is removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame, rows in chronological order.
    col : str
        Column whose next value becomes the target.
    out : str
        Name of the target column to create (overwritten if it exists).
    inplace : bool
        If True, modify ``df`` itself and return None; otherwise work on a
        copy and return it.

    Returns
    -------
    pandas.DataFrame or None
    """
    if not inplace:
        df = df.copy()

    # Read the column once and shift it positionally. The previous per-row
    # df.iloc[i][col] lookup built a full row Series for every element (slow,
    # and it upcast values when the frame had mixed dtypes).
    next_values = df[col].to_numpy()[1:].copy()

    # Drop the last row by its actual label: df.drop(len(df) - 1) only works
    # when the index happens to be 0..n-1. Index labels are assumed unique.
    if len(df) > 0:
        df.drop(df.index[-1], inplace=True)
    df[out] = next_values

    if not inplace:
        return df
    return None

def create_bin_series(df, col, out='Target', inplace=False):
    """Add a binary next-step direction target for a series.

    Row ``i`` of the result gets ``out`` = 1 when ``col`` in row ``i + 1`` is
    greater than or equal to ``col`` in row ``i``, else 0. The last row has no
    successor and is removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame, rows in chronological order.
    col : str
        Numeric column whose next move is to be labelled.
    out : str
        Name of the target column to create (overwritten if it exists).
    inplace : bool
        If True, modify ``df`` itself and return None; otherwise work on a
        copy and return it.

    Returns
    -------
    pandas.DataFrame or None
    """
    if not inplace:
        df = df.copy()

    values = df[col].to_numpy()
    # step[i] = value[i + 1] - value[i]; a NaN step compares False and is
    # therefore labelled 0, exactly like the element-wise `>= 0` test.
    step = values[1:] - values[:-1]
    labels = (step >= 0).astype(int)

    # Drop the last row by its actual label: df.drop(len(df) - 1) only works
    # when the index happens to be 0..n-1. Index labels are assumed unique.
    if len(df) > 0:
        df.drop(df.index[-1], inplace=True)
    df[out] = labels

    if not inplace:
        return df
    return None

In [13]:
create_bin_series(sdf, col='Close', inplace=True)

In [14]:
sdf.head()

Unnamed: 0,Open,High,Low,Close,Volume,year,month,day,weekday,Target
0,-1.220492,-1.218115,-1.21459,-1.212307,0.694305,-1.593657,-1.6203,-1.457731,-0.730916,1
1,-1.211731,-1.21432,-1.207354,-1.211633,0.236451,-1.593657,-1.6203,-1.34336,-0.016101,0
2,-1.21263,-1.217668,-1.209616,-1.21343,-0.18411,-1.593657,-1.6203,-1.228989,0.698715,1
3,-1.211282,-1.211865,-1.207128,-1.207366,0.441811,-1.593657,-1.6203,-1.114618,1.413531,0
4,-1.206565,-1.210303,-1.203284,-1.208264,0.370344,-1.593657,-1.6203,-0.771504,-1.445732,1


### Train/Test Split

In [15]:
# Give first 70% of data for training
# Using first 70% because the data is time series
# Chronological hold-out: the earliest 70% of the rows train the model and
# the remaining rows test it (no shuffling, because this is a time series)
split = round(.7 * len(sdf))

# Pull NumPy arrays out of the DataFrame: every column but the last is a
# feature, the last column is the label
X, y = sdf.values[:, :-1], sdf.values[:, -1]

X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

### Model and training

In [16]:
from sklearn.ensemble import RandomForestClassifier

In [17]:
%time
# n_estimators is our hyperparamter
rfc = RandomForestClassifier(n_estimators=100, random_state=407, n_jobs=-1)
rfc.fit(X_train, y_train)
preds = rfc.predict(X_test)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.25 µs


### Performance evaluation

In [18]:
from sklearn.metrics import classification_report,confusion_matrix

In [19]:
print(confusion_matrix(y_test, preds))
print(classification_report(y_test, preds))

[[240 193]
 [232 240]]
              precision    recall  f1-score   support

         0.0       0.51      0.55      0.53       433
         1.0       0.55      0.51      0.53       472

    accuracy                           0.53       905
   macro avg       0.53      0.53      0.53       905
weighted avg       0.53      0.53      0.53       905



In [20]:
train_preds = rfc.predict(X_train)

In [21]:
print(confusion_matrix(y_train, train_preds))
print(classification_report(y_train, train_preds))

[[ 996    0]
 [   0 1117]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       996
         1.0       1.00      1.00      1.00      1117

    accuracy                           1.00      2113
   macro avg       1.00      1.00      1.00      2113
weighted avg       1.00      1.00      1.00      2113



In [22]:
# Refit with a much smaller forest (5 trees) and predict on both splits
rfc = RandomForestClassifier(n_jobs=-1, random_state=407, n_estimators=5)
rfc.fit(X_train, y_train)
preds = rfc.predict(X_test)
train_preds = rfc.predict(X_train)

# Report the held-out test split first, then the training split
for truth, predicted in ((y_test, preds), (y_train, train_preds)):
    print(confusion_matrix(truth, predicted))
    print(classification_report(truth, predicted))

[[164 269]
 [145 327]]
              precision    recall  f1-score   support

         0.0       0.53      0.38      0.44       433
         1.0       0.55      0.69      0.61       472

    accuracy                           0.54       905
   macro avg       0.54      0.54      0.53       905
weighted avg       0.54      0.54      0.53       905

[[ 931   65]
 [  60 1057]]
              precision    recall  f1-score   support

         0.0       0.94      0.93      0.94       996
         1.0       0.94      0.95      0.94      1117

    accuracy                           0.94      2113
   macro avg       0.94      0.94      0.94      2113
weighted avg       0.94      0.94      0.94      2113

