In [115]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import set_config; set_config(display='diagram')
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate

In [116]:
df = pd.read_csv('../data/stocks_quarterly.csv')

In [117]:
cols_to_drop = df.isnull().mean().sort_values(ascending = False).head(45).index

In [118]:
cols_to_drop = list(cols_to_drop) + ['fiscalDateEnding','reportedDate','price','nasd_price','next_year_date','next_year_price','nasd_ny_price','symbol','Nasdaq_Performance', 'Stock_Performance','Label']

In [119]:
columns = list(df.columns)

for colum in cols_to_drop:
    columns.remove(colum)

In [120]:
X = df.drop(columns = 'Label')
y = df.Label

In [121]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size= 0.2)

In [122]:
column_transformer = make_column_transformer((KNNImputer(),columns),
                                              remainder="drop")

pipline = make_pipeline(column_transformer, StandardScaler())

In [123]:
X_train = pipline.fit_transform(X_train)
X_test = pipline.transform(X_test)

In [124]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(max_depth=110, max_features=3, min_samples_leaf=3,
                      min_samples_split=8, n_estimators=200)

cv = cross_validate(forest,X_train,y_train,cv =5, scoring = ['accuracy','precision'] )

In [125]:
cv['test_precision'].mean()

0.6963318596948477

In [126]:
from sklearn.metrics import confusion_matrix

In [127]:
forest.fit(X_train,y_train)
y_pred = forest.predict(X_test)

In [129]:
matrix = confusion_matrix(y_test,y_pred,labels = [1,0])

In [134]:
matrix[0][0]/(matrix[0][0]+matrix[1][0])

0.7126436781609196

In [None]:

# automatic svm hyperparameter tuning using skopt for the ionosphere dataset
from pandas import read_csv
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.model_selection import RepeatedStratifiedKFold
from skopt import BayesSearchCV
# load dataset
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/ionosphere.csv'
dataframe = read_csv(url, header=None)
# split into input and output elements
data = dataframe.values
X, y = data[:, :-1], data[:, -1]
print(X.shape, y.shape)
# define search space
params = dict()
params['C'] = (1e-6, 100.0, 'log-uniform')
params['gamma'] = (1e-6, 100.0, 'log-uniform')
params['degree'] = (1,5)
params['kernel'] = ['linear', 'poly', 'rbf', 'sigmoid']
# define evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define the search
search = BayesSearchCV(estimator=SVC(), search_spaces=params, n_jobs=-1, cv=cv)
# perform the search
search.fit(X, y)
# report the best result
print(search.best_score_)
print(search.best_params_)