<a href="https://colab.research.google.com/github/nescoba/portafolio/blob/main/02031705.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Finantial Data

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

sns.set()

## Data Wrangling

In [4]:
dfs = []
for year in range(2014, 2019):
  dfs.append(pd.read_csv(f'{year}_Financial_Data.csv'))
  dfs[year - 2014]['Year'] = year
  dfs[year - 2014]['Price Var'] = dfs[year - 2014][f'{year+1} PRICE VAR [%]']
  dfs[year - 2014].drop(f'{year+1} PRICE VAR [%]', axis=1, inplace=True)

df = dfs[0]
for year in range(2015, 2019):
  df = pd.merge(df, dfs[year - 2014], how='outer')

df.shape

(22077, 226)

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

class ColumnDropper(BaseEstimator, TransformerMixin):
  def __init__(self, dropped_cols, format=True):
    self.format = format
    self.dropped_cols = dropped_cols

  def fit(self, X, y=None):
    return self

  def transform(self, X):
    if self.format:
      return X.drop(self.dropped_cols, axis=1)

    else:
      return X


class NaFiller(BaseEstimator, TransformerMixin):
  def __init__(self, filled_cols, grouper, format=True):
    self.format = format
    self.filled_cols = filled_cols
    self.grouper = grouper

  def fit(self, X, y=None):
    return self

  def transform(self, X):
    if self.format:
      # for column in self.filled_cols:
      #   for value in pd.unique(X[self.grouper]):
      #     mask = (X[column].isnull()) & (X[self.grouper]==value)
      #     X.loc[mask, column] = np.mean(X.loc[(X[column].notnull()) & (X[self.grouper]==value), column])
      for column in self.filled_cols:
        X.loc[X[column].isnull(), column] = X[column].mean()
    return X


class Encoder(BaseEstimator, TransformerMixin):
  def __init__(self, cols, format=True):
    self.format = format
    self.cols = cols
    
  def fit(self, X, y=None):
    return self

  def transform(self, X):
    if self.format:
      for column in self.cols:
        matrix = OneHotEncoder(sparse=False).fit_transform(X[column].array.reshape(-1,1))
        columns = [f'{column}_{i}' for i in range(matrix.shape[1])]
        df = pd.DataFrame(matrix, columns = columns)
        X.drop(column, axis=1, inplace=True)
        return pd.concat([X, df], axis=1)
    else:
      return X


class Scaler(BaseEstimator, TransformerMixin):
  def __init__(self, cols, format=True):
    self.format = format
    self.cols = cols
    
  def fit(self, X, y=None):
    return self

  def transform(self, X):
    if self.format:
      for column in self.cols:
        # scaled_column = StandardScaler().fit_transform(X[column].array.reshape(-1,1))
        # df = pd.DataFrame(scaled_column, columns=[f'scaled_{column}'])
        # X.drop(column, axis=1, inplace=True)
        #return pd.concat([X, df], axis=1)
        sigma = np.std(X[column])
        if sigma != 0:
          X[f'scaled_{column}'] = (X[column] - X[column].mean())/np.std(X[column])
          X.drop(column, axis=1, inplace=True)
        else:
          X[column] = 0
  
    return X



In [6]:
dropped_cols = ['Unnamed: 0', 'Price Var', 'Class']
num_columns = list(df.columns.copy())
non_num = ['Unnamed: 0', 'Price Var', 'Class', 'Sector']
for col in non_num:
  num_columns.remove(col)

y = df['Class']
pipeline = Pipeline([('col_dropper', ColumnDropper(dropped_cols, format =True)),
                     ('na_filler', NaFiller(num_columns, 'Sector', format=True)), 
                     ('scaler', Scaler(num_columns, format=True)), 
                     ('encoder', Encoder(['Sector'], format=True))])

X = pipeline.fit_transform(df)

X.shape


(22077, 233)

In [7]:
X.dropna().shape

(22077, 233)

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=0, train_size=0.2)

In [9]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,  random_state=0, train_size=0.2)

### Defining function to evaluate classiffiers 

In [10]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

def eval_clf(clf):
  y_val_pred = clf.predict(X_val)
  scores = [accuracy_score(y_val, y_val_pred), f1_score(y_val, y_val_pred), 
            recall_score(y_val, y_val_pred), precision_score(y_val, y_val_pred)]
  names = ['accuracy', 'f1', 'recall', 'precision']
  return pd.Series(scores, index=names)


## Model Training 

### Logistic Regression

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

param_grid = {'C':[0.00000001, 0.0000001, .00001, 0.0001]}
log_reg = LogisticRegression()
grid_log_reg = GridSearchCV(log_reg, param_grid, scoring='f1')
grid_log_reg.fit(X_train, y_train)
grid_log_reg.best_params_


{'C': 1e-08}

In [17]:
eval_clf(grid_log_reg.best_estimator_)

accuracy     0.559740
f1           0.717735
recall       1.000000
precision    0.559740
dtype: float64

### Support Vector Machines 

In [19]:
from sklearn.svm import SVC 

param_grid = [{'kernel': ['poly'], 'degree': [3,4,5], 'C': [0.001, 0.01, 0.1, 1, 10, 100]}, 
              {'kernel': ['rbf'], 'C': [0.001, 0.01, 0.1, 1, 10, 100], 'gamma':[0.001, 0.01, 0.1, 1, 10, 100]}]
svm = SVC()
grid_svm = GridSearchCV(svm, param_grid, scoring='f1')
grid_svm.fit(X_train, y_train)
grid_svm.best_params_

{'C': 10, 'gamma': 1, 'kernel': 'rbf'}

In [20]:
eval_clf(grid_svm.best_estimator_)

accuracy     0.578709
f1           0.712852
recall       0.934244
precision    0.576287
dtype: float64

### Random Forest 

In [21]:
from sklearn.ensemble import RandomForestClassifier

param_grid = {'n_estimators': [100, 200, 300, 500, 1000], 
              'max_leaf_nodes':[1, 10, 20, 30, 50]}
forest = RandomForestClassifier()
grid_forest = GridSearchCV(forest, param_grid, scoring='f1')
grid_forest.fit(X_train, y_train)
grid_forest.best_params_

25 fits failed out of a total of 125.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_forest.py", line 467, in fit
    for i, t in enumerate(trees)
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/usr/local/lib/python3.7/

{'max_leaf_nodes': 10, 'n_estimators': 100}

In [22]:
eval_clf(grid_forest.best_estimator_)

accuracy     0.620045
f1           0.700446
recall       0.793627
precision    0.626848
dtype: float64