In [2]:
import pandas as pd
import numpy as np
from mlchartist.preprocessing import to_date, proper_name, proper_col, calculate_real_returns, get_indicators, calculate_past_returns
import os

In [3]:
def transform_file(filename):
    """
    Run every preprocessing step (helpers imported from preprocessing.py)
    on a single raw file.

    Parameters
    ----------
    filename : str
        Path of the file to load; it is read with ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The frame produced by chaining ``get_indicators``,
        ``calculate_real_returns`` and ``calculate_past_returns``, with rows
        containing NaN removed, the 'open', 'high', 'low', 'close' and 'vol'
        columns dropped, and the index renumbered from zero.
    """
    raw = pd.read_csv(filename)

    # Normalise the header names, then convert the 'date' column.
    raw.columns = list(map(proper_name, raw.columns))
    raw['date'] = to_date(raw, 'date')

    # Columns that are not used by any later step.
    unused_cols = ['per', 'time', 'openint']
    prices = proper_col(raw).drop(columns=unused_cols)

    # Feature engineering: indicators first, then the two return helpers.
    with_indicators = get_indicators(prices)
    with_returns = calculate_past_returns(calculate_real_returns(with_indicators))

    # Drop incomplete rows before removing the raw price/volume columns.
    price_cols = ['open', 'high', 'low', 'close', 'vol']
    complete_rows = with_returns.dropna()
    return complete_rows.drop(columns=price_cols).reset_index(drop=True)

def save_ticker(df, pathname):
    """
    Write the processed dataframe to ``pathname`` as CSV.

    The index is not written to the file. The folder that ``pathname``
    points into must already exist; nothing is created here.
    """
    df.to_csv(path_or_buf=pathname, index=False)


In [4]:
def _ticker_csv_name(filename):
    """Turn a raw '<ticker>.<xx>.txt' file name into '<ticker>.csv'."""
    stem = filename[:-len('.txt')]
    ticker, dot, market = stem.rpartition('.')
    # A trailing two-character segment (e.g. '.us') is dropped, which matches
    # the previous fixed seven-character cut; any other name keeps its stem.
    if dot and len(market) == 2:
        stem = ticker
    return stem + '.csv'


def build_data(raw_data_folder=r'../raw_data/data/daily/us/nasdaq stocks/', destination_path=r'../raw_data/processed/', len_hist=60):
    """
    Transforms and stores at destination_path all .txt files in raw_data_folder.

    The folder tree under raw_data_folder is walked recursively; files that sit
    directly in a folder whose path ends with '.ipynb_checkpoints' are skipped.
    Each remaining '.txt' file with enough lines is passed to transform_file
    and the result is written with save_ticker as '<ticker>.csv'.

    Parameters
    ----------
    raw_data_folder : str
        Root folder that is searched for raw '.txt' files.
    destination_path : str
        Folder receiving the processed CSV files. It is created (including
        missing parents) when it does not exist yet.
    len_hist : int
        Minimum number of lines a file must contain to be processed. Every
        line of the file is counted, so a header line counts as well.

    Returns
    -------
    None. The number of transformed files is printed.
    """
    # Create the output folder up front so the first save cannot fail on a
    # missing directory after work has already been done.
    if destination_path:
        os.makedirs(destination_path, exist_ok=True)

    files_changed = 0
    for subdir, dirs, files in os.walk(raw_data_folder):
        # The folder check does not depend on the file, so do it once per folder.
        if subdir.endswith('.ipynb_checkpoints'):
            continue
        for filename in files:
            # Require the real extension, not just a name ending in 'txt'.
            if not filename.endswith('.txt'):
                continue
            filepath = os.path.join(subdir, filename)
            # Close the handle before the file is read again by transform_file.
            with open(filepath) as f:
                rows_num = sum(1 for _ in f)
            if rows_num < len_hist:
                continue
            df = transform_file(filepath)
            targetpath = os.path.join(destination_path, _ticker_csv_name(filename))
            save_ticker(df, targetpath)
            files_changed += 1
    print(f'Number of files transformed {files_changed}')

In [5]:
# Show the current working directory: the relative paths passed to build_data
# in the next cell are resolved against it.
!pwd

/home/bob/code/marcin-sobocinski/mlchartist/notebooks/companies_clusters


In [6]:
# Run the preprocessing over the raw NASDAQ files and write the resulting CSVs
# to the processed folder; both paths are relative to the working directory.
build_data(raw_data_folder='../../raw_data/nasdaq_stocks/', destination_path='../../raw_data/processed/')

  dip[i] = 100 * (self._dip[i] / self._trs[i])
  din[i] = 100 * (self._din[i] / self._trs[i])
  dip[i] = 100 * (self._dip[i] / self._trs[i])
  din[i] = 100 * (self._din[i] / self._trs[i])
  dip[i + self._window] = 100 * (self._dip[i] / self._trs[i])
  din[i + self._window] = 100 * (self._din[i] / self._trs[i])


Number of files transformed 3544
