# Introduction



## Library/Dataset Import

In [1]:
from dateutil.parser import parse 

# NumPy for numerical computing
import numpy as np

# Pandas for DataFrames
import pandas as pd

# Matplotlib for visualization
from matplotlib import pyplot as plt
%matplotlib inline 
plt.rcParams.update({'figure.figsize': (10, 7), 'figure.dpi': 120})

# Seaborn for easier visualization
import seaborn as sns
sns.set_style('darkgrid')

# Scikit-Learn's make_pipeline function
from sklearn.pipeline import make_pipeline

# Scikit-Learn's StandardScaler
from sklearn.preprocessing import StandardScaler

# Scikit-Learn's LinearRegression algorithm
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import train_test_split

# Import Regularized Regression algos
from sklearn.linear_model import Lasso, Ridge, ElasticNet

# Import Tree Ensemble algos
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Import model metrics. 
from sklearn import metrics



In [4]:
import quandl
quandl.ApiConfig.api_key = 'HybxHtVE5n-VgTJmhouM'
data = quandl.get('NSE/OIL')
#print( data.head())
print( data.shape )

(2299, 7)


In [20]:
df = quandl.get_table('SHARADAR/SF1', ticker='AAPL')
df.dimension.unique()

array(['MRY'], dtype=object)

In [4]:
quandl.get_table('SHARADAR/INDICATORS', table='SF1')

Unnamed: 0_level_0,table,indicator,isfilter,isprimarykey,title,description,unittype
None,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,SF1,revenue,N,N,Revenues,[Income Statement] Amount of Revenue recognize...,currency
1,SF1,cor,N,N,Cost of Revenue,[Income Statement] The aggregate cost of goods...,currency
2,SF1,sgna,N,N,Selling General and Administrative Expense,[Income Statement] A component of [OpEx] repre...,currency
3,SF1,rnd,N,N,Research and Development Expense,[Income Statement] A component of [OpEx] repre...,currency
4,SF1,opex,N,N,Operating Expenses,[Income Statement] Operating expenses represen...,currency
5,SF1,intexp,N,N,Interest Expense,[Income Statement] Amount of the cost of borro...,currency
6,SF1,taxexp,N,N,Income Tax Expense,[Income Statement] Amount of current income ta...,currency
7,SF1,netincdis,N,N,Net Loss Income from Discontinued Operations,[Income Statement] Amount of loss (income) fro...,currency
8,SF1,consolinc,N,N,Consolidated Income,[Income Statement] The portion of profit or lo...,currency
9,SF1,netincnci,N,N,Net Income to Non-Controlling Interests,[Income Statement] The portion of income which...,currency


In [6]:
quandl.get_table('SHARADAR/INDICATORS', indicator='revenue')



Unnamed: 0_level_0,table,indicator,isfilter,isprimarykey,title,description,unittype
None,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,SF1,revenue,N,N,Revenues,[Income Statement] Amount of Revenue recognize...,currency


In [3]:
import glob
path = r'C:\Users\Matthew\Documents\Machine Learning Accelerator\Capstone Project\1 Predict Stock Price\Internal'
all_files = glob.glob(path + "/*.csv")

li = []
for filename in all_files:
    df = pd.read_csv(filename, index_col=0, header=0, parse_dates=True, infer_datetime_format=True)
    df = df.transpose(copy = True)
    name = str(filename).strip(path)
    df['Financial_Statement'] = name[:-4]

    li.append(df)
    print(name)
    # Concatenate all data into one DataFrame
internal_raw = pd.concat(li, axis=1, ignore_index=False)


Balance_Sheet_10yr_Qtrly.csv
Flow_Sheet_10yr_Qtrly.csv
_Sheet_10yr_Qtrly.csv


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  from ipykernel import kernelapp as app


In [None]:
# Change dtype of Financial Statment to Category.
internal_raw['Financial_Statement'] = internal_raw['Financial_Statement'].astype('category')
# Double Check above.
internal_raw['Financial_Statement'].dtypes
# Change Filename of Financial Statements.
internal_raw['Financial_Statement'] = internal_raw['Financial_Statement'].replace(['Flow_Sheet_10yr_Qtrly', '_Sheet_10yr_Qtrly'], ['Cash_Flow_Sheet_10yr_Qtrly', 'Income_Sheet_10yr_Qtrly'])
internal_raw['Financial_Statement'].dtypes

In [None]:
# Encode categorical variables into indicator features.
internal_raw['Financial_Statement_cat'] = internal_raw['Financial_Statement'].cat.codes
internal_raw.head()

In [None]:
internal_raw['Financial_Statement'].unique()

In [None]:
internal_raw.index.name = 'Date'
#internal_raw

In [None]:
internal_raw['Date'] = pd.to_datetime(internal_raw.index, format='%d-%m-%y')
internal_raw['Day'] = internal_raw['Date'].dt.day
internal_raw['Month'] = internal_raw['Date'].dt.month
internal_raw['Year'] = internal_raw['Date'].dt.year
internal_raw.head()

In [None]:
print(internal_raw['Year'].unique())

In [None]:
internal_raw.groupby('Month').agg(['min', 'median', 'mean', 'max'])

In [None]:
# Null values along columns (axis=1)
internal_raw[internal_raw.isnull().any(axis=1)]

from pandas import Grouper

groups = internal_raw.groupby(Grouper(key='Month', axis=1))
years = internal_raw
for name, group in groups:
    years[name.year] = group.values
years.plot(subplots=True, legend=False)
plt.figure(figsize=(60,30), dpi=200)
plt.show()

import glob

path = r'C:\Users\Matthew\Desktop\HSBC MPF 2019' # use your path
all_files = glob.glob(path + "/*.csv")

li = []

for filename in all_files:
    
    df = pd.read_csv(filename, index_col=0, header=0)
    
    df = df.transpose(copy = True)
    Mask = df['Constituent Fund'] == 'BID'
    df = df[Mask]
    
    for col in df.columns:
        
        df['{}'.format(col)] = df['{}'.format(col)].str.rstrip('\t')
        df.index = df.index.str.lstrip('\t')
        
    li.append(df)
    # Concatenate all data into one DataFrame
frame = pd.concat(li, axis=0, ignore_index=False)
frame

## Exploratory Analysis
Internal --> External Factors

In [None]:
# Numerical description
internal_raw.describe()

In [None]:
internal_raw.dtypes.unique()

In [None]:
internal_raw.set_index(['Year','Month'], inplace=True, verify_integrity=False)
internal_raw.sort_index(inplace=True,na_position='first')


In [None]:
internal_raw.head(20)

## Data-Cleaning
Internal --> External Factors

## Feature Engineering 
Internal --> External Factors

### Balance Sheet


In [None]:
# Assets(cash, accounts receivable) 


# Liabilities(expense & debt) 


# Shareholder equity(equity capital invest., retained earnings from periodic net income)


# Balance: (Assets - Liabilities = Shareholder’s Equity = Book Value)


# Liquidation Value = Market Price to Book ratio 



### Income Statement

 **Profit margin helps to show where company costs are low or high at different points of the operations.**

In [None]:
# Revenue - Direct Costs = Gross Profit (margin = /Revenue)


# Gross Profit - Indirect Expenses = Operating Profit (margin = /Revenue)

# Operating Profit - Interest & Taxes = Net Profit (margin = /Revenue)



### Cash Flow Statement
**Bottom line = how much cash available in company**

In [None]:
# Operating activities, CF + Net Income 


# Investing activities, CF + Firmwide Investments


# Financing activities, CF + Debt & Equity Financing




Model Evalutation 
Internal compare External Factors

## Algorithm Selection 

## Model Training

## Insights & Analysis