# Formatting/Cleaning the Data

## Importing the Necessary Libraries

In [516]:
import pandas as pd
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 70)
import numpy as np
from tqdm import tqdm_notebook as tqdm
import _pickle as pickle

In [508]:
# Load the pickled dictionary of stock DataFrames from disk.
# NOTE(review): unpickling can execute arbitrary code -- only load a file you trust.
with open("stocks_df.pickle", 'rb') as pickle_file:
    stocks_df = pickle.load(pickle_file)

## Setting the index as a date for each DataFrame

In [509]:
def setting_index(df):
    """
    Converts the "Quarter end" column to datetimes and makes it the index.

    The DataFrame is modified in place and the same object is returned.
    Calling the function again on a DataFrame that has already been processed
    (e.g. when the notebook cell is re-run) is a no-op instead of a KeyError.

    Raises a KeyError if "Quarter end" is neither a column nor the index name.
    """
    # Already indexed by "Quarter end": nothing left to convert or move.
    if df.index.name == "Quarter end" and "Quarter end" not in df.columns:
        return df

    df['Quarter end'] = pd.to_datetime(df['Quarter end'])
    df.set_index("Quarter end", inplace=True)
    return df

In [510]:
# Re-index every DataFrame in the dictionary by its "Quarter end" date.
# Only existing keys are reassigned, so the dict can be updated while iterating.
for key, frame in stocks_df.items():
    stocks_df[key] = setting_index(frame)

### Replacing "None" values

In [511]:
# Swap the literal string "None" for a real NaN in every DataFrame (in place).
for frame in tqdm(stocks_df.values()):
    frame.replace("None", np.nan, inplace=True)

HBox(children=(IntProgress(value=0, max=756), HTML(value='')))

## Filtering the Data

### Dropping columns only containing NaN values

In [512]:
# Remove, in place, every column whose values are all NaN.
for frame in tqdm(stocks_df.values()):
    frame.dropna(axis=1, how='all', inplace=True)


HBox(children=(IntProgress(value=0, max=756), HTML(value='')))

### Removing DataFrames from the dictionary with less than 100 rows
Also removing those containing more than 30 NaN values in any of their columns.

In [513]:
# Keys of every DataFrame with fewer than 100 rows.
rem = [key for key in stocks_df.keys() if len(stocks_df[key]) < 100]

# Deleting happens in a second pass because a dict cannot change size
# while it is being iterated.
for key in rem:
    del stocks_df[key]

print("Remaining stock DataFrames in the dictionary: ", len(stocks_df))

Remaining stock DataFrames in the dictionary:  235


In [514]:
# Keys of every DataFrame that has more than 30 NaN values in at least one column.
rem = []

for key in tqdm(stocks_df.keys()):
    # One NaN count per column, computed in a single pass over the frame
    nan_counts = stocks_df[key].isna().sum()
    if (nan_counts > 30).any():
        rem.append(key)

# Deleting happens after the scan because a dict cannot change size
# while it is being iterated.
for key in rem:
    del stocks_df[key]

print("Remaining stock DataFrames in the dictionary: ", len(stocks_df))

HBox(children=(IntProgress(value=0, max=235), HTML(value='')))

Remaining stock DataFrames in the dictionary:  201


After filtering out the stocks and dataframes to our specifications, we are left with 201 usable stocks.

### Converting all values to numerical values

In [518]:
# Convert every column of every DataFrame to a numeric dtype, one column at a time.
# pd.to_numeric is called with its defaults, so an unparseable value raises.
for frame in tqdm(stocks_df.values()):
    for column in frame.columns:
        frame[column] = pd.to_numeric(frame[column])

HBox(children=(IntProgress(value=0, max=201), HTML(value='')))

### Filling the NaN values with the averages in each of their respective columns

In [519]:
# Fill each remaining NaN with the mean of the column it sits in (in place).
for frame in tqdm(stocks_df.values()):
    column_means = frame.mean()
    frame.fillna(column_means, inplace=True)

HBox(children=(IntProgress(value=0, max=201), HTML(value='')))

### Dropping columns with a total sum of 0

In [521]:
# Drop, in place, every column whose values add up to exactly 0.
for frame in tqdm(stocks_df.values()):
    # Decide first, drop once: the column sums do not depend on one another
    zero_sum_cols = [column for column in frame.columns if frame[column].sum() == 0]
    frame.drop(zero_sum_cols, inplace=True, axis=1)

HBox(children=(IntProgress(value=0, max=201), HTML(value='')))

## Creating a new DataFrame with usable data

### Function returning whether or not the price will increase in the next quarter

In [522]:
def price_increase(df, percent=1.00):
    """
    Returns a boolean Series telling whether the price will increase in the next quarter.

    The next quarter's price is read from the row directly above (shift(1)),
    so the first row has nothing to compare against and is always False.

    Optional: `percent` is a multiplier on the current price that the next
    quarter's price must exceed, e.g. percent=1.05 is True only when the price
    rose by more than 5%. The default of 1.00 means "any increase".
    """
    next_quarter_price = df.Price.shift(1)
    # The threshold scales the current price. Scaling the next quarter's price
    # instead would loosen the test and accept small decreases.
    return next_quarter_price > (df.Price * percent)

### Finding the percent improvement from previous quarters to scale the data
Also adding another column containing the boolean statement from the price_increase() function.

In [523]:
def percent_improvement(df, cols):
    """
    Builds a new DataFrame, on the same index as df, holding:
      - 'price_will_increase?': the result of price_increase(df)
      - '<col> %-increase': for each non-boolean column in cols, the value
        divided by the value in the following row (shift(-1)), minus 1

    df itself is not modified.
    """
    changes = pd.DataFrame(index=df.index)
    changes['price_will_increase?'] = price_increase(df)

    # Boolean columns are skipped; a ratio of booleans is not meaningful
    non_bool_cols = (name for name in cols if df[name].dtype != bool)
    for name in non_bool_cols:
        following_row = df[name].shift(-1)
        changes[name + " %-increase"] = df[name] / following_row - 1

    return changes

**Note:** Some values may come back as `NaN` (0 divided by 0) or `inf` (a non-zero value divided by 0) when the value being divided by is 0.

In [524]:
# One percent-change DataFrame per stock, stored under the same key as its source frame
pcnt_df = {
    key: percent_improvement(frame, frame.columns)
    for key, frame in tqdm(stocks_df.items())
}

HBox(children=(IntProgress(value=0, max=201), HTML(value='')))

### Excluding the first and last rows
This is done because the last row has no data to compare percent improvements to and the first row does not have any data to show if the price will increase in the future.

In [525]:
# Keep everything except the first and last row of each percent-change DataFrame.
# NOTE: the result overwrites the same key, so running this cell again trims
# another row from each end.
for key, frame in tqdm(pcnt_df.items()):
    pcnt_df[key] = frame.iloc[1:-1]

HBox(children=(IntProgress(value=0, max=201), HTML(value='')))

In [506]:
# Persist the dictionary of percent-change DataFrames to disk.
with open("percent_df.pickle", 'wb') as pickle_file:
    pickle.dump(pcnt_df, pickle_file)