In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Machine Learning & Reinforcement Learning 
## Session 06b - Data Organization
### Index Data

<img src='../../../prasami_images/prasami_color_tutorials_small.png' style = 'width:400px;' alt="By Pramod Sharma : pramod.sharma@prasami.com" align="left"/>

In [3]:
###------------------
### Import statements
###------------------

import os
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [4]:
# Some Basic Parameters for housekeeping

# set location of input files:
inpDir = os.path.join('..','..', '..', 'input')

# set location of inputs for this module
moduleDir = 'machine_learning'

# set location of output files
outDir = os.path.join('..', 'output')

# define and set random state 
RANDOM_STATE = 24
np.random.seed(RANDOM_STATE) # Set Random Seed for reproducible  results

NOISE = 0.1
N_SAMPLES = 1000
ALPHA = 0.001

# parameters for Matplotlib
params = {'legend.fontsize': 'small',
          'figure.figsize': (15, 5),
          'axes.labelsize': 'large',
          'axes.titlesize':'x-large',
          'xtick.labelsize':'large',
          'ytick.labelsize':'large',
          'savefig.dpi': 75,
          'image.interpolation': 'none',
          'savefig.bbox' : 'tight',
          'lines.linewidth' : 1,
          'legend.numpoints' : 1
         }

CMAP = plt.cm.rainbow
plt.rcParams.update(params);
plt.set_cmap(CMAP);

plt.style.use('seaborn-v0_8-darkgrid') # plt.style.use('ggplot')

TEST_SIZE = 0.25

<Figure size 1500x500 with 0 Axes>

## Missing Data and Imputer Class
Lets look at Index Data

## Data

|       | **Attribute** | **Information** |
| :---  |     :---      |       :---      |      
| **1** | **Index** | A stock market index is a measure of the performance of a particular group of stocks representing a portion of the overall market |                
| **2** | **Date** | The date refers to the specific day on which the stock market data was recorded |                        
| **3** | **Open** | The price of the security at the beginning of the trading day |
| **4** | **High** | The highest price at which the security traded during the day |
| **5** | **Low** | The lowest price at which the security is traded during the day |                     
| **6** | **Close** | The price of the security at the end of the trading day |
| **7** | **Adj Close** | The adjusted closing price takes into account any corporate actions, such as stock splits, dividends, or rights offerings, that occurred after the markets closed. It reflects the closing price of the stock adjusted for these actions |  
| **8** | **Volume** | The total number of shares or contracts that were traded during the day |                      

In [33]:
market_df = pd.read_csv("market_index_data.csv")
market_df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'market_index_data.csv'

### Pick up a unique index

Taking index as NYA!

In [None]:
data_df = market_df[market_df['Index'] == 'NYA'].copy()
data_df.shape

In [None]:
data_df.dtypes

Both numerical and categorical columns are present!

In [None]:
data_df['Date'] = pd.to_datetime(data_df['Date'], utc=True)   
data_df.sort_values(by='Date')
data_df.info()

In [None]:
# Plotting count plot by year
Year = data_df['Date'].dt.year

plt.figure()
ax=sns.countplot(data=data_df, x=Year)

for cont in ax.containers:
    ax.bar_label(cont)

plt.ylabel("Count", fontsize= 15)

plt.xlabel("Year", fontsize= 15)

plt.xticks(rotation=90)

plt.show()

In [None]:
# Drop single data from 1965
data_df=data_df.drop(labels=0, axis=0)
data_df

## Outliers

In [None]:
data_df.describe().T

### Plotting 'Adjusted Close vs rest'

In [None]:
fig, axes = plt.subplots(2,3, figsize =(15, 8), sharey = True)
axes = axes.ravel()
ax = axes[0]
sns.scatterplot(data=data_df, x="Date", y="Adj Close", ax = ax)

ax.set_ylabel("Adj Close")

ax.set_xlabel("Date")

ax = axes[1]
sns.scatterplot(data=data_df, x="Open", y="Adj Close", ax = ax)

ax.set_ylabel("Adj Close")

ax.set_xlabel("Open")

plt.tight_layout()

ax = axes[2]
sns.scatterplot(data=data_df, x="Close", y="Adj Close", ax = ax)

ax.set_ylabel("Adj Close")

ax.set_xlabel("Close")

plt.tight_layout()

ax = axes[3]

sns.scatterplot(data=data_df, x="High", y="Adj Close", ax = ax)

ax.set_ylabel("Adj Close")

ax.set_xlabel("High")

plt.tight_layout()

ax = axes[4]

sns.scatterplot(data=data_df, x="Low", y="Adj Close", ax = ax)

ax.set_ylabel("Adj Close")

ax.set_xlabel("Low")

plt.tight_layout()

ax = axes[5]

sns.scatterplot(data=data_df, x="Volume", y="Adj Close", ax = ax)

ax.set_ylabel("Adj Close")

ax.set_xlabel("Volume")

plt.tight_layout()

plt.show()

In [None]:
# Adj Close > 5000, Open < 1000 
display(data_df[(data_df['Adj Close']>5000)&(data_df['Open']<1000)])
        
# 2- Adj Close < 1000, Open > 5000
display(data_df[(data_df['Adj Close']<1000)&(data_df['Open']>5000)])
        
# Adj Close > 5000, Close < 1000
display(data_df[(data_df['Adj Close']>5000)&(data_df['Close']<1000)])

# 1- Adj Close > 5000, High < 1000
display(data_df[(data_df['Adj Close']>5000)&(data_df['High']<1000)])

# 2- Adj Close < 1000, High > 5000 
display(data_df[(data_df['Adj Close']<1000)&(data_df['High']>5000)])

# Adj Close > 5000, Low < 1000
display(data_df[(data_df['Adj Close']>5000)&(data_df['Low']<1000)])

In [None]:
# Assessing the 0 values of the "Volume" column
len(data_df[data_df["Volume"]==0])

Records 831, 852, 829, 833 have outliers.

In [None]:
outliers = [831, 852, 829, 833]

### Missing data

In [None]:
data_df.isnull().sum()

In [None]:
# Detecting the NAN values indexes
data_df[data_df.isnull().any(axis=1)]

In [None]:
# Dropping rows where most values are missing
data_df=data_df.dropna(thresh=3)   # 3 is picked up after looking at above data

In [None]:
data_df = data_df.ffill()

In [None]:
# Detecting the NAN values indexes
data_df[data_df.isnull().any(axis=1)]

### Removing outliers

In [None]:
data_df.iloc[outliers]

In [None]:
data_df = data_df.drop(outliers, axis = 0)

In [None]:
# Duplicated data
data_df.duplicated().sum()

In [None]:
# Also dropping Volume which as too many 0s
data_df = data_df.drop('Volume', axis = 1)

No Duplicate records!

### Visualizing Clean Data

In [None]:
# Plotting a box plot to visualize the distribution of the data in the columns
plt.figure(figsize=(10,6))
sns.boxplot(data=data_df)

plt.xticks(rotation=45, ha='right')
plt.title('Distribution of Data in Columns')
plt.show()