In [2]:
# Import the required libraries and dependencies
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [3]:
# Read the CSV file into a Pandas DataFrame
# Set the index using the Ticker column
df_stocks = pd.read_csv(
    Path("tsx-energy-2018.csv"),                
    index_col="Ticker"
)

# Review the DataFrame
df_stocks.head()

Unnamed: 0_level_0,CompanyName,MeanOpen,MeanHigh,MeanLow,MeanClose,MeanVolume,AnnualReturn,AnnualVariance,EnergyType
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ARX,ARC Resources Ltd.,13.14,13.34,12.91,13.1,1479913.38,-0.7275,0.359,Oil
CCO,Cameco Corporation,13.7,13.92,13.5,13.7,1203788.22,0.2014,0.3693,Other Energy
CNQ,Canadian Natural Resources Limited,41.97,42.46,41.46,41.91,3218248.68,-0.3461,0.2947,Oil
CVE,Cenovus Energy Inc.,11.96,12.18,11.75,11.95,4566143.56,-0.3219,0.45,Oil
CPG,Crescent Point Energy Corp.,8.53,8.67,8.36,8.5,3919414.03,-1.0103,0.4597,Other Energy


In [4]:
# Scale price data, return, and variance values
stock_data_scaled = StandardScaler().fit_transform(
    df_stocks[["MeanOpen", "MeanHigh", "MeanLow", "MeanClose", "MeanVolume", "AnnualReturn", "AnnualVariance"]]
)
stock_data_scaled 

array([[-0.91683187, -0.91721692, -0.91804499, -0.9181346 , -0.15278563,
        -1.33244548,  0.46085356],
       [-0.88015205, -0.87947182, -0.87906242, -0.87878597, -0.37911694,
         1.69574215,  0.55941139],
       [ 0.97152411,  0.97784771,  0.96831488,  0.97125524,  1.27207441,
        -0.08909231, -0.15441525],
       [-0.9941215 , -0.99270713, -0.99468868, -0.9935528 ,  2.37690243,
        -0.01020099,  1.33160722],
       [-1.21878543, -1.2211301 , -1.21867327, -1.2198074 ,  1.8467981 ,
        -2.25436545,  1.42442382],
       [ 1.06715365,  1.06440114,  1.06345878,  1.06372451,  1.72022576,
         0.38947152, -0.73619352],
       [-0.81727235, -0.8156956 , -0.81761532, -0.81713979, -0.45322872,
         0.20658712,  0.53357584],
       [-0.56378856, -0.56644775, -0.55993394, -0.56137373, -1.07340114,
         0.95018664, -0.55247407],
       [-0.54020867, -0.53976586, -0.53945157, -0.53973198, -0.28193212,
         0.0941181 ,  0.00633928],
       [ 0.8267698 ,  0.8294

In [6]:
# Create a DataFrame with the scaled data
df_stocks_scaled = pd.DataFrame(
    stock_data_scaled,
    columns=["MeanOpen", "MeanHigh", "MeanLow", "MeanClose", "MeanVolume", "AnnualReturn", "AnnualVariance"]
)
df_stocks_scaled.head()

Unnamed: 0,MeanOpen,MeanHigh,MeanLow,MeanClose,MeanVolume,AnnualReturn,AnnualVariance
0,-0.916832,-0.917217,-0.918045,-0.918135,-0.152786,-1.332445,0.460854
1,-0.880152,-0.879472,-0.879062,-0.878786,-0.379117,1.695742,0.559411
2,0.971524,0.977848,0.968315,0.971255,1.272074,-0.089092,-0.154415
3,-0.994122,-0.992707,-0.994689,-0.993553,2.376902,-0.010201,1.331607
4,-1.218785,-1.22113,-1.218673,-1.219807,1.846798,-2.254365,1.424424


In [10]:
# Copy the tickers names from the original data
df_stocks_scaled["Ticker"] = df_stocks.index
df_stocks_scaled.head()

Unnamed: 0_level_0,MeanOpen,MeanHigh,MeanLow,MeanClose,MeanVolume,AnnualReturn,AnnualVariance,Ticker
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ARX,-0.916832,-0.917217,-0.918045,-0.918135,-0.152786,-1.332445,0.460854,ARX
CCO,-0.880152,-0.879472,-0.879062,-0.878786,-0.379117,1.695742,0.559411,CCO
CNQ,0.971524,0.977848,0.968315,0.971255,1.272074,-0.089092,-0.154415,CNQ
CVE,-0.994122,-0.992707,-0.994689,-0.993553,2.376902,-0.010201,1.331607,CVE
CPG,-1.218785,-1.22113,-1.218673,-1.219807,1.846798,-2.254365,1.424424,CPG


In [11]:
df_stocks_scaled.set_index("Ticker", inplace=True)
df_stocks_scaled.head()

Unnamed: 0_level_0,MeanOpen,MeanHigh,MeanLow,MeanClose,MeanVolume,AnnualReturn,AnnualVariance
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ARX,-0.916832,-0.917217,-0.918045,-0.918135,-0.152786,-1.332445,0.460854
CCO,-0.880152,-0.879472,-0.879062,-0.878786,-0.379117,1.695742,0.559411
CNQ,0.971524,0.977848,0.968315,0.971255,1.272074,-0.089092,-0.154415
CVE,-0.994122,-0.992707,-0.994689,-0.993553,2.376902,-0.010201,1.331607
CPG,-1.218785,-1.22113,-1.218673,-1.219807,1.846798,-2.254365,1.424424


In [12]:
# Encode (convert to dummy variables) the `EnergyType` column, which categorizes oil versus non-oil firms
oil_dummies = pd.get_dummies(df_stocks["EnergyType"])
oil_dummies.head()

Unnamed: 0_level_0,Oil,Other Energy
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1
ARX,1,0
CCO,0,1
CNQ,1,0
CVE,1,0
CPG,0,1


In [13]:
# Concatenate the `EnergyType` encoded dummies with the scaled data DataFrame
df_stocks_scaled = pd.concat([df_stocks_scaled, oil_dummies], axis=1)
df_stocks_scaled.head()

Unnamed: 0_level_0,MeanOpen,MeanHigh,MeanLow,MeanClose,MeanVolume,AnnualReturn,AnnualVariance,Oil,Other Energy
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ARX,-0.916832,-0.917217,-0.918045,-0.918135,-0.152786,-1.332445,0.460854,1,0
CCO,-0.880152,-0.879472,-0.879062,-0.878786,-0.379117,1.695742,0.559411,0,1
CNQ,0.971524,0.977848,0.968315,0.971255,1.272074,-0.089092,-0.154415,1,0
CVE,-0.994122,-0.992707,-0.994689,-0.993553,2.376902,-0.010201,1.331607,1,0
CPG,-1.218785,-1.22113,-1.218673,-1.219807,1.846798,-2.254365,1.424424,0,1


In [14]:
# Initialize the K-Means model with n_clusters=3
model = KMeans(n_clusters=3)

# Fit the model for the df_stocks_scaled DataFrame
model.fit(df_stocks_scaled)

KMeans(n_clusters=3)

In [15]:
# Predict the model segments (clusters)
stock_clusters = model.predict(df_stocks_scaled)
stock_clusters

array([2, 0, 1, 2, 2, 1, 0, 0, 0, 1, 0, 1, 2, 0, 0, 1, 1, 0, 2, 1, 1, 1,
       1, 2], dtype=int32)

In [17]:
# Create a new column in the DataFrame with the predicted clusters
df_stocks_scaled["StockCluster"] = stock_clusters
df_stocks_scaled.head()

Unnamed: 0_level_0,MeanOpen,MeanHigh,MeanLow,MeanClose,MeanVolume,AnnualReturn,AnnualVariance,Oil,Other Energy,StockCluster
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ARX,-0.916832,-0.917217,-0.918045,-0.918135,-0.152786,-1.332445,0.460854,1,0,2
CCO,-0.880152,-0.879472,-0.879062,-0.878786,-0.379117,1.695742,0.559411,0,1,0
CNQ,0.971524,0.977848,0.968315,0.971255,1.272074,-0.089092,-0.154415,1,0,1
CVE,-0.994122,-0.992707,-0.994689,-0.993553,2.376902,-0.010201,1.331607,1,0,2
CPG,-1.218785,-1.22113,-1.218673,-1.219807,1.846798,-2.254365,1.424424,0,1,2


In [18]:
# Create a scatter plot with x="AnnualVariance:,  y="AnnualReturn"
df_stocks_scaled.hvplot.scatter(
    x="AnnualVariance",
    y="AnnualReturn",
    by="StockCluster",
    hover_cols = ["Ticker"], 
    title = "Scatter Plot by Stock Segment - k=3"
)

# Step 5: To get another perspective on the clusters, reduce the number of features to two principal components by using PCA. 

In [19]:
# Create the PCA model instance where n_components=2
pca = PCA(n_components=2)

In [20]:
# Fit the df_stocks_scaled data to the PCA
stocks_pca_data = pca.fit_transform(df_stocks_scaled)

# Review the first five rose of the PCA data
# using bracket notation ([0:5])
stocks_pca_data[:5]

array([[-2.08695014,  0.89731851],
       [-1.54939224, -1.80647927],
       [ 1.8341054 ,  1.4113083 ],
       [-2.38589876,  2.17692877],
       [-3.16300968,  2.39955061]])

In [21]:
# Calculate the explained variance
pca.explained_variance_ratio_

array([0.5822138 , 0.19574623])

In [22]:
# Creating a DataFrame with the PCA data
df_stocks_pca = pd.DataFrame(stocks_pca_data, columns=["PC1", "PC2"])

# Copy the tickers names from the original data
df_stocks_pca["Ticker"] = df_stocks.index

# Set the Ticker column as index
df_stocks_pca = df_stocks_pca.set_index("Ticker")

# Review the DataFrame
df_stocks_pca.head()

Unnamed: 0_level_0,PC1,PC2
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1
ARX,-2.08695,0.897319
CCO,-1.549392,-1.806479
CNQ,1.834105,1.411308
CVE,-2.385899,2.176929
CPG,-3.16301,2.399551


In [23]:
# Initialize the K-Means model with n_clusters=3
model = KMeans(n_clusters=3)

# Fit the model for the df_stocks_pca DataFrame
model.fit(df_stocks_pca)

# Predict the model segments (clusters)
stock_clusters = model.predict(df_stocks_pca)
stock_clusters

array([0, 2, 1, 0, 0, 1, 2, 2, 2, 1, 2, 2, 0, 2, 2, 2, 1, 2, 0, 1, 1, 1,
       1, 0], dtype=int32)

In [24]:
# Create a copy of the df_stocks_pca DataFrame and name it as df_stocks_pca_predictions
df_stocks_pca_predictions = df_stocks_pca.copy()

# Create a new column in the DataFrame with the predicted clusters
df_stocks_pca_predictions["StockCluster"] = stock_clusters

# Review the DataFrame
df_stocks_pca_predictions.head()

Unnamed: 0_level_0,PC1,PC2,StockCluster
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ARX,-2.08695,0.897319,0
CCO,-1.549392,-1.806479,2
CNQ,1.834105,1.411308,1
CVE,-2.385899,2.176929,0
CPG,-3.16301,2.399551,0


In [25]:
# Create the scatter plot with x="PC1" and y="PC2"
df_stocks_pca_predictions.hvplot.scatter(
    x="PC1",
    y="PC2",
    by="StockCluster",
    hover_cols = "Ticker",
    title = "Scatter Plot by Stock Segment - PCA=2"
)