[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/niteshjindal170988/unsupervised-learning/blob/main/dimensionality-reduction/principal_component_analysis_digit_recognizer.ipynb)

# Imports 

In [None]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt

In this lesson, we will look through the [Higgs Boson Dataset](https://www.kaggle.com/competitions/higgs-boson/data) and visualize its features to understand how the data is distributed.

# Download the Higgs Boson Dataset 

In [None]:
!pip install --trusted-host pypi.org --trusted-host pypi.python.org --trusted-host files.pythonhosted.org gdown==4.2.0
import gdown
warnings.filterwarnings("ignore")
# Load Higgs Boson Data 
url = 'https://drive.google.com/uc?id=1Xu5mHENHoiyUbVG_9Z1bza-N0JfutjaP'
output = 'training.csv'
gdown.download(url, output, quiet=False, verify=False)

# Read the training.csv data
<br>
There are 250000 observations and 33 features in the data. There are two unique labels, viz. "s" and "b".

In [None]:
# Load the downloaded CSV into a DataFrame.
higgsboson = pd.read_csv("training.csv")

# Show, in order: the (rows, columns) shape, the first rows, and the distinct
# values found in the "Label" column.
display(
    higgsboson.shape,
    higgsboson.head(),
    higgsboson["Label"].unique(),
)

# Histogram Plot of Higgs Boson Data Features excluding the Class Labels
<br>
We do not need the "EventId" and "Label" columns here, so we exclude them and plot all of the remaining features in the dataset.

In [None]:
# Columns to plot: everything except the event identifier and the class label.
tot_cols_to_plot = [
    col_name
    for col_name in higgsboson.columns.tolist()
    if col_name not in ("EventId", "Label")
]

# One figure per feature. Each histogram is drawn from the column that is
# named in the printed caption, so caption and plot always agree.
for col_name in tot_cols_to_plot:
    print("Histogram plot of " + col_name)
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.hist(higgsboson[col_name], bins=10, color="orange")
    ax.set_xlabel(col_name)
    ax.set_ylabel("Frequency")
    plt.show()
    # Release the figure explicitly; dozens of figures are created in this loop.
    plt.close(fig)

# Histogram Plot of  Higgs Boson Data Features considering the Class Labels
<br>
Histogram Plots of each dimension in the Higgs Boson data corresponding to each class label, viz. "s" and "b".

##  Two Plots For Each Feature (Each Plot with One Class Label)

In [None]:
# Class labels are looked up in this cell so that it runs on a fresh kernel
# without relying on a variable assigned further down the notebook.
unq_labels = higgsboson["Label"].unique()

# The per-label row subsets do not depend on the feature being plotted, so
# they are computed once up front instead of once per feature.
rows_by_label = {
    label: higgsboson[higgsboson["Label"] == label] for label in unq_labels
}

# One figure per feature, with one side-by-side panel per class label.
for col_name in tot_cols_to_plot:
    print("Histogram plot of " + col_name)
    # squeeze=False keeps `axes` two-dimensional even with a single label.
    fig, axes = plt.subplots(
        1, len(unq_labels), figsize=(15, 5), squeeze=False
    )
    for ax, label in zip(axes[0], unq_labels):
        subset_higgs_boson_data = rows_by_label[label]
        ax.hist(subset_higgs_boson_data[col_name], bins=10, color="blue")
        ax.set_title("Label - " + str(label))
        ax.set_xlabel(col_name)
        ax.set_ylabel("Frequency")
    plt.show()
    # Release the figure explicitly; many figures are created in this loop.
    plt.close(fig)

## One Plot For Each Feature (Each Plot with Both Class Labels)

In [None]:
unq_labels = higgsboson["Label"].unique()

# Group the rows by class label once; the grouping is the same for every
# feature, so it does not need to be rebuilt inside the loop.
higgs_boson_grpby_lbls = higgsboson.groupby("Label")

# One figure per feature, with the histograms of all class labels overlaid.
for col_name in tot_cols_to_plot:
    print(
        "Histogram plots of features in subset of Higgs Boson Data with labels "
        + " and ".join(str(label) for label in unq_labels)
    )
    print("Histogram plot of " + col_name)

    # Bin edges are computed from the full column so that every class is
    # counted over identical intervals and the bars can be compared directly.
    bin_edges = np.histogram_bin_edges(higgsboson[col_name], bins=10)

    fig, ax = plt.subplots(figsize=(15, 5))
    for key, val in higgs_boson_grpby_lbls:
        # Semi-transparent bars keep both classes visible where they overlap.
        ax.hist(val[col_name], bins=bin_edges, alpha=0.5, label=key)
    ax.set_xlabel(col_name)
    ax.set_ylabel("Frequency")
    ax.legend()
    plt.show()
    # Release the figure explicitly; many figures are created in this loop.
    plt.close(fig)
