In [None]:
# Colab-only upload path, left disabled. To upload the CSV from a local machine
# instead of reading it from disk, uncomment the two statements below together
# with the matching read_csv line in the loading cell.
# from google.colab import files

# Upload the file from your local machine
# uploaded = files.upload()

# Relative location of the flights CSV that the loading cell passes to read_csv.
filepath = "../data/flights.csv"

In [None]:
import pandas as pd

# Load the flights CSV from `filepath` into a DataFrame.
# Colab variant, kept for reference (needs the `uploaded` mapping from files.upload()):
# flights_data = pd.read_csv(next(iter(uploaded)), sep=",")
flights_data = pd.read_csv(filepath, sep=",")

# Preview the leading rows of the table.
print(flights_data.head())

# Report the table dimensions (rows first, then columns).
n_rows, n_columns = flights_data.shape
print('Flights dataset has', n_rows, 'rows and', n_columns, 'columns')

# Collect the column names and show them.
features_list = list(flights_data.columns)
print("List of features:")
print(features_list)

In [None]:
#Check the data for inconsistencies
import pandas as pd

# Count the missing (NaN/None) entries in every column of `flights_data`.
missing_values = flights_data.isna().sum()

# Narrow the counts down to the columns that actually have gaps.
columns_with_missing_values = missing_values.loc[missing_values > 0].index

# Report each affected column together with its number of gaps.
for column in columns_with_missing_values:
    n_missing = missing_values[column]
    print(f"Column '{column}' has {n_missing} missing values.")



In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Tally how often each distinct value of the 'origin' column occurs.
origin_frequency = flights_data['origin'].value_counts()

# Show the tally as text.
print("Frequency Table for 'origin' feature:")
print(origin_frequency)

# Draw the same tally as a bar chart. rot=0 keeps the x tick labels horizontal
# (pandas would otherwise rotate bar-chart labels).
fig, ax = plt.subplots(figsize=(8, 6))
origin_frequency.plot.bar(ax=ax, color='skyblue', edgecolor='black', rot=0)
ax.set_title('Bar Chart for Origin Feature')
ax.set_xlabel('Origin')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt

# PCA on the numeric flight columns, followed by a scree plot and a cumulative
# explained-variance plot. Expects `flights_data` from the loading cell.

# Numeric columns that feed the PCA.
pca_columns = ['dep_time', 'sched_dep_time', 'dep_delay', 'arr_time',
               'sched_arr_time', 'arr_delay', 'air_time', 'distance',
               'hour', 'minute']
features = flights_data[pca_columns]

# Replace NaNs with the mean of their column so PCA receives a complete matrix.
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(features)

# Standardize so every column contributes on the same scale.
features_standardized = StandardScaler().fit_transform(features_imputed)

# Fit a full PCA (all components kept) and project the rows.
pca = PCA()
principal_components = pca.fit_transform(features_standardized)

# Print explained variance ratio for each principal component
print("Explained Variance Ratio:")
print(pca.explained_variance_ratio_)

# Eigenvalues (component variances) and their share of the total variance.
eigenvalues = pca.explained_variance_
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance_ratio = np.cumsum(explained_variance_ratio)
component_numbers = range(1, len(eigenvalues) + 1)

# Component count marked on the scree plot. The line used to be drawn at 7 while
# its legend entry read 'k = 6'; position and label now share this one value.
# NOTE(review): 7 keeps the previously drawn position -- confirm it is the intended cut-off.
scree_cutoff_k = 7

fig, (scree_ax, cumulative_ax) = plt.subplots(1, 2, figsize=(15, 6))

# Scree Plot
scree_ax.plot(component_numbers, eigenvalues, marker='o', linestyle='-', color='b')
scree_ax.axvline(x=scree_cutoff_k, color='r', linestyle='--', label=f'k = {scree_cutoff_k}')
scree_ax.set_title('Scree Plot')
scree_ax.set_xlabel('Principal Component')
scree_ax.set_ylabel('Eigenvalue')
scree_ax.legend()
scree_ax.grid(True)

# Cumulative Explained Variance Ratio Plot
cumulative_ax.plot(component_numbers, cumulative_variance_ratio, marker='o', linestyle='-', color='b')
# argmax on the boolean mask gives the first position reaching 95%; +1 turns the
# zero-based position into a component count.
threshold_index = np.argmax(cumulative_variance_ratio >= 0.95) + 1
cumulative_ax.axvline(x=threshold_index, color='r', linestyle='--', label='95% Threshold')
cumulative_ax.set_title('Cumulative Explained Variance Ratio')
cumulative_ax.set_xlabel('Number of Principal Components')
cumulative_ax.set_ylabel('Cumulative Explained Variance Ratio')
cumulative_ax.legend()
cumulative_ax.grid(True)

plt.tight_layout()
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Fetch the Breast Cancer feature matrix.
breast_cancer = load_breast_cancer()
X = breast_cancer.data

# Standardize every feature before PCA.
X_standardized = StandardScaler().fit_transform(X)

# Fit a full PCA (all components kept) and project the samples.
pca = PCA()
principal_components = pca.fit_transform(X_standardized)

# Per-component variances and their share of the total variance.
eigenvalues = pca.explained_variance_
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_ratio = np.cumsum(explained_variance_ratio)

# One figure, two panels: scree plot on the left, cumulative ratio on the right.
fig, (scree_ax, cumulative_ax) = plt.subplots(1, 2, figsize=(15, 6))

# Left panel: eigenvalue per component, with a marker line at k = 7.
scree_ax.plot(range(1, len(eigenvalues) + 1), eigenvalues, marker='o', linestyle='-', color='b')
scree_ax.axvline(x=7, color='r', linestyle='--', label='k = 7')
scree_ax.set_title('Scree Plot')
scree_ax.set_xlabel('Principal Component')
scree_ax.set_ylabel('Eigenvalue')
scree_ax.legend()
scree_ax.grid(True)

# Right panel: running total of the variance ratio, with a marker line at the
# first component count whose running total reaches 0.95.
cumulative_ax.plot(range(1, len(explained_variance_ratio) + 1), cumulative_ratio, marker='o', linestyle='-', color='b')
threshold_index = np.argmax(cumulative_ratio >= 0.95) + 1
cumulative_ax.axvline(x=threshold_index, color='r', linestyle='--', label='95% Threshold')
cumulative_ax.set_title('Cumulative Explained Variance Ratio')
cumulative_ax.set_xlabel('Number of Principal Components')
cumulative_ax.set_ylabel('Cumulative Explained Variance Ratio')
cumulative_ax.legend()
cumulative_ax.grid(True)

plt.tight_layout()
plt.show()


In [None]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Derive the linear equation of the first principal component (PC1) of the
# standardized Breast Cancer data, evaluate it for the first sample, and report
# the first eigenvalue.

# Load the Breast Cancer dataset
breast_cancer = load_breast_cancer()
X = breast_cancer.data

# Standardize the features
X_standardized = StandardScaler().fit_transform(X)

# Perform PCA
pca = PCA()
principal_components = pca.fit_transform(X_standardized)

# Row 0 of components_ holds the PC1 coefficients (one per feature).
coef_pc1 = pca.components_[0]

# Display the rounded coefficients
print("Coefficients for PC1:")
print(np.round(coef_pc1, 2))

# Build "PC1 = c1 * X1 + c2 * X2 + ..." by joining one term per feature.
equation_pc1 = "PC1 = " + " + ".join(
    f"{np.round(coef, 2)} * X{i+1}" for i, coef in enumerate(coef_pc1)
)

# Display the equation for PC1
print("\nEquation for PC1:")
print(equation_pc1)

# Evaluate the equation for the first sample: dot product of its standardized
# feature values with the PC1 coefficients.
first_samples = X_standardized[0, :]
pc1_value = np.dot(first_samples, coef_pc1)

# Display the result
print("\nPC1 Value for the First Sample:")
print(np.round(pc1_value, 4))

# Explain how the first eigenvalue is obtained. The earlier wording described it
# as a diagonal covariance entry and as the squared norm of the eigenvector;
# neither is correct (the eigenvector has unit length), so the text is fixed here.
print("\nExplanation for Computing the First Eigenvalue:")
print("The first eigenvalue is the amount of variance captured by the first principal component (PC1).")
print("It is the largest eigenvalue of the covariance matrix of the standardized data.")
print("Mathematically, it equals the sample variance (n - 1 denominator) of the data projected onto the unit-length first eigenvector (PC1).")
print(f"First Eigenvalue: {np.round(pca.explained_variance_[0], 4)}")



In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler

# Reproduce PC1 by hand: eigendecompose the covariance matrix of the standardized
# Breast Cancer data, read off the first eigenvalue, and cross-check it against
# the variance of the PC1 scores. Everything is computed inside this cell.

# Load the Breast Cancer dataset
breast_cancer = load_breast_cancer()
X = breast_cancer.data
y = breast_cancer.target

# Standardize the features
X_standardized = StandardScaler().fit_transform(X)

# Convert to DataFrame for better visualization (optional)
df = pd.DataFrame(data=np.c_[X_standardized, y], columns=list(breast_cancer.feature_names) + ['target'])

# Covariance between features (columns are variables, hence rowvar=False).
cov_matrix = np.cov(X_standardized, rowvar=False)

# Display the covariance matrix
print("Covariance Matrix:")
print(cov_matrix)

# The covariance matrix is symmetric, so use eigh: it returns real eigenvalues in
# ascending order. Reorder them descending so index 0 is the largest, i.e. PC1.
# (np.linalg.eig gives no ordering guarantee, so its column 0 need not be PC1.)
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)
descending_order = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[descending_order]
eigenvectors = eigenvectors[:, descending_order]

# Get the coefficients for the first principal component (PC1).
# The sign of an eigenvector is arbitrary, so it may be flipped relative to sklearn's PCA.
coef_pc1 = eigenvectors[:, 0]

# Display the rounded coefficients for PC1
print("\nCoefficients for PC1:")
print(np.round(coef_pc1, 6))

# Squared length of the PC1 eigenvector. Eigenvectors are returned normalized, so
# this is ~1 and is NOT the eigenvalue (dividing it by N used to give 1/N).
sum_of_squares = np.sum(coef_pc1**2)

# Number of samples
N = X_standardized.shape[0]

# The first eigenvalue comes straight from the decomposition.
eigenvalue = eigenvalues[0]

# Display the result
print("\nFirst Eigenvalue:")
print(np.round(eigenvalue, 6))

# Cross-check: project the standardized data onto PC1 and take the sample
# variance of the scores. The columns are already centred by StandardScaler, so
# the sum of squared scores over (N - 1) is that variance.
pc1_scores = X_standardized @ coef_pc1
ss_pc1 = np.sum(pc1_scores**2)

# Compute the eigenvalue for PC1
eigenvalue_pc1 = ss_pc1 / (N - 1)

# Display the result
print("Eigenvalue for PC1 (computed):", np.round(eigenvalue_pc1, 4))



In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Fetch the Breast Cancer feature matrix together with its column names.
breast_cancer = load_breast_cancer()
X, feature_names = breast_cancer.data, breast_cancer.feature_names

# Wrap the raw features in a DataFrame so the column labels can be reused below.
features = pd.DataFrame(X, columns=feature_names)

# Standardize, then fit a full PCA (all components kept).
X_standardized = StandardScaler().fit_transform(X)
pca = PCA()
principal_components = pca.fit_transform(X_standardized)

# Tabulate the component coefficients: one row per principal component,
# one column per original feature.
pc_row_labels = [f'PC{k}' for k in range(1, pca.n_components_ + 1)]
df_comp = pd.DataFrame(pca.components_, index=pc_row_labels, columns=features.columns)

# Show the coefficient table.
print("DataFrame with Principal Component Coefficients:")
print(df_comp)


In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Fetch the Breast Cancer features and class labels.
breast_cancer = load_breast_cancer()
X = breast_cancer.data
y = breast_cancer.target

# Zero-based column positions of the two features to visualise.
feature1_index = 0
feature2_index = 7

# Keep only those two columns.
X_selected = X[:, [feature1_index, feature2_index]]

# Standardize the pair, then rotate it onto its principal axes.
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X_selected)
pca = PCA()
X_pca = pca.fit_transform(X_standardized)

# Two panels side by side: standardized inputs on the left, PCA scores on the right.
fig, axs = plt.subplots(1, 2, figsize=(15, 6))

# Each entry: (axes, points to draw, title, x label, y label).
panels = [
    (axs[0], X_standardized, 'Original Data Points',
     f'Feature {feature1_index + 1}', f'Feature {feature2_index + 1}'),
    (axs[1], X_pca, 'Transformed Data Points after PCA',
     'Principal Component 1', 'Principal Component 2'),
]

for ax, points, panel_title, x_label, y_label in panels:
    # One scatter series per class so each class gets its own legend entry.
    for label in np.unique(y):
        in_class = y == label
        ax.scatter(points[in_class, 0], points[in_class, 1], label=f'Class {label}', edgecolor='k', alpha=0.8)
    ax.set_title(panel_title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.legend()
    ax.grid(True)

plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Relies on X_standardized, X_pca, y, feature1_index and feature2_index left in
# the kernel by the preceding two-feature PCA cell.

# Wrap both arrays in DataFrames so seaborn can address the columns by name.
standardized_columns = [f'Feature_{i+1}' for i in range(X_standardized.shape[1])]
df_standardized = pd.DataFrame(X_standardized, columns=standardized_columns)
df_pca = pd.DataFrame(X_pca, columns=['Principal_Component_1', 'Principal_Component_2'])

# Two panels side by side: standardized inputs on the left, PCA scores on the right.
fig, axs = plt.subplots(1, 2, figsize=(15, 6))

# Each entry: (axes, frame for the density, array for the scatter,
#              x column, y column, title, x label, y label).
panels = [
    (axs[0], df_standardized, X_standardized, 'Feature_1', 'Feature_2',
     'Original Data Points with Density Plot',
     f'Feature {feature1_index + 1}', f'Feature {feature2_index + 1}'),
    (axs[1], df_pca, X_pca, 'Principal_Component_1', 'Principal_Component_2',
     'Transformed Data Points after PCA with Density Plot',
     'Principal Component 1', 'Principal Component 2'),
]

for ax, frame, points, x_column, y_column, panel_title, x_label, y_label in panels:
    # Per class: filled density contours first, then the raw points on top.
    for label in np.unique(y):
        in_class = y == label
        sns.kdeplot(data=frame[in_class], x=x_column, y=y_column, label=f'Class {label}', ax=ax, fill=True, alpha=0.5)
        ax.scatter(points[in_class, 0], points[in_class, 1], label=f'Class {label}', edgecolor='k', alpha=0.8)
    # Set the labels after plotting so they replace the column names seaborn writes.
    ax.set_title(panel_title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.legend()
    ax.grid(True)

plt.show()


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# 2x2 figure for two Breast Cancer features: scatter plots before/after PCA on
# the top row, per-class density plots of the raw and standardized values on the
# bottom row. All four panels describe the same feature pair.

# Load the Breast Cancer dataset
breast_cancer = load_breast_cancer()
X = breast_cancer.data
y = breast_cancer.target

# Zero-based column positions of the two features to visualise.
feature1_index = 0
feature2_index = 7

# Extract the selected features
X_selected = X[:, [feature1_index, feature2_index]]

# Standardize the features (important for PCA)
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X_selected)

# Apply PCA
pca = PCA()
X_pca = pca.fit_transform(X_standardized)

# Convert NumPy arrays to DataFrames (columns named by position: Feature_1, Feature_2)
df_original = pd.DataFrame(X_selected, columns=[f'Feature_{i+1}' for i in range(X_selected.shape[1])])
df_standardized = pd.DataFrame(X_standardized, columns=[f'Feature_{i+1}' for i in range(X_standardized.shape[1])])
df_pca = pd.DataFrame(X_pca, columns=['Principal_Component_1', 'Principal_Component_2'])

# Plot the original and transformed data points side by side with density plots
fig, axs = plt.subplots(2, 2, figsize=(12, 9))

# Top-left: standardized feature pair, one scatter series per class.
for label in np.unique(y):
    axs[0, 0].scatter(X_standardized[y == label, 0], X_standardized[y == label, 1], label=f'Class {label}', edgecolor='k', alpha=0.8)
axs[0, 0].set_title('Original Data Points')
axs[0, 0].set_xlabel(f'Feature {feature1_index + 1}')
axs[0, 0].set_ylabel(f'Feature {feature2_index + 1}')
axs[0, 0].legend()
axs[0, 0].grid(True)

# Top-right: the same samples expressed in principal-component coordinates.
for label in np.unique(y):
    axs[0, 1].scatter(X_pca[y == label, 0], X_pca[y == label, 1], label=f'Class {label}', edgecolor='k', alpha=0.8)
axs[0, 1].set_title('Transformed Data Points after PCA')
axs[0, 1].set_xlabel('Principal Component 1')
axs[0, 1].set_ylabel('Principal Component 2')
axs[0, 1].legend()
axs[0, 1].grid(True)

# Bottom-left: per-class density of the raw (unscaled) feature pair.
for label in np.unique(y):
    sns.kdeplot(data=df_original[y == label], x='Feature_1', y='Feature_2', ax=axs[1, 0], fill=True, alpha=0.5, label=f'Class {label}')
axs[1, 0].set_title(f'Density Plot of Original Data (Features {feature1_index + 1} and {feature2_index + 1})')
axs[1, 0].set_xlabel(f'Feature {feature1_index + 1}')
axs[1, 0].set_ylabel(f'Feature {feature2_index + 1}')
axs[1, 0].grid(True)

# Features shown in the comparison panel. These were hard-coded to [0, 1], so the
# panel plotted feature 2 while its y-axis was labelled with feature2_index + 1.
# Deriving them from the selected indices keeps the data and the labels in step.
feature_indices = [feature1_index, feature2_index]
comparison_columns = [f'Feature_{i+1}' for i in feature_indices]

# Original Data
df_original = pd.DataFrame(X[:, feature_indices], columns=comparison_columns)

# Standardize the data for comparison
scaler_comparison = StandardScaler()
X_standardized_comparison = scaler_comparison.fit_transform(X[:, feature_indices])
df_standardized_comparison = pd.DataFrame(X_standardized_comparison, columns=comparison_columns)

# Bottom-right: per-class density of the standardized feature pair.
for label in np.unique(y):
    sns.kdeplot(data=df_standardized_comparison[y == label], x=comparison_columns[0], y=comparison_columns[1], ax=axs[1, 1], fill=True, alpha=0.5, label=f'Class {label}')
axs[1, 1].set_title('Density Plot of Standardized Data for Comparison')
axs[1, 1].set_xlabel(f'Feature {feature1_index + 1}')
axs[1, 1].set_ylabel(f'Feature {feature2_index + 1}')
axs[1, 1].grid(True)

# Add legends separately
axs[1, 0].legend()
axs[1, 1].legend()

# Adjust layout
plt.tight_layout()
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Fetch the Breast Cancer feature matrix and its column names.
breast_cancer = load_breast_cancer()
X = breast_cancer.data
feature_names = breast_cancer.feature_names

# Standardize every feature before PCA.
X_standardized = StandardScaler().fit_transform(X)

# Fit a PCA truncated to the first seven components.
num_components = 7
pca = PCA(n_components=num_components)
principal_components = pca.fit_transform(X_standardized)

# Coefficient table: one row per feature, one column per principal component.
pc_names = [f'PC{k + 1}' for k in range(num_components)]
coefficients_by_feature = pca.components_[:num_components].T
df_coef_pcs = pd.DataFrame(coefficients_by_feature, columns=pc_names, index=feature_names)

# One horizontal bar chart per component, laid out on a 2x4 grid
# (seven slots used, the eighth is never created).
fig = plt.figure(figsize=(15, 10))
for i, pc_name in enumerate(pc_names):
    ax = fig.add_subplot(2, 4, i + 1)
    sns.barplot(x=df_coef_pcs[pc_name], y=df_coef_pcs.index, hue=df_coef_pcs.index, palette='viridis', legend=False, ax=ax)
    ax.set_title(f'Principal Component {i+1}')

plt.tight_layout()
plt.show()


In [None]:
# Show the coefficient table built in the previous cell
# (rows: original features, columns: PC1..PC7).
print(df_coef_pcs)

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Compare the first seven principal components between the two tumor classes:
# one bar chart of per-class means, one of per-class standard deviations.

# Load the Breast Cancer dataset
breast_cancer = load_breast_cancer()
X = breast_cancer.data
y = breast_cancer.target
feature_names = breast_cancer.feature_names

# Standardize the features
X_standardized = StandardScaler().fit_transform(X)

# Perform PCA with 7 components
pca = PCA(n_components=7)
principal_components = pca.fit_transform(X_standardized)

# Combine the PC scores with the class label. np.c_ yields a float array, so the
# 'target' column holds 0.0 / 1.0.
df_pcs = pd.DataFrame(data=np.c_[principal_components, y], columns=[f'PC{i+1}' for i in range(7)] + ['target'])

# Legend text for each class code, in the order the bars should appear.
TARGET_LABELS = {0.0: 'Malignant (0.0)', 1.0: 'Benign (1.0)'}


def _plot_pc_group_stat(group_stat, title, ylabel):
    """Draw a grouped bar chart of one per-class statistic of the principal components.

    Parameters
    ----------
    group_stat : pandas.DataFrame
        Result of ``df_pcs.groupby('target').<statistic>()``: indexed by the
        numeric target, one column per principal component.
    title : str
        Figure title.
    ylabel : str
        Label for the y axis.
    """
    # Long format: one row per (target, component) with columns target / level_1 / value.
    long_frame = group_stat.stack().reset_index(name='value')
    # Put the class names into the hue column so seaborn builds the legend itself.
    # Overriding the text afterwards with plt.legend(labels=[...]) pairs labels
    # with artists purely by order, which can silently mismatch.
    long_frame['target'] = long_frame['target'].map(TARGET_LABELS)

    plt.figure(figsize=(12, 6))
    sns.barplot(
        data=long_frame,
        x='level_1',
        y='value',
        hue='target',
        hue_order=list(TARGET_LABELS.values()),
        palette='viridis'
    )
    plt.title(title)
    plt.xlabel('Principal Components')
    plt.ylabel(ylabel)
    plt.legend(title='Target')
    plt.show()


# Plot bar plot for mean values
_plot_pc_group_stat(
    df_pcs.groupby('target').mean(),
    'Mean Values of Principal Components for Malignant and Benign Tumors',
    'Mean Value',
)

# Plot bar plot for standard deviation values
_plot_pc_group_stat(
    df_pcs.groupby('target').std(),
    'Standard Deviation Values of Principal Components for Malignant and Benign Tumors',
    'Standard Deviation',
)


In [None]:
# Long-format table of the per-class standard deviation of each principal
# component (columns: target, level_1, value) -- the numbers behind the
# standard-deviation bar chart in the previous cell.
print(df_pcs.groupby('target').std().stack().reset_index(name='value'))