# PCA

## Function to perform PCA on a single data set


In [None]:
def perform_pca_on_wine_dataset(file_path, n_components):
    """
    Performs PCA on the Wine Quality Dataset.

    Each numeric column is standardized on its own (zero mean, unit sample
    variance) before the decomposition, so the analysis is carried out on the
    correlation structure of the features and no single large-scale feature
    dominates the components.

    Parameters:
    file_path (str): Path to the wine dataset file (semicolon-separated CSV).
    n_components (int): Number of principal components to retain.

    Returns:
    tuple: ``(transformed_data, explained_variance)`` where
        transformed_data (np.ndarray) has shape (n_samples, n_components) and
        holds the standardized data projected onto the leading components, and
        explained_variance (np.ndarray) has shape (n_components,) and holds the
        fraction of the total variance captured by each retained component.
    """

    # Load the dataset and keep only the numeric feature columns
    data = pd.read_csv(file_path, sep=';')
    X = data.select_dtypes(include=[np.number]).to_numpy(dtype=float)

    # Standardize column by column (axis=0). ddof=1 matches np.cov, so the
    # covariance of the standardized data is exactly the correlation matrix.
    column_std = X.std(axis=0, ddof=1)
    # A constant column has zero spread; dividing by 1 keeps it at 0 instead
    # of turning the whole column into NaN.
    column_std[column_std == 0] = 1.0
    normalized_data = (X - X.mean(axis=0)) / column_std

    # Covariance between features (columns are the variables)
    covariance_matrix = np.atleast_2d(np.cov(normalized_data, rowvar=False))

    # The covariance matrix is symmetric, so eigh is the appropriate solver:
    # it guarantees real eigenvalues and orthonormal eigenvectors.
    eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)

    # eigh returns ascending eigenvalues; reorder to descending
    idx = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[idx]
    eigenvectors = eigenvectors[:, idx]

    # Keep the first n_components principal directions (one per column)
    selected_components = eigenvectors[:, :n_components]

    # Share of the total variance explained by each retained component
    explained_variance = eigenvalues[:n_components] / eigenvalues.sum()

    # Project the standardized data onto the retained components
    transformed_data = np.dot(normalized_data, selected_components)

    return transformed_data, explained_variance



# Run the same PCA on the white wine dataset, keeping two components.
# 'winequality-white.csv' is expected to be the white wine dataset file.
white_wine_results = perform_pca_on_wine_dataset('winequality-white.csv', n_components=2)
# The function returns (projected data, explained-variance ratios)
transformed_white_wine, explained_variance_white_wine = white_wine_results


### Usage on red wine 

In [None]:
# Substitute the real location of the red wine dataset for 'path_to_red_wine.csv'
red_wine_results = perform_pca_on_wine_dataset('path_to_red_wine.csv', n_components=2)
# The function returns (projected data, explained-variance ratios)
transformed_red_wine, explained_variance_red_wine = red_wine_results

### Similarly, perform PCA on the white wine dataset

In [None]:
# Replace 'path_to_white_wine.csv' with the actual path to the white wine dataset
# (the stray extra opening parenthesis in the call was a SyntaxError)
white_wine_results = perform_pca_on_wine_dataset('path_to_white_wine.csv', 2)
# Element 0 is the projected data, element 1 the explained-variance ratios
transformed_white_wine = white_wine_results[0]
explained_variance_white_wine = white_wine_results[1]

### Red wine PCA 2 components plot

In [None]:
# Wide canvas with room for two side-by-side panels
plt.figure(figsize=(12, 6))

# Left panel: red wine samples on their first two principal components
plt.subplot(1, 2, 1)
plt.scatter(
    x=transformed_red_wine[:, 0],
    y=transformed_red_wine[:, 1],
    alpha=0.5,
    color='red',
)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('Red Wine - First Two Principal Components')

### White wine PCA 2 components plot

In [None]:
# Right panel: white wine samples on their first two principal components
plt.subplot(1, 2, 2)
plt.scatter(
    x=transformed_white_wine[:, 0],
    y=transformed_white_wine[:, 1],
    alpha=0.5,
    color='green',
)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('White Wine - First Two Principal Components')

# Render the figure
plt.show()

## Function to project white wine data into red wine PCA

In [None]:
def project_and_plot(red_wine_file, white_wine_file, n_components):
    """
    Fits PCA on the red wine dataset and plots both wines in that PCA space.

    The principal components are computed from the red wine data only. The
    white wine data is standardized with the red wine mean and standard
    deviation and projected onto the same components, so both clouds share
    one coordinate system.

    Parameters:
    red_wine_file (str): Path to the red wine dataset file (semicolon-separated CSV).
    white_wine_file (str): Path to the white wine dataset file (semicolon-separated CSV).
    n_components (int): Number of principal components to retain. The plot
        uses the first two, so this must be at least 2.

    Returns:
    None: This function plots the projected data.
    """
    # Load the datasets
    red_wine = pd.read_csv(red_wine_file, sep=';')
    white_wine = pd.read_csv(white_wine_file, sep=';')

    # Use only numeric features, and take the white wine columns in the red
    # wine order so both matrices line up with the eigenvector rows.
    red_features = red_wine.select_dtypes(include=[np.number])
    white_features = white_wine[red_features.columns]

    # Red wine statistics define the standardization for both datasets
    red_mean = red_features.mean()
    red_std = red_features.std()
    red_wine_normalized = ((red_features - red_mean) / red_std).to_numpy()
    white_wine_normalized = ((white_features - red_mean) / red_std).to_numpy()

    # Compute PCA on the red wine data. The covariance matrix is symmetric,
    # so eigh is used: it guarantees real eigenvalues (returned ascending).
    cov_matrix_red = np.cov(red_wine_normalized, rowvar=False)
    eigenvalues_red, eigenvectors_red = np.linalg.eigh(cov_matrix_red)
    leading = np.argsort(eigenvalues_red)[::-1][:n_components]
    eigenvectors_red = eigenvectors_red[:, leading]

    # Project both datasets onto the red wine PCA components
    transformed_red_wine = np.dot(red_wine_normalized, eigenvectors_red)
    transformed_white_wine = np.dot(white_wine_normalized, eigenvectors_red)

    # Plotting
    plt.figure(figsize=(10, 6))
    plt.scatter(transformed_red_wine[:, 0], transformed_red_wine[:, 1], color='red', alpha=0.5, label='Red Wine')
    plt.scatter(transformed_white_wine[:, 0], transformed_white_wine[:, 1], color='green', alpha=0.5, label='White Wine')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.title('Red and White Wine PCA Comparison')
    plt.legend()
    plt.show()

### Usage 

In [None]:
# Fit PCA on red wine and overlay the white wine projection (two components);
# substitute the real dataset locations for the placeholder paths.
project_and_plot(
    red_wine_file='path_to_red_wine.csv',
    white_wine_file='path_to_white_wine.csv',
    n_components=2,
)


## Function to merge and plot combined PCA

In [None]:
def pca_and_plot_combined(red_wine_file, white_wine_file, n_components):
    """
    Performs PCA on the combined red and white wine datasets and plots the results.

    Both datasets are stacked into one table, the numeric features are
    standardized column by column, and the principal components are computed
    from the combined data. The samples are then drawn on the first two
    components, coloured by wine type.

    Parameters:
    red_wine_file (str): Path to the red wine dataset file (semicolon-separated CSV).
    white_wine_file (str): Path to the white wine dataset file (semicolon-separated CSV).
    n_components (int): Number of principal components to retain. The plot
        uses the first two, so this must be at least 2.

    Returns:
    None: This function plots the PCA results.
    """

    # Load the datasets
    red_wine = pd.read_csv(red_wine_file, sep=';')
    white_wine = pd.read_csv(white_wine_file, sep=';')

    # Label each row with its wine type so the groups can be told apart
    # after the datasets are stacked
    red_wine['wine_type'] = 'red'
    white_wine['wine_type'] = 'white'

    # Combine the datasets; ignore_index avoids duplicated row labels
    combined_data = pd.concat([red_wine, white_wine], ignore_index=True)

    # Normalize the combined data.
    # select_dtypes drops the string wine_type label from the features.
    features = combined_data.select_dtypes(include=[np.number])
    normalized_features = ((features - features.mean()) / features.std()).to_numpy()

    # Compute PCA on the combined data. The covariance matrix is symmetric,
    # so eigh is used: it guarantees real eigenvalues (returned ascending).
    covariance_matrix = np.cov(normalized_features, rowvar=False)
    eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)
    leading = np.argsort(eigenvalues)[::-1][:n_components]
    eigenvectors = eigenvectors[:, leading]

    # Transform the data
    transformed_data = np.dot(normalized_features, eigenvectors)

    # Plotting the transformed data, one scatter per wine type.
    # The labels are converted to a plain NumPy array once so the boolean
    # masks index transformed_data positionally.
    wine_type = combined_data['wine_type'].to_numpy()
    plt.figure(figsize=(10, 6))
    for type_label, color, legend_label in (
        ('red', 'red', 'Red Wine'),
        ('white', 'green', 'White Wine'),
    ):
        mask = wine_type == type_label
        plt.scatter(transformed_data[mask, 0], transformed_data[mask, 1], color=color, alpha=0.5, label=legend_label)
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.title('PCA of Combined Red and White Wine Datasets')
    plt.legend()
    plt.show()


### Example usage

In [None]:
# Joint PCA over both wine types, plotted on the first two components;
# substitute the real dataset locations for the placeholder paths.
pca_and_plot_combined(
    red_wine_file='path_to_red_wine.csv',
    white_wine_file='path_to_white_wine.csv',
    n_components=2,
)

**Applying PCA Separately to Each Dataset**:

Strategy: Perform PCA independently on the red wine and white wine datasets. This means computing the principal components for each dataset separately.
Observations:

- This approach allows for the identification of features or patterns unique to each type of wine.
- The principal components for each dataset represent the variance and characteristics specific to that dataset.

Comparing the PCA results of the two datasets can highlight differences in the underlying factors that differentiate red and white wines.

**Combining Datasets and Applying PCA Jointly**:

Strategy: Merge the red and white wine datasets into a single dataset and then perform PCA on this combined dataset.
Observations:

- This approach provides a holistic view of the variance and features across both red and white wines.
- The principal components derived from the combined dataset capture the variance that is most significant across all wines, not just within a specific type.

By plotting the PCA results, one can observe how red and white wines cluster or spread in the same feature space, offering insights into similarities and differences between them.

Key Differences and Considerations:

**Uniqueness vs. Commonality**: Separate PCA allows for the identification of unique characteristics within each type of wine, while combined PCA emphasizes commonalities and overarching patterns across both types.

**Data Dominance**: In the combined approach, if one type of wine (red or white) has more samples or greater variance, it might dominate the PCA results, potentially skewing the interpretation.

**Interpretability**: Separate PCA might be easier to interpret, as the components are specific to one type of wine. In contrast, components from the combined PCA might require more careful interpretation, as they represent a mixture of features from both red and white wines.

**Use Case**: The choice of strategy depends on the analytical goal. If the goal is to compare and contrast red and white wines, the combined approach might be more suitable. If the goal is to understand each wine type in depth, separate PCA would be more appropriate.