# Singular Value Decomposition (SVD):
AX = B
- `A` is a coefficient matrix
- `X` is a variable matrix
- `B` is a constant matrix

`A = UΣV^T` 
- `U` is an m × m orthogonal matrix. 
- `V` is an n × n orthogonal matrix. 
- `Σ` is an m × n matrix whose ith diagonal entry equals the ith singular value σi for i = 1,...,r. All other entries of Σ are zero.

In [1]:
# Importing libraries for svd:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [3]:
# Load dataset:
titanic_df = sns.load_dataset('titanic')
# Drop columns:
titanic_df.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

In [5]:
titanic_df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [8]:
# Process the data 
# Selecting features for SVD
featuers = ['pclass', 'sex', 'age', 'fare', 'survived'] # features to be used for SVD

# Pipeline for numerical features
numeric_features = ['age', 'fare']                 # Selecting numerical features
numeric_transformer = Pipeline(steps=[             # Steps in the pipeline
    ('imputer', SimpleImputer(strategy='median')), # Replace missing values with median
    ('scaler', StandardScaler())])                 # Scale the data

# Pipeline for categorical features
categorical_features = ['pclass', 'sex']           # Selecting categorical features
categorical_transformer = Pipeline(steps=[         # Steps in the pipeline
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')) # Replace missing values with 'missing'
    ,('onehot', OneHotEncoder(handle_unknown='ignore'))])                 # One-hot encode the data

# Preprocessor 
preprocessor = ColumnTransformer(     # ColumnTransformer for numerical and categorical features 
    transformers=[                    # transformers for numerical and categorical features
        ('num', numeric_transformer, numeric_features),   # numerical features
        ('cat', categorical_transformer, categorical_features)])  # categorical features

# Applying Preprocessing to the dataset
X = preprocessor.fit_transform(titanic_df[featuers]) # Preprocessing the data
print(X)                                             # Printing the preprocessed data


[[-0.56573646 -0.50244517  0.         ...  1.          0.
   1.        ]
 [ 0.66386103  0.78684529  1.         ...  0.          1.
   0.        ]
 [-0.25833709 -0.48885426  0.         ...  1.          1.
   0.        ]
 ...
 [-0.1046374  -0.17626324  0.         ...  1.          1.
   0.        ]
 [-0.25833709 -0.04438104  1.         ...  0.          0.
   1.        ]
 [ 0.20276197 -0.49237783  0.         ...  1.          0.
   1.        ]]


- X is a numpy array

In [10]:
# Printing the original data
print(titanic_df[featuers].head()) 
print(titanic_df[featuers].shape) # Printing the shape of the original data


   pclass     sex   age     fare  survived
0       3    male  22.0   7.2500         0
1       1  female  38.0  71.2833         1
2       3  female  26.0   7.9250         1
3       1  female  35.0  53.1000         1
4       3    male  35.0   8.0500         0
(891, 5)


In [7]:
X.shape # Shape of the data

(891, 7)

- output shows that this is a `matrix` with `891 rows` and `7 columns`

In [17]:
# Perform SVD
svd = TruncatedSVD(n_components=5) # Creating an instance of TruncatedSVD
Y = svd.fit(X)                         # Fitting the data to the model

print("Singular values: ", svd.singular_values_) # Singular values of the data 
print("Percentage of variance explained: ", svd.explained_variance_ratio_) # Percentage of variance explained

Singular values:  [34.0919237  29.83381513 28.43241553 18.6007055  15.29302117]
Percentage of variance explained:  [0.37544109 0.16349524 0.21253133 0.12019999 0.08303783]


In [16]:
import matplotlib.pyplot as plt

# Assuming Y is the output of SVD
plt.figure(figsize=(10, 5)) # Setting the figure size
plt.scatter(Y[:, 0], Y[:, 1], alpha=0.5) # Plotting the data
plt.xlabel('SVD Component 1') # Setting the label for x-axis
plt.ylabel('SVD Component 2') # Setting the label for y-axis
plt.title('SVD on Titanic Dataset') # Setting the title of the plot
plt.grid(True) # Showing the grid
plt.show() # Showing the plot

TypeError: 'TruncatedSVD' object is not subscriptable

<Figure size 1000x500 with 0 Axes>

# Interpretation:
