In [1]:
'''Random projection for dimensionality reduction:
Gaussian and sparse variants.'''
# Importing the needed libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import math
import numpy as np
pd.options.mode.chained_assignment = None
import warnings
warnings.filterwarnings('ignore')
import statsmodels.formula.api as smf
from sklearn.preprocessing import add_dummy_feature
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeRegressor
from scipy.spatial.transform import Rotation

In [5]:
from sklearn.random_projection import johnson_lindenstrauss_min_dim

# m   -> number of instances in the dataset
# eps -> distortion tolerance passed to the Johnson-Lindenstrauss bound
m = 6000
eps = 0.1

# Minimum target dimensionality for m instances at tolerance eps.
dims = johnson_lindenstrauss_min_dim(n_samples=m, eps=eps)
dims

7456

In [7]:
# Closed-form bound evaluated by johnson_lindenstrauss_min_dim:
#     4 * ln(m) / (eps^2 / 2 - eps^3 / 3), truncated to an integer.
jl_numerator = 4 * np.log(m)
jl_denominator = eps ** 2 / 2 - eps ** 3 / 3
dims_eq = int(jl_numerator / jl_denominator)
dims_eq

7456

In [35]:
n = 20000  # number of features in the original data

# Seeded generator so that re-running the cell yields the same matrices.
rng = np.random.default_rng(42)

# Gaussian random projection matrix of shape (dims, n). Scaling standard-normal
# entries by 1/sqrt(dims) gives each entry variance 1/dims.
M = (1 / np.sqrt(dims)) * rng.standard_normal((dims, n))

# Now generating a random dataset
x = rng.standard_normal((m, n))  # original data, shape (m, n)
x_reduced = x @ M.T              # projected data, shape (m, dims)

# Conclusion: the data still has 6000 instances, but each one is now
# described by 7456 dimensions instead of 20000.
# Kept as a comment (not a trailing string literal) so that the shape below is
# the cell's last expression and is what gets displayed.
x_reduced.shape

(6000, 7456)

In [36]:
# Shape of the original, unprojected data (compare with x_reduced.shape).
x.shape

(6000, 20000)

In [37]:
# Alternatively, let scikit-learn build and apply the Gaussian projection.
from sklearn.random_projection import GaussianRandomProjection

# With n_components left at its default ('auto'), the target dimension is
# derived from eps via the Johnson-Lindenstrauss bound. random_state pins the
# random matrix so the cell is reproducible on re-run.
gr_projection = GaussianRandomProjection(eps=eps, random_state=42)

# Same shape as x_reduced, but NOT the same values: gr_projection draws its own
# random matrix (gr_projection.components_) instead of reusing M.
x_reduced2 = gr_projection.fit_transform(x)

In [40]:
# Approximately recover x from its projection. The projection drops from n to
# fewer dimensions, so the reconstruction cannot be exact.
# components_ is not square, so it has no ordinary inverse; use the
# Moore-Penrose pseudo-inverse instead (np.linalg.pinv computes it via SVD).
comp_pinv = np.linalg.pinv(gr_projection.components_)

# The pseudo-inverse belongs to gr_projection, so it must be applied to the
# data that gr_projection produced (x_reduced2). x_reduced was projected with
# the unrelated matrix M, and inverting it with comp_pinv would be meaningless.
x_original = x_reduced2 @ comp_pinv.T

In [42]:
"""The main difference is that the random matrix the 
SparseRandomProjection generates is sparse.  it uses
much less memory and much faster"""
from sklearn.random_projection import SparseRandomProjection
sp_proj = SparseRandomProjection(eps=eps)
x_reduced3 = sp_proj.fit_transform(x)

In [None]:
"""Conclusion
Better to use SparseRandomProjection especially
for large or sparse data sets"""