In [1]:
import numpy as np
import pandas as pd

np.random.seed(23)

mu_vec1 = np.array([0, 0, 0])
cov_mat1 = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
class1_sample = np.random.multivariate_normal(mu_vec1, cov_mat1, 20)

df = pd.DataFrame(class1_sample, columns=["feature1", "feature2", "feature3"])
df["target"] = 1

mu_vec2 = np.array([1, 1, 1])
cov_mat2 = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
class2_sample = np.random.multivariate_normal(mu_vec2, cov_mat2, 20)



In [2]:
df

Unnamed: 0,feature1,feature2,feature3,target
0,0.666988,0.025813,-0.777619,1
1,0.948634,0.701672,-1.051082,1
2,-0.367548,-1.13746,-1.322148,1
3,1.772258,-0.347459,0.67014,1
4,0.322272,0.060343,-1.04345,1
5,-1.009942,0.441736,1.128877,1
6,-1.838068,-0.938769,-0.201841,1
7,1.045371,0.538162,0.812119,1
8,0.241106,-0.95251,-0.136267,1
9,1.267248,0.173634,-1.223255,1


In [3]:
df.head()

Unnamed: 0,feature1,feature2,feature3,target
0,0.666988,0.025813,-0.777619,1
1,0.948634,0.701672,-1.051082,1
2,-0.367548,-1.13746,-1.322148,1
3,1.772258,-0.347459,0.67014,1
4,0.322272,0.060343,-1.04345,1


In [5]:
df.describe()

Unnamed: 0,feature1,feature2,feature3,target
count,20.0,20.0,20.0,20.0
mean,0.13618,-0.08728,0.005184,1.0
std,1.150653,0.69222,0.813433,0.0
min,-2.50623,-1.632386,-1.322148,1.0
25%,-0.380872,-0.548838,-0.703969,1.0
50%,0.215624,0.043078,-0.082679,1.0
75%,0.972818,0.44573,0.684824,1.0
max,1.968435,1.040886,1.192404,1.0


In [6]:
print(df.dtypes)

feature1    float64
feature2    float64
feature3    float64
target        int64
dtype: object


In [10]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

df.iloc[:, :3] = scaler.fit_transform(df.iloc[:, :3])

In [11]:
df.describe()

Unnamed: 0,feature1,feature2,feature3,target
count,20.0,20.0,20.0,20.0
mean,1.1102230000000002e-17,5.967449000000001e-17,0.0,1.0
std,1.025978,1.025978,1.025978,0.0
min,-2.356102,-2.290087,-1.674155,1.0
25%,-0.4610287,-0.6841003,-0.894449,1.0
50%,0.07083614,0.1932116,-0.110821,1.0
75%,0.7459875,0.7900046,0.857226,1.0
max,1.633728,1.672118,1.497434,1.0


In [13]:
covariance_matrix = np.cov([df.iloc[:, 0], df.iloc[:, 1], df.iloc[:, 2]])
covariance_matrix

array([[ 1.05263158,  0.20397591, -0.28888004],
       [ 0.20397591,  1.05263158,  0.10956124],
       [-0.28888004,  0.10956124,  1.05263158]])

In [15]:
eigen_values, eigen_vectors = np.linalg.eig(covariance_matrix)
eigen_values

array([0.64212617, 1.36120658, 1.15456198])

In [16]:
eigen_vectors

array([[-0.65172443,  0.74834128,  0.12345283],
       [ 0.48046517,  0.28140349,  0.8306415 ],
       [-0.58686326, -0.60066414,  0.54294945]])

In [19]:
pc = eigen_vectors[:2]

In [20]:
transformed_df = np.dot(df.iloc[:, 0:3], pc.T)

In [21]:
new_df = pd.DataFrame(transformed_df, columns=["PC1", "PC2"])
new_df["target"] = df["target"].values
new_df.head()

Unnamed: 0,PC1,PC2,target
0,-0.304909,-0.545559,1
1,0.238477,-0.429512,1
2,-1.078773,-2.044435,1
3,-1.135779,1.289053,1
4,-0.107685,-0.957342,1


In [24]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

lr1 = LinearRegression()
np.mean(cross_val_score(lr1, df.iloc[:,:-1], df.iloc[:,-1],scoring='accuracy', cv=10))

1.0

In [25]:
np.mean(
    cross_val_score(
        lr1, new_df.iloc[:, :-1], new_df.iloc[:, -1], scoring="accuracy", cv=10
    )
)

1.0