In [24]:
import numpy as np
import pandas as pd
import plotly_express as px
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, mean_absolute_error, accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
bean_df = pd.read_excel("./Datasets/Dry_Bean_Dataset.xlsx")

In [3]:
bean_df.head()

Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRation,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4,Class
0,28395,610.291,208.178117,173.888747,1.197191,0.549812,28715,190.141097,0.763923,0.988856,0.958027,0.913358,0.007332,0.003147,0.834222,0.998724,SEKER
1,28734,638.018,200.524796,182.734419,1.097356,0.411785,29172,191.27275,0.783968,0.984986,0.887034,0.953861,0.006979,0.003564,0.909851,0.99843,SEKER
2,29380,624.11,212.82613,175.931143,1.209713,0.562727,29690,193.410904,0.778113,0.989559,0.947849,0.908774,0.007244,0.003048,0.825871,0.999066,SEKER
3,30008,645.884,210.557999,182.516516,1.153638,0.498616,30724,195.467062,0.782681,0.976696,0.903936,0.928329,0.007017,0.003215,0.861794,0.994199,SEKER
4,30140,620.134,201.847882,190.279279,1.060798,0.33368,30417,195.896503,0.773098,0.990893,0.984877,0.970516,0.006697,0.003665,0.9419,0.999166,SEKER


In [4]:
X_train, X_test, Y_train, Y_test = train_test_split(bean_df.iloc[:,:-1], bean_df.iloc[:,-1], shuffle = True, test_size = 0.2, random_state = 42)    

In [5]:
def preprocessor(X):
    centered_x = X-np.nanmean(X, axis=0)
    return centered_x/np.nanstd(X,axis=0)

In [6]:
X_train = preprocessor(X_train)
X_test = preprocessor(X_test)

In [8]:
X_train.head()

Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRation,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4
11073,-0.820047,-1.022026,-0.99169,-0.986587,-0.386834,-0.114146,-0.817952,-1.026081,-1.142598,-0.255293,0.485739,0.300833,1.344814,0.874096,0.265084,0.343163
13172,-0.50936,-0.464104,-0.569411,-0.499274,-0.303117,-0.024436,-0.504787,-0.552768,-0.881431,-0.707602,-0.572573,0.201238,0.485672,0.325515,0.16384,0.047028
11587,-0.754808,-0.950784,-1.049111,-0.675438,-0.990099,-0.933914,-0.755903,-0.921189,-0.322672,0.366882,0.7294,1.018143,0.765749,1.380299,1.016786,0.343229
12492,-0.630051,-0.791632,-0.781352,-0.614749,-0.543065,-0.29493,-0.63211,-0.729173,0.71115,0.450732,0.870363,0.478205,0.66137,0.7013,0.447282,0.419378
430,-0.583008,-0.675816,-0.797069,-0.448624,-0.812464,-0.655052,-0.58408,-0.659398,0.920037,0.245241,0.380612,0.814347,0.381755,0.902884,0.799199,0.989399


In [29]:
class PCA():
    def __init__(self):
        self.U = None
        self.L = None
    
    def fit(self, X):
        C = X.T @ X
        self.L, self.U = np.linalg.eig(C)

        idx = np.argsort(self.L)[::-1]
        self.L = np.array([self.L[i] for i in idx])
        self.U = np.array([self.U[i] for i in idx])
        return self.L, self.U

    def transform(self, X, d):
        return X @ self.U[:,:d]

In [10]:
X_train.shape

(10888, 16)

In [11]:
pca = PCA()
L, U = pca.fit(X_train)
X_pca = pca.transform(X_train,2)

(16, 16) [ 0.28270843  0.24526611 -0.06227101 -0.03012466 -0.08630606 -0.36924166
  0.12551945  0.07131181  0.03559302 -0.39749258 -0.15952794  0.05130074
  0.04624247  0.65395783  0.22557613  0.14991568]


In [12]:
px.scatter(
    x = np.arange(0,len(L),1), 
    y = np.log(L))

In [13]:
px.scatter(
    x = np.arange(0,len(L),1), 
    y = L)

In [14]:
px.scatter(
    x = X_pca[0],
    y = X_pca[1],
    color = Y_train
)

In [15]:
encoder = LabelEncoder()
Y_train_enc = encoder.fit_transform(Y_train)   
y_true = encoder.transform(Y_test) 

In [16]:
losses = []
maes = []
accs = []

for d in range(1,U.shape[1]):
    x_tr = pca.transform(X_train, d)
    x_te = pca.transform(X_test, d)
    
    clf = LogisticRegression(solver='lbfgs', max_iter=1000)
    clf.fit(x_tr, Y_train_enc)
    
    pred_prob = clf.predict_proba(x_te)
    pred_class = clf.predict(x_te)            
    
    losses.append(log_loss(y_true, pred_prob))
    maes.append(mean_absolute_error(y_true, pred_class))
    accs.append(accuracy_score(y_true, pred_class))
    
    print(d, maes[d-1], losses[d-1], accs[d-1])

1 1.0088138082996694 0.8464204752197049 0.622107969151671
2 0.3422695556371649 0.3453194803848668 0.8622842453176643
3 0.2625780389276533 0.2694472289529241 0.8968049944913699
4 0.24825560044069042 0.25471464650352327 0.9056188027910393
5 0.21410209327947116 0.22777125453464886 0.9203084832904884
6 0.20969518912963642 0.2188483927997295 0.9232464193903782
7 0.2016158648549394 0.21044681286232741 0.9269188395152406
8 0.20345207491737055 0.20987859363663502 0.9265515975027543
9 0.1990451707675358 0.20970576499454868 0.9269188395152406
10 0.20235034887991185 0.20961657725909139 0.9265515975027543
11 0.2078589790672053 0.20929675158626695 0.9250826294528094
12 0.20639001101726037 0.2090451466366229 0.9258171134777818
13 0.20455380095482922 0.2091286372192836 0.9261843554902681
14 0.20455380095482922 0.20906512206572556 0.9261843554902681
15 0.2027175908923981 0.2091112048623235 0.9265515975027543


In [17]:
px.scatter(
    x = np.arange(0, 15, 1),
    y = losses
)

In [18]:
px.scatter(
    x = np.arange(0, 15, 1),
    y = maes
)

In [19]:
px.scatter(
    x = np.arange(0, 15, 1),
    y = accs
)

In [25]:
def klein_bottle(n):
    """
    Parameters:
    n: int
        Number of points to generate"""
    
    data = np.zeros((n,3))
    u = np.random.uniform(low=0, high=np.pi, size=n)
    v = np.random.uniform(low=0, high=2*np.pi, size=n)
            
    data[:,0]=-2/15*np.cos(u)*(3*np.cos(v)+30*np.sin(u)+90*np.cos(u)**4*np.sin(u)-60*np.cos(u)**6*np.sin(u)+5*np.cos(u)*np.cos(v)*np.sin(u))
    data[:,1]=+1/15*np.sin(u)*(3*np.cos(v)+3*np.cos(u)**2*np.cos(v)-48*np.cos(u)**4*np.cos(v)+48*np.cos(u)**6*np.cos(v)-60*np.sin(u)+5*np.cos(u)*np.cos(v)*np.sin(u)-5*np.cos(u)**3*np.cos(v)*np.sin(u)-80*np.cos(u)**5*np.cos(v)*np.sin(u)+80*np.cos(u)**7*np.cos(v)*np.sin(u))
    data[:,2]=2/15*(3+5*np.cos(u)*np.sin(u))*np.sin(v)

    return data

In [26]:
X = klein_bottle(5000)

In [27]:
fig = px.scatter_3d(
    x=X[:,0],
    y=X[:,1],
    z=X[:,2],
    color=X[:,0],          # Assigns the color of the points based on this column
    opacity=0.7,                  # Adjusts the transparency of the points
    title='3D Scatter plot',
)
fig.update_traces(
    marker=dict(
        size=3,                 # Set the size of the markers (e.g., 3 is small)
    )
)

fig.update_layout( width = 800, height = 800)
fig.show()

In [39]:
klein_pca = PCA()
klein_pca.fit(X)
x_pca = klein_pca.transform(X,2)

In [40]:
fig = px.scatter(
    x=x_pca[:,0],
    y=x_pca[:,1],
    color=x_pca[:,0],          # Assigns the color of the points based on this column
    opacity=0.7,                  # Adjusts the transparency of the points
    title='2D Scatter plot',
)
fig.update_traces(
    marker=dict(
        size=3,                 # Set the size of the markers (e.g., 3 is small)
    )
)

fig.update_layout( width = 800, height = 800)
fig.show()

In [43]:
fig = px.scatter_3d(
    x=x_pca[:,0],
    y=x_pca[:,1],
    z=[0]*len(x_pca[:,0]),
    color=x_pca[:,0],          # Assigns the color of the points based on this column
    opacity=0.7,                  # Adjusts the transparency of the points
    title='3D Scatter plot',
)
fig.update_traces(
    marker=dict(
        size=3,                 # Set the size of the markers (e.g., 3 is small)
    )
)

fig.update_layout( width = 800, height = 800)
fig.show()