In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn.decomposition import PCA
from sklearn.model_selection import (
    LearningCurveDisplay,
    ValidationCurveDisplay,
    cross_val_score,
    learning_curve,
    validation_curve,
)
from sklearn.neural_network import MLPRegressor

In [2]:
# Read the train and test tables from their parquet files (train first).
train, test = (
    pd.read_parquet(parquet_path)
    for parquet_path in ('data/train.parquet', 'data/test.parquet')
)

In [3]:
# Baseline regressor with default hyper-parameters. It is only referenced by
# the commented-out learning-curve / fit cells further down.
# Seeded like every other stochastic step in this notebook (random_state=11)
# so that repeated fits give the same result.
mlp = MLPRegressor(random_state=11)

In [4]:
# Split the training table: 'price' becomes a one-column frame (y_train) and
# every remaining column stays in X_train.
y_train = train['price'].to_frame()
X_train = train.drop('price', axis=1)

In [5]:
# Flatten the single-column target frame into a 1-D array.
y = y_train.to_numpy().reshape(-1)

In [None]:
# Preview the first rows of the training features instead of dumping the frame.
X_train.head()

In [None]:
# Preview the first rows of the test table instead of dumping the frame.
test.head()

In [None]:
# Preview the first few target values instead of echoing the whole array.
y[:5]

In [9]:
# train_sizes_abs, train_scores, test_scores, fit_times, score_times = learning_curve(estimator=mlp, X=X_train, y=y, n_jobs=-1, scoring='neg_mean_squared_error', random_state=11)
# display = LearningCurveDisplay(train_sizes=train_sizes_abs, train_scores=train_scores, test_scores=test_scores, score_name="Accuracy")
# display.plot()

In [10]:
# mlp.fit(X=X_train, y=y)

In [11]:
# Fit an 11-component, whitened PCA on the training features. The fitted
# explained-variance ratios are what the variance analysis and the scree plot
# in the following cells read. (PCA is imported in the top import cell.)
pca = PCA(n_components=11, random_state=11, whiten=True)
X_red = pca.fit_transform(X_train)

In [None]:
# Minimum number of leading components required to preserve a given fraction
# of the training-set variance.
preserve_var = .9
cumsum = np.cumsum(pca.explained_variance_ratio_)
reached = cumsum >= preserve_var
if reached.any():
    # argmax gives the 0-based position of the first True, hence the +1.
    d = np.argmax(reached) + 1
    print(f'{d} components required to preserve {preserve_var*100}% variance')
else:
    # On an all-False mask argmax returns 0, which would be reported as
    # "1 component"; say explicitly that the target is out of reach instead.
    d = None
    print(f'the {len(cumsum)} fitted components preserve only {cumsum[-1]:.2%} variance, '
          f'{preserve_var*100}% is not reached')
print(cumsum)

In [None]:
# Scree plot: the explained-variance ratio of each component as bars, with the
# running total drawn as a step line on the same axes.
variance_explained = pca.explained_variance_ratio_
cumulative_variance_explained = np.cumsum(variance_explained)
component_index = range(1, len(variance_explained) + 1)

fig, ax = plt.subplots()
bars = ax.bar(
    component_index,
    variance_explained,
    alpha=0.5,
    align='center',
    label='Individual explained variance',
)
ax.step(
    component_index,
    cumulative_variance_explained,
    where='mid',
    label='Cumulative explained variance',
)

# Write each bar's ratio on top of it. The first bar is skipped: its height
# equals the first cumulative value, which is annotated at the same spot below.
for rect in list(bars)[1:]:
    rect_height = rect.get_height()
    ax.text(
        rect.get_x() + rect.get_width() / 2.0,
        rect_height,
        f'{rect_height:.2%}',
        ha='center',
        va='bottom',
    )

# Annotate every point of the cumulative curve, nudged 3 points upwards.
for position, running_total in enumerate(cumulative_variance_explained, start=1):
    ax.annotate(
        f'{running_total:.2%}',
        xy=(position, running_total),
        xytext=(0, 3),
        textcoords="offset points",
        ha='center',
        va='bottom',
    )

ax.set_ylabel('Explained variance ratio')
ax.set_xlabel('Principal component index')
ax.set_title('Scree plot | red wine')
ax.set_xticks(np.arange(1, pca.n_components_ + 1))
ax.legend(loc='best')
fig.tight_layout()
plt.show()

In [14]:
# Reduce the training features to their first principal component (whitened).
pca = PCA(
    whiten=True,
    random_state=11,
    n_components=1,
)
X_pca_train = pca.fit_transform(X_train)

In [None]:
# Sanity check: (rows, components) of the projected training data.
np.shape(X_pca_train)

In [None]:
# Wrap the projected training array in a DataFrame (default integer column
# labels and a fresh 0..n-1 index) and preview the first rows rather than
# dumping the whole frame.
X_pca = pd.DataFrame(X_pca_train)
X_pca.head()

In [18]:
# Persist the projected training features.
# NOTE(review): the only column label is the integer 0; some parquet engines /
# pandas versions reject non-string column names — confirm this writes in the
# target environment.
X_pca.to_parquet(path='data/X_pca.parquet')

In [None]:
# Remove the 'id' column from the test table before it is passed to PCA, then
# preview the first rows rather than dumping the whole frame.
test_for_pca = test.drop(columns=['id'])
test_for_pca.head()

In [None]:
# Diagnostic only: repeat the explained-variance analysis with a PCA fitted on
# the test features ('id' removed). (PCA is imported in the top import cell.)
pca = PCA(n_components=11, random_state=11, whiten=True)
X_red = pca.fit_transform(test_for_pca)

# Minimum number of leading components required to preserve a given fraction
# of the test-set variance.
preserve_var = .9
cumsum = np.cumsum(pca.explained_variance_ratio_)
reached = cumsum >= preserve_var
if reached.any():
    # argmax gives the 0-based position of the first True, hence the +1.
    d = np.argmax(reached) + 1
    print(f'{d} components required to preserve {preserve_var*100}% variance')
else:
    # On an all-False mask argmax returns 0, which would be reported as
    # "1 component"; say explicitly that the target is out of reach instead.
    d = None
    print(f'the {len(cumsum)} fitted components preserve only {cumsum[-1]:.2%} variance, '
          f'{preserve_var*100}% is not reached')
print(cumsum)

In [31]:
# Project the test rows with a PCA fitted on the TRAINING features, so that
# train and test share the same component, centring and whitening scale.
# Fitting a second PCA on the test set would place the two projections on
# different axes and make them incomparable.
pca = PCA(n_components=1, random_state=11, whiten=True).fit(X_train)

# Pick the training feature columns by name so the column set and order match
# what the PCA was fitted on.
# NOTE(review): assumes every column of X_train also exists in test — confirm.
X_pca_test = pd.DataFrame(pca.transform(test[X_train.columns]))

In [32]:
# Attach the ids by position. X_pca_test was built from a bare array and so has
# a fresh 0..n-1 index; a label-aligned assignment of test['id'] would yield
# NaN wherever test's index differs from that. assign() returns a new frame
# instead of mutating the existing one in place.
X_pca_test = X_pca_test.assign(id=test['id'].to_numpy())

In [None]:
# Preview the first rows of the projected test data instead of dumping the frame.
X_pca_test.head()

In [None]:
# Parquet column names have to be strings, but this frame carries the integer
# label 0 next to 'id'. Stringify the labels on a renamed copy just for the
# write; X_pca_test itself is left unchanged.
X_pca_test.rename(columns=str).to_parquet('data/X_pca_test.parquet')

In [35]:
# Also export the projected test data as CSV (the index is written as well,
# since to_csv is called with its defaults).
X_pca_test.to_csv(path_or_buf='data/X_pca_test.csv')