In [4]:
import pandas as pd
import numpy as np

In [2]:
# Load the California Housing dataset
from sklearn.datasets import fetch_california_housing
california_housing = fetch_california_housing()
data = pd.DataFrame(california_housing.data, columns=california_housing.feature_names)
data['PRICE'] = california_housing.target

In [21]:
data.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,PRICE
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [None]:
#histogram for every feature of the dataframe 
for x in data.columns:
  data.hist(column = x, bins = 100)

In [None]:
from matplotlib import pyplot as plt

for x in data.columns:
  data.boxplot(column = x)
  plt.show()

In [None]:
import seaborn as sns
corr = data.corr()
sns.heatmap(corr, 
        xticklabels=corr.columns,
        yticklabels=corr.columns)

In [75]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

X = data.drop('PRICE', axis=1)
y = data['PRICE']
X_train, X_test, y_train, y_test = train_test_split(
X,y , random_state=51,test_size=0.3, shuffle=True)

In [61]:
#no effect 
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [49]:
#no effect
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [5]:
from sklearn.metrics import mean_squared_error, r2_score , mean_absolute_error
from sklearn.linear_model import RidgeCV

# Create and train the Ridge regression model using cross validation for alphaParameters
ridge_model = RidgeCV(alphas=[0.001, 0.01, 0.1, 1, 10], cv=5)
ridge_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = ridge_model.predict(X_test)

# Evaluate the model's performance using mse-mae-
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

r2 = r2_score(y_test, y_pred)
print(f'R-squared: {r2}')

mae = mean_absolute_error(y_test, y_pred)
print("mae",mae)

mpe = np.mean((y_test - y_pred) / y_test) * 100
print(f'Mean Percentage Error: {mpe}%')

Mean Squared Error: 0.511754591302096
R-squared: 0.6265779297957941
mae 0.5290997101264686
Mean Percentage Error: -12.375047553424217%


In [6]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression

#logistic regression

model_linear = LinearRegression()
model_linear.fit(X_train, y_train)

y_pred = model_linear.predict(X_test)

# Evaluate the model using Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error: {:.2f}".format(mse))

r2 = r2_score(y_test, y_pred)
print(f'R-squared: {r2}')
mpe = np.mean((y_test - y_pred) / y_test) * 100
print(f'Mean Percentage Error: {mpe}%')

Mean Squared Error: 0.51
R-squared: 0.6267053027976143
Mean Percentage Error: -12.352972275033373%


In [7]:
#Desicion tree
from sklearn.tree import DecisionTreeRegressor

clf = DecisionTreeRegressor(max_depth=10, random_state=0)
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(mse)
r2 = r2_score(y_test, y_pred)
print(f'R-squared: {r2}')
mpe = np.mean((y_test - y_pred) / y_test) * 100
print(f'Mean Percentage Error: {mpe}%')

0.42646625737274646
R-squared: 0.6888119512612991
Mean Percentage Error: -8.428045795176734%


In [8]:
#random forest 
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

forest_model = RandomForestRegressor(n_estimators=150,max_depth=20,random_state=1)
#forest_model.fit(X_train, y_train)
# Perform cross-validation
cv_scores = cross_val_score(forest_model, X_train, y_train, cv=5)  # 5-fold cross-validation
forest_model.fit(X_train, y_train)
# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)
y_pred = forest_model.predict(X_test)

print("score",forest_model.score(X_test, y_test))
print(mean_absolute_error(y_test, y_pred))
mse = mean_squared_error(y_test, y_pred)
print(mse)
r2 = r2_score(y_test, y_pred)
print(f'R-squared: {r2}')
mpe = np.mean((y_test - y_pred) / y_test) * 100
print(f'Mean Percentage Error: {mpe}%')

Cross-validation scores: [0.79516559 0.79419588 0.81325182 0.79599414 0.79229343]
score 0.81636571371345
0.3311656086063108
0.2516607791184781
R-squared: 0.81636571371345
Mean Percentage Error: -7.598657534739527%


In [10]:
from sklearn.ensemble import ExtraTreesRegressor

#extra trees
reg = ExtraTreesRegressor(n_estimators=150,max_depth=20, random_state=1).fit(X_train, y_train)
cv_scores = cross_val_score(reg, X_train, y_train, cv=5)
# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)
y_pred = reg.predict(X_test)


print("score",reg.score(X_test, y_test))
print("mae",mean_absolute_error(y_test, y_pred))
mse = mean_squared_error(y_test, y_pred)
print("mse",mse)
r2 = r2_score(y_test, y_pred)
print(f'R-squared: {r2}')
mpe = np.mean((y_test - y_pred) / y_test) * 100
print(f'Mean Percentage Error: {mpe}%')

Cross-validation scores: [0.79643416 0.7942357  0.815571   0.79514921 0.79313044]
score 0.8174820231372872
mae 0.3332855056374472
mse 0.2501309379051559
R-squared: 0.8174820231372872
Mean Percentage Error: -7.597100573408428%


In [11]:
#xgboost
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Assuming you have X_train, X_test, y_train, and y_test defined

# Create an XGBoost regressor with early stopping parameters
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05, n_jobs=4)

# Perform cross-validation (K-Fold is commonly used for regression tasks)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(my_model, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')

# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)

# Fit the model on the entire training set
my_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

# Make predictions on the test set
y_pred = my_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

r2 = r2_score(y_test, y_pred)
print(f'R-squared: {r2}')
print("score",reg.score(X_test, y_test))
print("mae",mean_absolute_error(y_test, y_pred))
mse = mean_squared_error(y_test, y_pred)
print("mse",mse)
r2 = r2_score(y_test, y_pred)
print(f'R-squared: {r2}')
mpe = np.mean((y_test - y_pred) / y_test) * 100
print(f'Mean Percentage Error: {mpe}%')


Cross-validation scores: [-0.2081444  -0.21046931 -0.22705656 -0.21440367 -0.20744022]
Mean Squared Error: 0.2056942835789989
R-squared: 0.8499069934911473
score 0.8174820231372872
mae 0.29648786117945175
mse 0.2056942835789989
R-squared: 0.8499069934911473
Mean Percentage Error: -5.279803034238677%


In [13]:
# CatBoostRegressor
from catboost import CatBoostRegressor
catboost_model = CatBoostRegressor(iterations=1000, learning_rate=0.2, depth=6, loss_function='RMSE', random_state=42)

# Fit the model on the training set
catboost_model.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=10)

# Make predictions on the test set
y_pred = catboost_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

r2 = r2_score(y_test, y_pred)
print(f'R-squared: {r2}')
print(f'R-squared: {r2}')
print("score",reg.score(X_test, y_test))
print("mae",mean_absolute_error(y_test, y_pred))
mse = mean_squared_error(y_test, y_pred)
print("mse",mse)
r2 = r2_score(y_test, y_pred)
print(f'R-squared: {r2}')
mpe = np.mean((y_test - y_pred) / y_test) * 100
print(f'Mean Percentage Error: {mpe}%')

0:	learn: 1.0254804	test: 1.0441380	best: 1.0441380 (0)	total: 3.4ms	remaining: 3.39s
10:	learn: 0.6258168	test: 0.6241110	best: 0.6241110 (10)	total: 37.9ms	remaining: 3.41s
20:	learn: 0.5548763	test: 0.5543612	best: 0.5543612 (20)	total: 73.3ms	remaining: 3.42s
30:	learn: 0.5259671	test: 0.5294141	best: 0.5294141 (30)	total: 108ms	remaining: 3.38s
40:	learn: 0.5079382	test: 0.5171702	best: 0.5171702 (40)	total: 146ms	remaining: 3.41s
50:	learn: 0.4895343	test: 0.5048311	best: 0.5048311 (50)	total: 182ms	remaining: 3.38s
60:	learn: 0.4749014	test: 0.4954642	best: 0.4954642 (60)	total: 217ms	remaining: 3.34s
70:	learn: 0.4629164	test: 0.4891895	best: 0.4891895 (70)	total: 253ms	remaining: 3.31s
80:	learn: 0.4523644	test: 0.4842838	best: 0.4842838 (80)	total: 290ms	remaining: 3.29s
90:	learn: 0.4433066	test: 0.4799770	best: 0.4799770 (90)	total: 326ms	remaining: 3.26s
100:	learn: 0.4347547	test: 0.4758056	best: 0.4758056 (100)	total: 360ms	remaining: 3.21s
110:	learn: 0.4250668	test: 0.

In [52]:
from catboost import CatBoostRegressor, Pool, cv
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Assuming you have X and y defined
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a CatBoostRegressor
catboost_model = CatBoostRegressor(iterations=1000, learning_rate=0.05, depth=10, loss_function='RMSE', random_state=42)

# Specify the dataset format for CatBoost
train_pool = Pool(X_train, label=y_train)
test_pool = Pool(X_test, label=y_test)

# Perform cross-validation
cv_results = cv(params=catboost_model.get_params(), pool=train_pool, fold_count=5, type='TimeSeries')

# Print the cross-validation results
print("Cross-validation results:", cv_results)

# Fit the model on the entire training set
catboost_model.fit(train_pool, eval_set=test_pool, verbose=100)

# Make predictions on the test set
y_pred = catboost_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

r2 = r2_score(y_test, y_pred)
print(f'R-squared: {r2}')
mpe = np.mean((y_test - y_pred) / y_test) * 100
print(f'Mean Percentage Error: {mpe}%')

Training on fold [0/5]
0:	learn: 2.2792848	test: 2.2749435	best: 2.2749435 (0)	total: 23.5ms	remaining: 23.4s
1:	learn: 2.1921876	test: 2.1904661	best: 2.1904661 (1)	total: 40.5ms	remaining: 20.2s
2:	learn: 2.1011529	test: 2.1027204	best: 2.1027204 (2)	total: 56.7ms	remaining: 18.8s
3:	learn: 2.0244885	test: 2.0300001	best: 2.0300001 (3)	total: 73.1ms	remaining: 18.2s
4:	learn: 1.9476531	test: 1.9563259	best: 1.9563259 (4)	total: 89.8ms	remaining: 17.9s
5:	learn: 1.8764108	test: 1.8886781	best: 1.8886781 (5)	total: 107ms	remaining: 17.7s
6:	learn: 1.8121180	test: 1.8283985	best: 1.8283985 (6)	total: 124ms	remaining: 17.6s
7:	learn: 1.7450111	test: 1.7628202	best: 1.7628202 (7)	total: 142ms	remaining: 17.6s
8:	learn: 1.6815915	test: 1.7031995	best: 1.7031995 (8)	total: 164ms	remaining: 18s
9:	learn: 1.6233764	test: 1.6482502	best: 1.6482502 (9)	total: 191ms	remaining: 18.9s
10:	learn: 1.5648719	test: 1.5931368	best: 1.5931368 (10)	total: 209ms	remaining: 18.8s
11:	learn: 1.5124529	test:

In [72]:
#tabnet
from pytorch_tabnet.tab_model import TabNetRegressor

clf = TabNetRegressor(optimizer_params={"lr": 0.04},n_steps=5)

# Train the model
clf.fit(X_train.values, y_train.values.reshape(-1, 1), max_epochs=50,batch_size=512,patience=3)

# Make predictions
y_pred= clf.predict(X_test.values)

mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

r2 = r2_score(y_test, y_pred)
print(f'R-squared: {r2}')
mpe = np.mean((y_test.values - y_pred) / y_test.values) * 100
print(f'Mean Percentage Error: {mpe}%')




epoch 0  | loss: 4.00532 |  0:00:01s
epoch 1  | loss: 0.67042 |  0:00:02s
epoch 2  | loss: 0.51758 |  0:00:03s
epoch 3  | loss: 0.47675 |  0:00:04s
epoch 4  | loss: 0.44835 |  0:00:05s
epoch 5  | loss: 0.47326 |  0:00:06s
epoch 6  | loss: 0.45776 |  0:00:07s
epoch 7  | loss: 0.45126 |  0:00:08s
epoch 8  | loss: 0.44031 |  0:00:09s
epoch 9  | loss: 0.43026 |  0:00:10s
epoch 10 | loss: 0.39276 |  0:00:11s
epoch 11 | loss: 0.38756 |  0:00:12s
epoch 12 | loss: 0.38851 |  0:00:13s
epoch 13 | loss: 0.38124 |  0:00:14s
epoch 14 | loss: 0.36737 |  0:00:15s
epoch 15 | loss: 0.35898 |  0:00:16s
epoch 16 | loss: 0.36463 |  0:00:17s
epoch 17 | loss: 0.36368 |  0:00:18s
epoch 18 | loss: 0.3523  |  0:00:20s
epoch 19 | loss: 0.36569 |  0:00:21s
epoch 20 | loss: 0.36316 |  0:00:22s
epoch 21 | loss: 0.35285 |  0:00:23s
epoch 22 | loss: 0.34648 |  0:00:24s
epoch 23 | loss: 0.35237 |  0:00:25s
epoch 24 | loss: 0.3475  |  0:00:26s
epoch 25 | loss: 0.34185 |  0:00:27s
epoch 26 | loss: 0.35131 |  0:00:28s
e

In [99]:
#my NN 
from tensorflow import keras
from tensorflow.keras import layers

model = keras.Sequential([
    layers.Dense(1024, activation='relu', input_shape=[8]),
    layers.Dropout(0.3),
    layers.BatchNormalization(),
    layers.Dense(512, activation='relu'),
    layers.Dense(1024, activation='relu'),
    layers.Dense(1024, activation='relu'),
    layers.Dropout(0.3),
    layers.BatchNormalization(),
    layers.Dense(1024, activation='sigmoid'),
    layers.Dense(1),
])
# model.compile(
#     optimizer='adam',
#     loss='mse'  
# )
model.compile(loss="binary_crossentropy", optimizer=keras.optimizers.Adam(learning_rate=0.05),
              metrics=["accuracy","mae"])

history = model.fit(
    X_train.values, y_train.values,
    validation_data=(X_test.values, y_test.values),
    batch_size=256,
    epochs=20,
    verbose=0,
    shuffle=True,
)
y_pred= clf.predict(X_test.values)

mse = mean_squared_error(y_test.values, y_pred)
print("Mean Squared Error:", mse)

r2 = r2_score(y_test, y_pred)
print(f'R-squared: {r2}')
mpe = np.mean((y_test.values - y_pred) / y_test.values) * 100
print(f'Mean Percentage Error: {mpe}%')


Mean Squared Error: 0.5177422419543353
R-squared: 0.6222088026004138
Mean Percentage Error: -20.10071180411232%


In [73]:
X_train

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
17557,2.2583,42.0,3.761765,1.052941,1254.0,3.688235,37.33,-121.89
6628,2.3029,45.0,4.472603,1.136986,3117.0,5.337329,34.18,-118.15
4102,5.5683,19.0,5.287500,1.077083,2021.0,2.105208,34.14,-118.39
14286,3.3700,36.0,4.960130,1.045566,3093.0,2.516680,32.72,-117.12
12462,4.0972,38.0,5.595982,0.995536,888.0,1.982143,38.57,-121.43
...,...,...,...,...,...,...,...,...
16329,5.1202,30.0,6.170663,0.971791,2038.0,2.874471,38.02,-121.34
1760,3.1571,43.0,4.976127,1.031830,807.0,2.140584,37.94,-122.33
16869,5.7398,42.0,6.282353,1.027451,705.0,2.764706,37.61,-122.41
2105,1.8641,39.0,4.547862,1.146640,2031.0,4.136456,36.75,-119.76
