In [1]:
import pandas as pd




In [2]:
# Load the California Housing dataset
from sklearn.datasets import fetch_california_housing
california_housing = fetch_california_housing()
data = pd.DataFrame(california_housing.data, columns=california_housing.feature_names)
data['PRICE'] = california_housing.target

In [3]:
data.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,PRICE
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [None]:
#histogram for every feature of the dataframe 
for x in data.columns:
  data.hist(column = x, bins = 100)

In [None]:
from matplotlib import pyplot as plt

for x in data.columns:
  data.boxplot(column = x)
  plt.show()

In [None]:
import seaborn as sns
corr = data.corr()
sns.heatmap(corr, 
        xticklabels=corr.columns,
        yticklabels=corr.columns)

In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

X = data.drop('PRICE', axis=1)
y = data['PRICE']
X_train, X_test, y_train, y_test = train_test_split(
X,y , random_state=51,test_size=0.3, shuffle=True)



In [8]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [18]:
from sklearn.metrics import mean_squared_error, r2_score , mean_absolute_error
from sklearn.linear_model import RidgeCV

# Create and train the Ridge regression model using cross validation for alphaParameters
ridge_model = RidgeCV(alphas=[0.001, 0.01, 0.1, 1, 10], cv=5)
ridge_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = ridge_model.predict(X_test)

# Evaluate the model's performance using mse-mae-
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

r2 = r2_score(y_test, y_pred)
print(f'R-squared: {r2}')

mae = mean_absolute_error(y_test, y_pred)
print("mae",mae)

Mean Squared Error: 0.5550405537343013
R-squared: 0.5764371559180014
mae 0.5332432565798635


In [10]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression

#logistic regression

model_linear = LinearRegression()
model_linear.fit(X_train, y_train)

y_pred = model_linear.predict(X_test)

# Evaluate the model using Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error: {:.2f}".format(mse))

r2 = r2_score(y_test, y_pred)
print(f'R-squared: {r2}')

Mean Squared Error: 0.51
R-squared: 0.6267053027976143


In [11]:
#Desicion tree
from sklearn.tree import DecisionTreeRegressor

clf = DecisionTreeRegressor(max_depth=10, random_state=0)
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(mse)
r2 = r2_score(y_test, y_pred)
print(f'R-squared: {r2}')

0.42646625737274646
R-squared: 0.6888119512612991


In [12]:
#random forest 
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

forest_model = RandomForestRegressor(n_estimators=150,max_depth=20,random_state=1)
#forest_model.fit(X_train, y_train)
# Perform cross-validation
cv_scores = cross_val_score(forest_model, X_train, y_train, cv=5)  # 5-fold cross-validation
forest_model.fit(X_train, y_train)
# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)
y_pred = forest_model.predict(X_test)

print("score",forest_model.score(X_test, y_test))
print(mean_absolute_error(y_test, y_pred))
mse = mean_squared_error(y_test, y_pred)
print(mse)
r2 = r2_score(y_test, y_pred)
print(f'R-squared: {r2}')

Cross-validation scores: [0.79516559 0.79419588 0.81325182 0.79599414 0.79229343]
score 0.81636571371345
0.3311656086063108
0.2516607791184781
R-squared: 0.81636571371345


In [13]:
from sklearn.ensemble import ExtraTreesRegressor

#extra trees
reg = ExtraTreesRegressor(n_estimators=150,max_depth=20, random_state=1).fit(X_train, y_train)
cv_scores = cross_val_score(reg, X_train, y_train, cv=5)
# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)
y_pred = reg.predict(X_test)


print("score",reg.score(X_test, y_test))
print("mae",mean_absolute_error(y_test, y_pred))
mse = mean_squared_error(y_test, y_pred)
print("mse",mse)
r2 = r2_score(y_test, y_pred)
print(f'R-squared: {r2}')

Cross-validation scores: [0.79643416 0.7942357  0.815571   0.79514921 0.79313044]
score 0.8174820231372872
mae 0.3332855056374472
mse 0.2501309379051559
R-squared: 0.8174820231372872


In [14]:
#xgboost
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Assuming you have X_train, X_test, y_train, and y_test defined

# Create an XGBoost regressor with early stopping parameters
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05, n_jobs=4)

# Perform cross-validation (K-Fold is commonly used for regression tasks)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(my_model, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')

# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)

# Fit the model on the entire training set
my_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

# Make predictions on the test set
y_pred = my_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

r2 = r2_score(y_test, y_pred)
print(f'R-squared: {r2}')
print("score",reg.score(X_test, y_test))
print("mae",mean_absolute_error(y_test, y_pred))
mse = mean_squared_error(y_test, y_pred)
print("mse",mse)
r2 = r2_score(y_test, y_pred)
print(f'R-squared: {r2}')


Cross-validation scores: [-0.2081444  -0.21046931 -0.22705656 -0.21440367 -0.20744022]
Mean Squared Error: 0.2056942835789989
R-squared: 0.8499069934911473
score 0.8174820231372872
mae 0.29648786117945175
mse 0.2056942835789989
R-squared: 0.8499069934911473


In [15]:
# CatBoostRegressor
from catboost import CatBoostRegressor
catboost_model = CatBoostRegressor(iterations=1000, learning_rate=0.2, depth=6, loss_function='RMSE', random_state=42)

# Fit the model on the training set
catboost_model.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=10)

# Make predictions on the test set
y_pred = catboost_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

r2 = r2_score(y_test, y_pred)
print(f'R-squared: {r2}')
print(f'R-squared: {r2}')
print("score",reg.score(X_test, y_test))
print("mae",mean_absolute_error(y_test, y_pred))
mse = mean_squared_error(y_test, y_pred)
print("mse",mse)
r2 = r2_score(y_test, y_pred)
print(f'R-squared: {r2}')

0:	learn: 1.0254804	test: 1.0441380	best: 1.0441380 (0)	total: 140ms	remaining: 2m 19s
10:	learn: 0.6258168	test: 0.6241110	best: 0.6241110 (10)	total: 170ms	remaining: 15.2s
20:	learn: 0.5548763	test: 0.5543612	best: 0.5543612 (20)	total: 199ms	remaining: 9.26s
30:	learn: 0.5259671	test: 0.5294141	best: 0.5294141 (30)	total: 227ms	remaining: 7.11s
40:	learn: 0.5079382	test: 0.5171702	best: 0.5171702 (40)	total: 256ms	remaining: 6s
50:	learn: 0.4895343	test: 0.5048311	best: 0.5048311 (50)	total: 285ms	remaining: 5.29s
60:	learn: 0.4749014	test: 0.4954642	best: 0.4954642 (60)	total: 314ms	remaining: 4.83s
70:	learn: 0.4629164	test: 0.4891895	best: 0.4891895 (70)	total: 342ms	remaining: 4.48s
80:	learn: 0.4523644	test: 0.4842838	best: 0.4842838 (80)	total: 373ms	remaining: 4.23s
90:	learn: 0.4433066	test: 0.4799770	best: 0.4799770 (90)	total: 403ms	remaining: 4.02s
100:	learn: 0.4347547	test: 0.4758056	best: 0.4758056 (100)	total: 431ms	remaining: 3.84s
110:	learn: 0.4250668	test: 0.4703

In [16]:
from catboost import CatBoostRegressor, Pool, cv
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Assuming you have X and y defined
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a CatBoostRegressor
catboost_model = CatBoostRegressor(iterations=1000, learning_rate=0.05, depth=10, loss_function='RMSE', random_state=42)

# Specify the dataset format for CatBoost
train_pool = Pool(X_train, label=y_train)
test_pool = Pool(X_test, label=y_test)

# Perform cross-validation
cv_results = cv(params=catboost_model.get_params(), pool=train_pool, fold_count=5, type='TimeSeries')

# Print the cross-validation results
print("Cross-validation results:", cv_results)

# Fit the model on the entire training set
catboost_model.fit(train_pool, eval_set=test_pool, verbose=100)

# Make predictions on the test set
y_pred = catboost_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

r2 = r2_score(y_test, y_pred)
print(f'R-squared: {r2}')


Training on fold [0/5]
0:	learn: 2.3063103	test: 2.2823733	best: 2.2823733 (0)	total: 16.2ms	remaining: 16.2s
1:	learn: 2.2179970	test: 2.1967480	best: 2.1967480 (1)	total: 32.5ms	remaining: 16.2s
2:	learn: 2.1284489	test: 2.1098261	best: 2.1098261 (2)	total: 47.9ms	remaining: 15.9s
3:	learn: 2.0478891	test: 2.0318801	best: 2.0318801 (3)	total: 64.4ms	remaining: 16s
4:	learn: 1.9695470	test: 1.9568426	best: 1.9568426 (4)	total: 80.1ms	remaining: 15.9s
5:	learn: 1.8962290	test: 1.8861223	best: 1.8861223 (5)	total: 95.2ms	remaining: 15.8s
6:	learn: 1.8224718	test: 1.8153238	best: 1.8153238 (6)	total: 110ms	remaining: 15.6s
7:	learn: 1.7537473	test: 1.7483718	best: 1.7483718 (7)	total: 125ms	remaining: 15.5s
8:	learn: 1.6922840	test: 1.6894904	best: 1.6894904 (8)	total: 140ms	remaining: 15.4s
9:	learn: 1.6321001	test: 1.6322322	best: 1.6322322 (9)	total: 156ms	remaining: 15.5s
10:	learn: 1.5728298	test: 1.5758056	best: 1.5758056 (10)	total: 173ms	remaining: 15.5s
11:	learn: 1.5173875	test