# Imports

In [34]:
import pandas as pd
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import Model
import concurrent.futures

# Read tabular data

In [None]:
tabularData = pd.read_csv('socal2.csv', sep=',', header=0)
tabularData = tabularData[:2000]

# Extract Features from image data via VGG16 CNN

In [35]:
# Load the pre-trained VGG16 model without the top layers (fully connected layers)
base_model = VGG16(weights='imagenet', include_top=False)

# Function to preprocess an image for VGG16
def preprocess_image(img_path):
    img = image.load_img(img_path, target_size=(415, 311))  # VGG16 input size
    img_array = image.img_to_array(img)
    img_array = np.expand_dims(img_array, axis=0)
    img_array = preprocess_input(img_array)
    return img_array

# Function to extract features using VGG16
def extract_features(img_path):
    img_array = preprocess_image(img_path)
    features = base_model.predict(img_array);
    return features.flatten() 


#num_images = 15474
num_images = 2000

# Create a ThreadPoolExecutor
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Submit tasks for processing images
    future_to_image = {executor.submit(extract_features, f'./socal2/socal_pics/{x}.jpg'): x for x in range(num_images)}

    # Iterate over completed futures in the order they were submitted
    features = [future.result() for future in concurrent.futures.as_completed(future_to_image)]



# Concatenate both data sources into one dataframe

In [37]:
# create dataframe from features list
imageData = pd.DataFrame(features[0])
for feature in features[1:]:
    temp = pd.DataFrame(feature)
    dataframe = pd.concat([dataframe, temp], axis=1)
imageData = imageData.transpose()
imageData.columns = [f'pixel {i}' for i in range(imageData.shape[1])]
data = pd.concat([imageData.reset_index(), tabularData], axis=1)

# Transform data with pipeline

In [39]:
numericalColumns = [column for column in data.columns if data[column].dtype == np.float64]
categoricalColumns = [column for column in data.columns if data[column].dtype == object]
numericalColumnsTransformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')), ('scaler', StandardScaler())])
categoricalColumnsTransformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')), ('onehot', OneHotEncoder(handle_unknown='ignore'))])
preprocessor = ColumnTransformer(transformers=[('num', numericalColumnsTransformer, numericalColumns), ('cat', categoricalColumnsTransformer, categoricalColumns)])

y = data['price']
X = data.drop(columns=['price'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

rf = RandomForestRegressor(n_estimators=1000, random_state=0, n_jobs=-1, max_depth=10, min_samples_leaf=2)

pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', rf)])

# Make prediction

In [40]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print(f"R2-Score: {r2_score(y_test, y_pred)}")

R2-Score: 0.42998550586444717
