In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:

# Read the training and test tables from the current working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Preprocessing: imputation -> IQR outlier removal -> skew correction ->
# standardisation -> PCA. Every transform is fitted on the training data only
# and then re-applied unchanged to the test data, so both sets end up in the
# same feature space.

# Forward-fill missing values. `.ffill()` replaces the deprecated
# `fillna(method='ffill')`; rebinding instead of `inplace=True` keeps the cell
# safe to re-run.
train_df = train_df.ffill()
test_df = test_df.ffill()

# Feature columns: every numeric column except the target ('pSat_Pa') and the
# row key ('Id'), which is only written to the submission files and carries no
# predictive signal.
non_feature_cols = {'pSat_Pa', 'Id'}
numeric_cols = [
    col
    for col in train_df.select_dtypes(include=[np.number]).columns
    if col not in non_feature_cols
]

# Tukey fences: a row is dropped if ANY feature lies more than 1.5 * IQR
# outside the inter-quartile range.
Q1 = train_df[numeric_cols].quantile(0.25)
Q3 = train_df[numeric_cols].quantile(0.75)
IQR = Q3 - Q1

outlier_mask = (
    (train_df[numeric_cols] < (Q1 - 1.5 * IQR))
    | (train_df[numeric_cols] > (Q3 + 1.5 * IQR))
).any(axis=1)

# `.copy()` gives an independent frame, so the column assignments below do not
# write into a slice of `train_df` (SettingWithCopyWarning).
train_df_no_outliers = train_df.loc[~outlier_mask].copy()

# Features whose absolute skewness exceeds 0.75 are candidates for log1p.
skewed_feats = train_df_no_outliers[numeric_cols].skew().sort_values(ascending=False)
skewness = pd.DataFrame({'Skew': skewed_feats})
skewness = skewness[abs(skewness['Skew']) > 0.75]

# Only transform features that are non-negative in the training data: log1p is
# undefined at or below -1 and would silently introduce NaN / -inf.
log_feats = [feat for feat in skewness.index if train_df_no_outliers[feat].min() >= 0]

# Work on a copy of the test features so `test_df` itself is not mutated and
# the cell stays idempotent.
test_features = test_df[numeric_cols].copy()

for feat in log_feats:
    train_df_no_outliers[feat] = np.log1p(train_df_no_outliers[feat])
    # Apply the SAME transform to the test data. Values below the training
    # domain are clamped to 0 first so they cannot produce NaN.
    test_features[feat] = np.log1p(test_features[feat].clip(lower=0))

# Fit the scaler on the training data only ...
scaler = StandardScaler()
X_scaled = scaler.fit_transform(train_df_no_outliers[numeric_cols])

# Keep as many principal components as needed to explain 95% of the variance.
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)

# ... and reuse the fitted scaler and PCA for the test data. Fitting a second
# scaler on the test set would standardise it with different means/variances
# than the ones the PCA and the models were trained on.
test_df_scaled = scaler.transform(test_features)
test_pca = pca.transform(test_df_scaled)



In [None]:


# Model training, hold-out evaluation and submission files.

# The models are trained on log10 of the target, which is only defined for
# strictly positive values; fail early with a clear message instead of feeding
# -inf / NaN into the regressors.
assert (train_df_no_outliers['pSat_Pa'] > 0).all(), "pSat_Pa must be > 0 to take log10"

# `.assign` returns a new frame, so the target column is never written into a
# filtered slice of the original training frame (avoids SettingWithCopyWarning).
train_df_no_outliers = train_df_no_outliers.assign(
    log_pSat_Pa=np.log10(train_df_no_outliers['pSat_Pa'])
)

# 80/20 hold-out split of the PCA features; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_pca,
    train_df_no_outliers['log_pSat_Pa'],
    test_size=0.2,
    random_state=42,
)

# --- Random forest ---
rf_model = RandomForestRegressor(n_estimators=100, random_state=0)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)

rf_r2 = r2_score(y_test, rf_predictions)
print("Random Forest R^2 Score: ", rf_r2)

# --- CatBoost ---
# verbose=100 logs every 100th iteration instead of all 500, which keeps the
# cell output readable.
cb_model = CatBoostRegressor(
    iterations=500,
    depth=4,
    learning_rate=0.02,
    loss_function='RMSE',
    verbose=100,
)
cb_model.fit(X_train, y_train)
cb_predictions = cb_model.predict(X_test)

cb_r2 = r2_score(y_test, cb_predictions)
print("CatBoost R^2 Score: ", cb_r2)

# --- Submissions ---
# NOTE(review): predictions are on the log10(pSat_Pa) scale because that is
# what the models were trained on; confirm the submission format expects
# log10 values rather than raw pSat_Pa.
rf_test_predictions = rf_model.predict(test_pca)
rf_submission = pd.DataFrame({'Id': test_df['Id'], 'target': rf_test_predictions})
rf_submission.to_csv('rf_submission.csv', index=False)

cb_test_predictions = cb_model.predict(test_pca)
cb_submission = pd.DataFrame({'Id': test_df['Id'], 'target': cb_test_predictions})
cb_submission.to_csv('cb_submission.csv', index=False)
