In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor

In [30]:
df = pd.read_table('CAF357.txt', sep = '\t', na_values= 'NA')
first_valid_index = df.iloc[:, 2:].notna().any(axis=1).idxmax()
df_trimmed_top = df.iloc[first_valid_index:].reset_index(drop=True)
df_trimmed_top.head()

Unnamed: 0,Location,Date,VW_30cm,VW_60cm,VW_90cm,VW_120cm,VW_150cm,T_30cm,T_60cm,T_90cm,T_120cm,T_150cm
0,CAF357,05/19/2009,0.299,0.321,0.359,0.331,0.314,11.87,10.1,8.7,7.61,7.09
1,CAF357,05/20/2009,0.301,0.325,0.363,0.335,0.32,11.22,10.09,8.75,7.61,7.0
2,CAF357,05/21/2009,,,,,,,,,,
3,CAF357,05/22/2009,,0.328,0.368,0.342,0.327,12.1,10.11,9.06,7.94,7.22
4,CAF357,05/23/2009,,0.329,0.369,0.343,0.327,12.25,10.4,9.16,8.03,7.32


In [31]:
last_valid_index = df_trimmed_top.iloc[:, 2:].notna().any(axis=1)[::-1].idxmax()
df_final_trimmed = df_trimmed_top.iloc[:last_valid_index + 1].reset_index(drop=True)
df_final_trimmed.to_csv('CAF357_trimmed_test.csv', index=False)

In [4]:
df_final_trimmed.isnull().sum()

Location      0
Date          0
VW_30cm     487
VW_60cm     267
VW_90cm     367
VW_120cm    319
VW_150cm    217
T_30cm      474
T_60cm      267
T_90cm      367
T_120cm     319
T_150cm     217
dtype: int64

In [5]:
df_final_trimmed.isnull().sum().sum()

3301

In [6]:
num_rows_left = df_final_trimmed.shape[0]
print(f"Number of rows left after cleaning: {num_rows_left}")

Number of rows left after cleaning: 2586


In [7]:
df_final_trimmed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2586 entries, 0 to 2585
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Location  2586 non-null   object 
 1   Date      2586 non-null   object 
 2   VW_30cm   2099 non-null   float64
 3   VW_60cm   2319 non-null   float64
 4   VW_90cm   2219 non-null   float64
 5   VW_120cm  2267 non-null   float64
 6   VW_150cm  2369 non-null   float64
 7   T_30cm    2112 non-null   float64
 8   T_60cm    2319 non-null   float64
 9   T_90cm    2219 non-null   float64
 10  T_120cm   2267 non-null   float64
 11  T_150cm   2369 non-null   float64
dtypes: float64(10), object(2)
memory usage: 242.6+ KB


In [8]:
data = pd.read_csv('CAF357_trimmed_test.txt', sep='\t')

data['Date'] = pd.to_datetime(data['Date'], format='%m/%d/%Y')
data['Day'] = data['Date'].dt.day
data['Month'] = data['Date'].dt.month
data['Year'] = data['Date'].dt.year
data = data.drop(columns=['Date'])

label_encoder = LabelEncoder()
data['Location'] = label_encoder.fit_transform(data['Location'])

columns_to_fill = data.columns.difference(['VW_30cm'])

for column in data.columns:
    if data[column].isna().any():
        columns_to_fill_temp = data.columns.difference([column])
        data[columns_to_fill_temp] = data[columns_to_fill_temp].fillna(data[columns_to_fill_temp].median())
        
        data_missing = data[data[column].isna()]
        data_not_missing = data[~data[column].isna()]
        
        features = data.columns.difference([column])
        X = data_not_missing[features]
        y = data_not_missing[column]
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
        
        model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, max_depth=5)
        model.fit(X_train, y_train)
        
        X_missing = data_missing[features]
        predicted_values = model.predict(X_missing)
        data.loc[data[column].isna(), column] = predicted_values

data.to_csv('CAF357_cleaned_xgb_complete.csv', index=False)

In [9]:
df=pd.read_csv('CAF357_cleaned_xgb_complete.csv')
df.head()

Unnamed: 0,Location,VW_30cm,VW_60cm,VW_90cm,VW_120cm,VW_150cm,T_30cm,T_60cm,T_90cm,T_120cm,T_150cm,Day,Month,Year
0,0,0.299,0.321,0.359,0.331,0.314,11.87,10.1,8.7,7.61,7.09,19,5,2009
1,0,0.301,0.325,0.363,0.335,0.32,11.22,10.09,8.75,7.61,7.0,20,5,2009
2,0,0.25392,0.282,0.362,0.277,0.307,9.3255,7.75,9.62,8.9,8.8,21,5,2009
3,0,0.27745,0.328,0.368,0.342,0.327,12.1,10.11,9.06,7.94,7.22,22,5,2009
4,0,0.271642,0.329,0.369,0.343,0.327,12.25,10.4,9.16,8.03,7.32,23,5,2009


In [10]:
df.isnull().sum()

Location    0
VW_30cm     0
VW_60cm     0
VW_90cm     0
VW_120cm    0
VW_150cm    0
T_30cm      0
T_60cm      0
T_90cm      0
T_120cm     0
T_150cm     0
Day         0
Month       0
Year        0
dtype: int64

In [11]:
data = pd.read_csv('CAF357_cleaned_xgb_complete.csv')
data['Location'] = label_encoder.inverse_transform(data['Location'])

data['Date'] = pd.to_datetime(data[['Year', 'Month', 'Day']]).dt.strftime('%m/%d/%Y')

data = data.drop(columns=['Day', 'Month', 'Year'])

correct_column_order = ['Location', 'Date', 'VW_30cm', 'VW_60cm', 'VW_90cm', 'VW_120cm', 'VW_150cm', 
                        'T_30cm', 'T_60cm', 'T_90cm', 'T_120cm', 'T_150cm']

data = data[correct_column_order]

data.to_csv('CAF357_final_cleaned_xgb_complete.csv', index=False)

data.head()

Unnamed: 0,Location,Date,VW_30cm,VW_60cm,VW_90cm,VW_120cm,VW_150cm,T_30cm,T_60cm,T_90cm,T_120cm,T_150cm
0,CAF357,05/19/2009,0.299,0.321,0.359,0.331,0.314,11.87,10.1,8.7,7.61,7.09
1,CAF357,05/20/2009,0.301,0.325,0.363,0.335,0.32,11.22,10.09,8.75,7.61,7.0
2,CAF357,05/21/2009,0.25392,0.282,0.362,0.277,0.307,9.3255,7.75,9.62,8.9,8.8
3,CAF357,05/22/2009,0.27745,0.328,0.368,0.342,0.327,12.1,10.11,9.06,7.94,7.22
4,CAF357,05/23/2009,0.271642,0.329,0.369,0.343,0.327,12.25,10.4,9.16,8.03,7.32


KNN

In [12]:

data = pd.read_csv('CAF357_final_cleaned_xgb_complete.csv')

features = data.drop(columns=['Date', 'Location', 'T_90cm'])

target = data['T_90cm']

In [13]:
X_train, X_temp, y_train, y_temp = train_test_split(features, target, test_size=0.5, random_state=42)

X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Testing set size: {len(X_test)}")

Training set size: 1293
Validation set size: 646
Testing set size: 647


In [14]:
scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_train)

X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

In [15]:
pca = PCA(n_components=2) 

X_train_pca = pca.fit_transform(X_train_scaled)

X_val_pca = pca.transform(X_val_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [16]:
param_grid = {'n_neighbors': range(1, 20)}

grid_search = GridSearchCV(KNeighborsRegressor(), param_grid, cv=5, scoring='neg_mean_squared_error')

grid_search.fit(X_train_pca, y_train)

best_params = grid_search.best_params_
print(f'Best number of neighbors: {best_params["n_neighbors"]}')

Best number of neighbors: 2


In [17]:
best_knn = KNeighborsRegressor(n_neighbors=best_params['n_neighbors'])

best_knn.fit(X_train_pca, y_train)

y_pred_pca = best_knn.predict(X_test_pca)

rmse = np.sqrt(mean_squared_error(y_test, y_pred_pca))
r2 = r2_score(y_test, y_pred_pca)
print(f'RMSE on test set: {rmse}')
print(f'R-squared on test set: {r2}')

RMSE on test set: 0.7412097475589162
R-squared on test set: 0.9691459071343137


RandomForest

In [20]:
data = pd.read_csv('CAF357_final_cleaned_xgb_complete.csv')
features = data.drop(columns=['Date', 'Location', 'T_90cm'])

target = data['T_90cm']

In [21]:
X_train, X_temp, y_train, y_temp = train_test_split(features, target, test_size=0.5, random_state=42)

X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [23]:
rf_model = RandomForestRegressor(n_estimators=150, random_state=42)

rf_model.fit(X_train, y_train)

In [26]:
y_val_pred = rf_model.predict(X_val)

val_mse = mean_squared_error(y_val, y_val_pred)
val_r2 = r2_score(y_val, y_val_pred)

print(f'Validation Mean Squared Error: {val_mse}')
print(f'Validation R^2 Score: {val_r2}')

Validation Mean Squared Error: 0.06369089870251136
Validation R^2 Score: 0.9962392101127328


In [27]:
y_test_pred = rf_model.predict(X_test)

test_mse = mean_squared_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(f'Test Mean Squared Error: {test_mse}')
print(f'Test R^2 Score: {test_r2}')

Test Mean Squared Error: 0.03247554767247174
Test R^2 Score: 0.9981761587999124


In [28]:
importances = rf_model.feature_importances_

feature_importance_df = pd.DataFrame({'Feature': features.columns, 'Importance': importances})

feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

print(feature_importance_df)

    Feature  Importance
7   T_120cm    0.879083
6    T_60cm    0.054744
5    T_30cm    0.029467
8   T_150cm    0.011991
2   VW_90cm    0.011349
3  VW_120cm    0.007743
4  VW_150cm    0.003331
0   VW_30cm    0.001252
1   VW_60cm    0.001040
