In [520]:
import pandas as pd

## Notebook for making predictions with different models etc. 


### Reading dataset stored in other notebooks

In [521]:

%store -r data_A
%store -r data_B
%store -r data_C

data_A = data_A
data_B = data_B
data_C = data_C

%store -r X_test_estimated_a_corr 
%store -r X_test_estimated_b_corr 
%store -r X_test_estimated_c_corr

X_test_estimated_a_corr = X_test_estimated_a_corr 
X_test_estimated_b_corr = X_test_estimated_b_corr
X_test_estimated_c_corr = X_test_estimated_c_corr

print(X_test_estimated_a_corr.shape)


(720, 49)


### Preparing training and testing data to train on

- we split the training sets into train and test data. In this way we can test our model.


In [522]:
#preparing data
X_A = data_A.iloc[:,2:-1]  #independent columns
y_A = data_A.iloc[:,1]    #target column i.e uv stråling 

X_B = data_B.iloc[:,2:-1]  #independent columns
y_B = data_B.iloc[:,1]    #target column i.e uv stråling 

X_C = data_C.iloc[:,2:-1]  #independent columns
y_C = data_C.iloc[:,1]    #target column i.e uv stråling 


X_A_train, X_A_test, y_A_train, y_A_test = ms.train_test_split(X_A, y_A)
X_B_train, X_B_test, y_B_train, y_B_test = ms.train_test_split(X_B, y_B)
X_C_train, X_C_test, y_C_train, y_C_test = ms.train_test_split(X_C, y_C)

print(X_A_train.shape)
print(X_B_train.shape)
print(X_C_train.shape)

(25563, 47)
(25563, 47)
(24116, 47)


In [523]:
# fixing test data

X_test_A = X_test_estimated_a_corr[X_A_train.columns.intersection(X_test_estimated_a_corr.columns)]
X_test_B = X_test_estimated_b_corr[X_A_train.columns.intersection(X_test_estimated_b_corr.columns)]
X_test_C = X_test_estimated_c_corr[X_A_train.columns.intersection(X_test_estimated_c_corr.columns)]

X_test_A = X_test_A.interpolate("ffill")
X_test_B = X_test_B.interpolate("ffill")
X_test_C = X_test_C.interpolate("ffill")

X_test_A = X_test_A.fillna(0.0)
X_test_B = X_test_B.fillna(0.0)
X_test_C = X_test_C.fillna(0.0)

### Preds using GradientBoostingRegressor



In [554]:
from sklearn.ensemble import GradientBoostingRegressor

## defining model and hyper-parameters
model_A = GradientBoostingRegressor(learning_rate=0.5, n_estimators=150)
model_B = GradientBoostingRegressor(learning_rate=0.5, n_estimators=150)
model_C = GradientBoostingRegressor(learning_rate=0.5, n_estimators=150)

model = GradientBoostingRegressor(learning_rate=0.5, n_estimators=150)


In [555]:
# training model 
model_A.fit(X_A_train, y_A_train)
model_B.fit(X_B_train, y_B_train)
model_C.fit(X_C_train, y_C_train)

model.fit(X_B_train, y_B_train)
model.fit(X_C_train, y_C_train)

In [564]:
# testing model 

score_A = model_A.score(X_A_test, y_A_test)
score_B = model_B.score(X_B_test, y_B_test)
score_C = model_C.score(X_C_test, y_C_test)

score_mA = model.score(X_A_test, y_A_test)
score_mB = model.score(X_B_test, y_B_test)
score_mC = model.score(X_C_test, y_C_test)

print(score_A)
print(score_B)
print(score_C)

print(score_mA)
print(score_mB)
print(score_mC)


0.8448444341476271
0.7376732852455354
0.8026517107711649
-0.0647866485512163
-0.10074060028270648
0.8022807679054267


## Random forest regressor 

In [540]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

rfmA = RandomForestRegressor(max_depth=6, max_features=None, max_leaf_nodes=10)
rfmB = RandomForestRegressor(max_depth=6, max_features=None, max_leaf_nodes=10, n_estimators=50)
rfmC = RandomForestRegressor(max_depth=6, max_features=None, max_leaf_nodes=10, n_estimators=25)

rfmA = RandomForestRegressor(max_leaf_nodes=10)
rfmB = RandomForestRegressor(max_leaf_nodes=10)
rfmC = RandomForestRegressor(max_leaf_nodes=10)

rfm = RandomForestRegressor(max_depth=10)

### Finding best hyperparameters 
Below code takes around 1 hour to run

Runing the below code found that
- max depth = 6 
- max features = none 
- max leaf nodes = 9 (propably better with higher)
- for A n estimators = default 
- for B n estimators = 50 
- for C n estimators = 25 

In [532]:
param_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [3, 6, 9],
}

grid_search_A = GridSearchCV(RandomForestRegressor(), param_grid=param_grid)
grid_search_A.fit(X_A_train, y_A_train)

grid_search_B = GridSearchCV(RandomForestRegressor(), param_grid=param_grid)
grid_search_B.fit(X_B_train, y_B_train)

grid_search_C = GridSearchCV(RandomForestRegressor(), param_grid=param_grid)
grid_search_C.fit(X_C_train, y_C_train)

print("A: " , grid_search_A.best_estimator_)

print("B: ", grid_search_B.best_estimator_)

print("C: ", grid_search_C.best_estimator_)



A:  RandomForestRegressor(max_depth=6, max_features=None, max_leaf_nodes=9)
B:  RandomForestRegressor(max_depth=6, max_features=None, max_leaf_nodes=9,
                      n_estimators=50)
C:  RandomForestRegressor(max_depth=6, max_features=None, max_leaf_nodes=9,
                      n_estimators=25)


In [541]:
rfmA.fit(X_A_train, y_A_train)
rfmB.fit(X_B_train, y_B_train)
rfmC.fit(X_C_train, y_C_train)


In [553]:

score_A = rfmA.score(X_A_test, y_A_test)
score_B = rfmB.score(X_B_test, y_B_test)
score_C = rfmC.score(X_C_test, y_C_test)


print(score_A)
print(score_B)
print(score_C)



0.8163345334815306
0.7028435503765282
0.7000239441453882


# Create subission

## NB! THE RESULTING CSV FILE IS MISSING THE "id" LABLE FOR ITS FIRST COLUMN; QUICK FIX: JUST GO INTO THE CSV IN VS CODE AND ADD MANUALLY AT THE TOP

In [539]:
## Specify the model to be used
pred_A = rfmA.predict(X_test_A)
pred_B = rfmB.predict(X_test_B)
pred_C = rfmC.predict(X_test_C)

print(X_test_A.shape)
test = pd.read_csv('test.csv')

df_A = pd.DataFrame()

df_A["prediction"] = pred_A
df_A["location"] = "A"

df_B = pd.DataFrame()

df_B["prediction"] = pred_B
df_B["location"] = "B"

df_C = pd.DataFrame()

df_C["prediction"] = pred_C
df_C["location"] = "C"

df_mid = pd.concat([df_A, df_B], ignore_index=True)

df = pd.concat([df_mid, df_C], join="inner", ignore_index=True)

df_C['prediction'] = df_C['prediction'].astype(float)

test = test[test.columns.intersection(["id", "time", "location"])]



df = df.drop("location", axis=1)

df.set_index(test["id"])
# df["id"] = test["id"]

# df = df[["id", "prediction"]]

df[df < 0 ] = 0

# NAME THE FILE 
df.to_csv("sub9.csv")

df


(720, 47)


Unnamed: 0,prediction
0,19.020580
1,19.020580
2,19.020580
3,19.020580
4,417.013415
...,...
2155,17.713280
2156,17.713280
2157,17.713280
2158,17.713280
