In [70]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## POSTURE DETECTION FINISH

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [83]:
# reading the csv
main_file = pd.read_csv("Rw_data.csv")

# data preprocessing
# DATA IS CLEAN 
# get_dummies() if we had to encode lol

# x will be the parameters im not guessing 
X = main_file.drop(columns=["next_sdi_predicted", "exact_date"])
# X

# y will be the predicted sdi
Y = main_file["next_sdi_predicted"]
# Y

# 80% train 20% test

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=20, stratify=None)


# choose random forest for now 
rf = RandomForestRegressor(random_state=20)
rf.fit(x_train, y_train)

y_predict = rf.predict(x_test)
file = pd.DataFrame({
    "model_sdi": y_predict,
    "actual_sdi": y_test.values
})
print(file.head(20))

# print the mse, rSquare 
print(mean_absolute_error(y_test, y_predict)
,mean_squared_error(y_test, y_predict)
,r2_score(y_test, y_predict))

param_grid = { 'n_estimators': [100, 200, 300],
               'max_depth': [ 10, 20, 30],
               'min_samples_split': [2, 5, 10],
               'min_samples_leaf': [1, 2, 4], 
               }

rf_cv = GridSearchCV(estimator=rf, param_grid=param_grid, cv=2, scoring='neg_mean_squared_error', n_jobs=-1)
# THIS IS TAKING TIME BCZ MY MAC IS SLOWWW
rf_cv.fit(x_train, y_train)

y_predict = rf_cv.predict(x_test)
# print(y_predict)
# whatr we can do is add pd.DataFrame and then assign "actual" and "perdicted" sdi with y_test.values and y_predict

print(mean_absolute_error(y_test, y_predict)
,mean_squared_error(y_test, y_predict)
,r2_score(y_test, y_predict))


# now to fine tune we will not see only for random forest we will see for other regression models aswell
# see the hyperparameter from the documentation and then test out atleast 3 models with diff hyperparameters
# this will be done in a for loop and using GridSearchCV

models = {
    "RandomForest": RandomForestRegressor(random_state=20),
    "GradientBoosting": GradientBoostingRegressor(random_state=20),
    "SVR": SVR()
}

param_grids = {
    "RandomForest": {
        'n_estimators': [100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5]
    },
    "GradientBoosting": {
        'n_estimators': [100, 200],
        'learning_rate': [0.05, 0.1],
        'max_depth': [3, 5]
    },
    "SVR": {
        'kernel': ['rbf', 'linear'],
        'C': [1, 10],
        'epsilon': [0.01, 0.1]
    }
}

# now try using RandomSearchCV

for model_name in models:
    model = models[model_name]
    param_dist = param_grids[model_name]
    random_search = RandomizedSearchCV(estimator= model, param_distributions= param_dist, n_iter= 4,
        scoring= 'neg_mean_squared_error', cv= 3, n_jobs= -1, random_state= 20)
    
    random_search.fit(x_train, y_train)
    y_predict = random_search.predict(x_test)
    print(mean_absolute_error(y_test, y_predict)
    ,mean_squared_error(y_test, y_predict)
    ,r2_score(y_test, y_predict))
    

# see the mse, rSquare for all and then decide whihc is the best model for now 



    model_sdi  actual_sdi
0    3.179669    3.177979
1    3.143396    3.143553
2    3.101915    3.101991
3    3.298022    3.297562
4    3.054456    3.060620
5    3.187102    3.187098
6    3.026963    3.027349
7    3.712260    3.713805
8    2.826191    2.826815
9    3.114701    3.129208
10   3.098496    3.103567
11   2.899296    2.899819
12   2.847668    2.843170
13   3.533519    3.534802
14   3.089538    3.087757
15   3.291348    3.292364
16   2.727566    2.741756
17   2.923719    2.973495
18   2.913857    2.910576
19   3.393675    3.431809
0.007400544689464532 0.0004160578743120266 0.9966598006320292
0.007498145459232076 0.00042444257370053144 0.9965924865170291
0.00747406841605287 0.0004198620842813449 0.9966292596411727
0.0063375745016075805 0.00023776444111815238 0.9980911774900018
0.010377850131948547 0.00022998334266550834 0.9981536457708308
