In [1]:
import os
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import itertools
from scipy import stats

from statsmodels.stats.outliers_influence import variance_inflation_factor
from IPython.display import display

In [2]:
sns.set()

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [140]:
INPUT_FILE_NAME = "final_data.csv"

In [141]:
data = pd.read_csv(
    os.path.join(os.path.dirname(os.path.realpath("__file__")), "data", INPUT_FILE_NAME),
)

In [142]:
#'SE', 
#'nutrition_score', 
#'parental_connection_score', 
#'decision_making_score', 
#'sp_harassment_score', 
#'mhm_score', 
#'pi_sa_score', 
#'body_image_score', 
#'srh_score'

dependent_variable = ["SE"]

In [143]:
# 'Age_imp', 'Marital Status', 'Edu Status'

independent_variable = [
#     'Age_imp',
#     'SE',
#     'Marital Status',
    'Edu Status',
#     'sp_harassment_score',
#     'mhm_score',
#     'pi_sa_score',
#     'body_image_score',
#     'srh_score'
]

In [144]:
data_model = data[independent_variable + dependent_variable]

In [145]:
dtypes = data_model[independent_variable].dtypes
cat_variables = dtypes[dtypes=='object'].index.tolist()
int_variables = dtypes[(dtypes!='object') & (dtypes!='bool')].index.tolist()
bool_variables = dtypes[dtypes=='bool'].index.tolist()

In [146]:
data_model[bool_variables] = data_model[bool_variables].astype(np.int64)

In [147]:
if len(cat_variables)!=0:
    data_model_dummy = pd.get_dummies(
        data=data_model[cat_variables],
        ###################################### IDHAR DEKHO #######################################
        drop_first=True,
        #########################################################################################
    )
    data_model = pd.concat([
        data_model.drop(columns=cat_variables),
        data_model_dummy,
        ],
        axis=1,
    )

In [148]:
model_columns = np.setdiff1d(
    np.setdiff1d(
        data_model.columns,
        cat_variables,
    ),
    dependent_variable,
).tolist()

In [149]:
data_model = sm.add_constant(data_model, prepend=True)

In [150]:
data_model[int_variables + dependent_variable] = (data_model[int_variables + dependent_variable] - data_model[int_variables + dependent_variable].min()) / (data_model[int_variables + dependent_variable].max() - data_model[int_variables + dependent_variable].min())

In [151]:
mod = sm.OLS(
    data_model[dependent_variable], 
    data_model[['const'] + model_columns],
)

res = mod.fit()

In [152]:
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                     SE   R-squared:                       0.013
Model:                            OLS   Adj. R-squared:                  0.011
Method:                 Least Squares   F-statistic:                     6.531
Date:                Mon, 04 Jul 2022   Prob (F-statistic):             0.0109
Time:                        16:03:38   Log-Likelihood:                 162.78
No. Observations:                 510   AIC:                            -321.6
Df Residuals:                     508   BIC:                            -313.1
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                   0.5588    

In [139]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [123]:
X_train, X_test, y_train, y_test = train_test_split(
    data_model[model_columns],
    data_model[dependent_variable], 
    test_size=0.2, 
    random_state=42,
)

In [124]:
regressor = DecisionTreeRegressor(
#     criterion="squared_error",
    max_depth=2,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=0,
) 
regressor.fit(X_train, y_train)

y_pred = regressor.predict(X_test)

r2_score(y_test, y_pred)

0.009899758218384958

In [125]:
regressor = RandomForestRegressor(
    n_estimators=51,
    max_depth=2,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=0,
)

regressor.fit(X_train, y_train)

y_pred = regressor.predict(X_test)

r2_score(y_test, y_pred)

  regressor.fit(X_train, y_train)


0.010561585110798521