# From Data to Action: Machine Learning Approaches for Predicting Tobacco-Free Policy Implementation in Schools 
## Random Forest Classifier

Loading imputed dataset

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm
import scipy.stats as st

# Every figure produced by this notebook is written below ./plots
# (the folder is created on first run if it is missing).
IMAGES_PATH = Path("plots")
IMAGES_PATH.mkdir(parents=True, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Write the current matplotlib figure to ``IMAGES_PATH``.

    Parameters
    ----------
    fig_id : str
        File name without extension.
    tight_layout : bool
        When true, ``plt.tight_layout()`` is applied before saving.
    fig_extension : str
        Used both as the file suffix and as the ``format`` passed to ``savefig``.
    resolution : int
        Passed to ``savefig`` as ``dpi``.
    """
    if tight_layout:
        plt.tight_layout()
    target = IMAGES_PATH / f"{fig_id}.{fig_extension}"
    plt.savefig(target, format=fig_extension, dpi=resolution)
# Load the imputed dataset into a single DataFrame.
# NOTE(review): absolute path — the notebook only runs where this location exists.
df = pd.read_csv(filepath_or_buffer='/main/tobaccoFree/data/imputed_data.csv')

# Preview the first rows as a quick sanity check of the load.
df.head()

## Preparing for the model

Separating the feature matrix `X` from the target variable `y` (`tobaccoFree`)

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score

# Seed NumPy's global generator so later unseeded random operations repeat on a fresh run.
np.random.seed(42)

# Target: the 'tobaccoFree' column. Features: every remaining column.
y = df['tobaccoFree']
X = df.drop(columns=['tobaccoFree'])



Original frequency histogram

In [None]:
# One histogram per feature, drawn before any scaling is applied, then saved via save_fig.
X.hist(figsize=(30, 20))
save_fig(fig_id="unscaled_bar_plot")
plt.show()

Scaling features data

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature column and rebuild a DataFrame with the original column names.
# NOTE(review): the scaler is fitted on all rows before the train/test split, so the
# test rows contribute to the fitted mean/std. To avoid that, fit on the training
# split only and apply transform() to the test split.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X)
X = pd.DataFrame(scaled_values, columns=X.columns)

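*Scaling with standardization sets each feature's mean to 0 and its standard deviation to 1, which allows faster convergence for gradient-based models (tree-based models such as random forests are largely insensitive to feature scaling).*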

Train/test split (after a quick look at the scaled features)

In [None]:
# Display the standardized feature matrix as a quick visual check.
X

In [None]:

# Hold out 20% of the rows for testing.
# random_state pins the shuffle so that re-running this cell on its own always
# produces the same partition. Without it the split is drawn from NumPy's global
# generator and changes on every re-execution of the cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:
# Sanity-check the split: row counts should be roughly 80% / 20%, with the same number of columns.
X_train.shape, X_test.shape

In [None]:
# Inspect the column dtypes of the training features
# (presumably all floating point after standardization — verify in the output).
X_train.dtypes

In [None]:
# Preview the first rows of the training features.
X_train.head()

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline forest: the number of trees is left at the library default, and only the
# seed is fixed so the fitted model is reproducible.
rfc = RandomForestClassifier(random_state=0).fit(X_train, y_train)

# Class predictions on the held-out test split, reused by later cells.
y_pred = rfc.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

# Report test accuracy of the baseline forest. The tree count is read from the fitted
# estimator rather than hard-coded, because `rfc` was built with the library default
# for n_estimators and a fixed "10" in the message can be wrong.
print('Model accuracy score with {0} decision-trees : {1:0.4f}'.format(
    rfc.n_estimators, accuracy_score(y_test, y_pred)))

In [None]:
# Cross-validate the baseline forest on the training split only, so the held-out
# test set stays untouched.
# `cv` is the number of folds, not a random seed. 42 folds leaves very small
# validation folds and noisy per-fold scores, so a conventional 10 folds is used.
N_FOLDS = 10
pred_kfold = cross_val_score(rfc, X_train, y_train, cv=N_FOLDS)

# Mean accuracy with a +/- 2 standard deviation band across folds.
# The model is a random forest, so the label says so.
print("Accuracy with Random Forest and %d-fold cross-validation: %0.2f (+/- %0.2f)"
      % (N_FOLDS, pred_kfold.mean(), pred_kfold.std() * 2))

In [None]:
# Forest with the tree count pinned explicitly to 100.
# NOTE(review): if the installed scikit-learn already defaults n_estimators to 100
# (the case since 0.22), this model is configured identically to `rfc` and the
# comparison is a no-op — confirm the intended baseline size.
rfc_100 = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)
y_pred_100 = rfc_100.predict(X_test)

print(
    'Model accuracy score with 100 decision-trees : {0:0.4f}'.format(
        accuracy_score(y_test, y_pred_100)
    )
)

In [None]:
# Fit the 100-tree forest whose feature importances are inspected in the following cells.
clf = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)

# Leave the fitted estimator as the cell's last expression so its repr is displayed.
clf

In [None]:
# Pair each training column with its importance from the fitted forest `clf`,
# ordered from most to least important.
feature_scores = (
    pd.Series(clf.feature_importances_, index=X_train.columns)
    .sort_values(ascending=False)
)

feature_scores

In [None]:
from sklearn.metrics import accuracy_score

# Test accuracy of the predictions in `y_pred`, which were produced by `rfc`.
# The tree count is taken from that estimator instead of a hard-coded "10", which
# does not match a forest built with the library-default n_estimators.
print('Model accuracy score with {0} decision-trees : {1:0.4f}'.format(
    rfc.n_estimators, accuracy_score(y_test, y_pred)))

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the baseline predictions on the test split.
report_text = classification_report(y_test, y_pred)
print(report_text)

In [None]:
# Horizontal bar chart of the most important features of the fitted forest `clf`,
# annotated with the test accuracy of that same model.
TOP_N = 30  # maximum number of features shown
top_features = feature_scores.head(TOP_N)

fig, ax = plt.subplots(figsize=(20, 30))
sns.barplot(ax=ax, x=top_features, y=top_features.index)

# Recolor the bars with a viridis gradient. The colormap is sized to the number of
# bars actually drawn, so the full gradient is used even with fewer than TOP_N features.
cmap = plt.get_cmap('viridis', len(top_features))
for i, bar in enumerate(ax.patches):
    bar.set_facecolor(cmap(i))

ax.set_xlabel('Feature Importance Score', fontsize=54)
ax.set_ylabel('Features', fontsize=54)
ax.set_title("Visualizing Important Features With \nRandom Forest Classifiers", fontsize=72)

# Report the accuracy of the model whose importances are plotted (`clf`), so the
# annotation and the bars always describe the same estimator.
forest_accuracy = accuracy_score(y_test, clf.predict(X_test))
# The text position is in data coordinates: x is an importance value, y is a bar index.
ax.text(
    .05, 10.1,
    'The accuracy of the\nRandom Forest model: {0:0.3f}'.format(forest_accuracy),
    fontsize=30,
    bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5),
)

ax.tick_params(axis='both', labelsize=24)  # Adjust font size as needed
save_fig('RFC')
plt.show()