# From Data to Action: Machine Learning Approaches for Predicting Tobacco-Free Policy Implementation in Schools 
## Preprocessing

Load the cleaned and combined Excel dataset (read here from a CSV file)

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

# Folder (relative to the working directory) that receives every saved figure;
# it is created up front so later savefig calls never hit a missing directory.
IMAGES_PATH = Path("plots")
IMAGES_PATH.mkdir(exist_ok=True, parents=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure inside ``IMAGES_PATH``.

    Args:
        fig_id: Base file name (without extension) of the saved figure.
        tight_layout: If true, ``plt.tight_layout()`` is applied before saving.
        fig_extension: File extension; also handed to ``savefig`` as the format.
        resolution: DPI handed to ``savefig``.
    """
    destination = IMAGES_PATH.joinpath("{}.{}".format(fig_id, fig_extension))
    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, format=fig_extension, dpi=resolution)
# Load the base dataset that the rest of the notebook works on
base_dataset_csv = '/main/tobaccoFree/data/base-dataset.csv'
df = pd.read_csv(base_dataset_csv)

# Preview the first rows
df.head()

Exporting the list of feature variable names to CSV

In [None]:
# Read the attribute-name table: no header row is expected, and anything after
# a '#' on a line is treated as a comment and ignored.
attributes_csv = '/main/tobaccoFree/data/attributes.csv'
a = pd.read_csv(attributes_csv, header=None, comment='#')
a

In [None]:
# Discard every column of the attribute table that contains a missing value.
a = a.dropna(axis='columns')
a

In [None]:
# Transpose the attribute table (presumably one name per row -> one row of
# names) and persist it without index labels or a header line.
a = a.transpose()
a.to_csv('/main/tobaccoFree/data/cols.csv', header=False, index=False, sep=',', encoding='utf-8')
a
# Column overview of the dataset before its columns are renamed.
df.info(show_counts=True, verbose=True)

Replacing with Current Dataset Column Names

In [None]:
# cols.csv was written without a header line, so read_csv takes its first
# line as the header; the new column names are therefore in cols.columns.
cols_csv = '/main/tobaccoFree/data/cols.csv'
cols = pd.read_csv(cols_csv)
df.columns = cols.columns
df.info(show_counts=True, verbose=True)

Adding tobaccoFree Column

In [None]:
# Binary target: 1.0 when totalCriteria is greater than 10, otherwise 0.0.
# NOTE(review): a missing totalCriteria compares as False and so is labelled 0.0 — confirm this is intended.
meets_threshold = df['totalCriteria'].gt(10)
df['tobaccoFree'] = meets_threshold.astype(float)
df.info(show_counts=True, verbose=True)


Writing base dataset to file

In [None]:
# DataFrame.to_csv returns None when given a path, so its result must not be
# assigned back to `df`; the following cells still need the DataFrame.
# NOTE(review): this is the same file the notebook loads at the top, so the
# input is overwritten with the renamed/augmented data — consider a separate
# output path to keep the notebook re-runnable.
df.to_csv('/main/tobaccoFree/data/base-dataset.csv', index=False, sep=',', encoding='utf-8')

Dropping columns with too many null values or with untrainable values

In [None]:
# Column dtypes and non-null counts, used to decide which columns to drop.
df.info(show_counts=True, verbose=True)

In [None]:
# Columns removed from the working frame (see the section heading for the rationale).
columns_to_drop = [
    'distPerfEdu', 'prcplIncome',
    'MEnroll1', 'MEnroll2', 'MEnroll3',
    'FEnroll1', 'FEnroll2', 'FEnroll3',
    'totEnroll1', 'totEnroll2', 'totEnroll3',
    'POTOC', 'numMTeach', 'numFTeach',
    'teachStudRat', 'MtoFTeachRat',
    'percHH_Has_Toilet', 'percPassHSC',
    'prcplSchemeName', 'schoolName', 'prcplSchemeOtherName',
    'percCriteria',
]
df = df.drop(columns=columns_to_drop)
df.info(show_counts=True, verbose=True)

First correlation Heatmap

In [None]:

# Pairwise correlations of the numeric columns, drawn as an annotated heatmap.
plt.figure(figsize=(70, 50))
numeric_corr = df.corr(numeric_only=True)
sns.heatmap(numeric_corr, cmap="YlGnBu", annot=True)
save_fig('Heat Map')


## Feature Engineering

In [None]:
# Column dtypes and non-null counts at the start of feature engineering.
df.info(show_counts=True, verbose=True)

#### Getting Dummy Variables for Object data types

In [None]:
def get_dummies_and_reposition(df, prefix_sep='_'):
    """
    One-hot encode every object-dtype column in place of the original column.

    Each object column is expanded with ``pd.get_dummies`` (float indicators),
    the indicator columns are spliced in at the position of the source column,
    and the source column itself is removed.

    Args:
        df: The pandas DataFrame to encode.
        prefix_sep: Separator placed between the source column name and the
            category value in each indicator column name.

    Returns:
        A DataFrame in which every object column has been replaced by its
        indicator columns; all other columns keep their relative order.
    """
    categorical_names = list(df.select_dtypes(include=['object']).columns)
    for name in categorical_names:
        # Split point sits just after the column being encoded.
        split_at = df.columns.get_loc(name) + 1
        indicators = pd.get_dummies(df[name], prefix=name, prefix_sep=prefix_sep, dtype=float)
        pieces = [df.iloc[:, :split_at], indicators, df.iloc[:, split_at:]]
        # Stitch the pieces back together, then remove the source column.
        df = pd.concat(pieces, axis=1).drop(columns=name)
    return df

# Replace every object column of the working frame with its indicator columns
df = get_dummies_and_reposition(df, prefix_sep='_')
df.info(show_counts=True, verbose=True)

Encoded data correlation heatmap

In [None]:
# Correlation heatmap of the one-hot encoded data.
plt.figure(figsize=(70,60))
sns.heatmap(df.corr(numeric_only=True), annot=True, annot_kws={"size": 6}, cmap="YlGnBu")
# Saved under its own name: reusing 'Heat Map' would overwrite the heatmap
# written before encoding.
save_fig('Heat Map_encoded')

Writing encoded dataset to file

In [None]:
# to_csv returns None when given a path, so nothing is assigned here
# (binding the result to `df_numeric` would only store None).
df.to_csv('/main/tobaccoFree/data/encoded_data.csv', index=False, sep=',', encoding='utf-8')

Sweetviz library plots

In [None]:
# `sv` is not imported by any earlier cell, so a fresh "Restart & Run All"
# would raise NameError here; import Sweetviz under that alias first.
import sweetviz as sv

# Profile the encoded dataset and write the HTML report without opening a browser.
report = sv.analyze(df)
report.show_html('analyze.html', open_browser = False)

#### KNN Imputing Null Values

In [None]:
from sklearn.impute import  KNNImputer

# Missing numeric values are filled in from the 3 nearest neighbours.
iknn = KNNImputer(n_neighbors = 3)

# Select the numeric columns once and reuse the selection for both the data
# and the column labels of the imputed frame.
# NOTE(review): if a numeric column were entirely NaN the imputer may drop it,
# and the labels below would no longer line up — confirm this cannot happen.
numeric_part = df.select_dtypes('number')
imputed_values = iknn.fit_transform(numeric_part)
df_numeric = pd.DataFrame(imputed_values, columns=numeric_part.columns)

df_numeric.info(show_counts=True, verbose=True)

Imputed sweetviz plots

In [None]:
# Profile the KNN-imputed frame (`df_numeric`); analysing `df` here would only
# repeat the pre-imputation report.
report_analyze = sv.analyze(df_numeric)
report_analyze.show_html('analyze1.html', open_browser = False)

Sweetviz target variable plots

In [None]:
# Sweetviz report of the frame labelled 'Original', with every feature
# related to the tobaccoFree target.
labelled_source = [df, 'Original']
report = sv.analyze(labelled_source, target_feat='tobaccoFree')
report.show_html('analyze2.html', open_browser = False)

Heatmap of absolute correlation values for the imputed data

In [None]:
# Absolute correlations of the KNN-imputed frame (`df_numeric`), so the plot
# matches its 'Heat Map_imputed' file name rather than the un-imputed `df`.
plt.figure(figsize=(80,60))
abs_corr = df_numeric.corr(numeric_only=True).abs()
sns.heatmap(abs_corr, annot=True, annot_kws={"size": 6}, cmap="YlGnBu")
save_fig('Heat Map_imputed')

Frequency histogram for imputed data

In [None]:
# Per-column histograms of the KNN-imputed frame (`df_numeric`); plotting `df`
# would show the data before imputation.
df_numeric.hist(figsize = (30,20))
save_fig("tfs_bar_plot")
plt.show()

Writing imputed dataset to file

In [None]:
# to_csv returns None when given a path; do not rebind `df_numeric`, so the
# imputed frame stays available after it is written.
df_numeric.to_csv('/main/tobaccoFree/data/imputed_data.csv', index=False, sep=',', encoding='utf-8')