# From Data to Action: Machine Learning Approaches for Predicting Tobacco-Free Policy Implementation in Schools 
## Preprocessing

Loading the cleaned and combined Excel dataset (read here as CSV)

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

IMAGES_PATH = Path() / "plots"
IMAGES_PATH.mkdir(parents=True, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under ``IMAGES_PATH``.

    Args:
        fig_id: File stem; the file is written as ``<fig_id>.<fig_extension>``.
        tight_layout: When true, ``plt.tight_layout()`` is applied before saving.
        fig_extension: File extension, also passed to ``savefig`` as ``format``.
        resolution: DPI passed to ``savefig``.
    """
    if tight_layout:
        plt.tight_layout()
    destination = IMAGES_PATH / f"{fig_id}.{fig_extension}"
    plt.savefig(destination, format=fig_extension, dpi=resolution)
# Load the dataset from a single CSV file into `df`.
# NOTE(review): absolute path — this only resolves on a machine where
# /main/tobaccoFree exists; consider a configurable data directory.
df = pd.read_csv('/main/tobaccoFree/data/encoded_data.csv')


# Preview the first rows.
df.head()

Unnamed: 0,Dist_Akola,Dist_Amravati,Dist_Chandrapur,Dist_Dhule,Dist_Gondia,Dist_Hingoli,Dist_Jalgaon,Dist_Kolhapur,Dist_Latur,Dist_Nagpur,...,parentEduLevl_50:50%,parentEduLevl_Higher Secondary,parentEduLevl_Higher secondary,parentEduLevl_Literate,"parentEduLevl_Primary, secondary",parentEduLevl_illiterate,parentEduLevl_primary,parentEduLevl_secondary,numBlank,tobaccoFree
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,16,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,16,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,16,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,16,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,16,1.0


Exporting the list of feature variable names to CSV

In [None]:
# Read the attribute file with no header row; text after '#' is treated as a
# comment and skipped by the parser.
a = pd.read_csv('/main/tobaccoFree/data/attributes.csv', comment='#', header=None)
a

In [None]:
# Drop every column that contains at least one missing value.
a = a.dropna(axis=1)
a

In [None]:
# Transpose the attribute table and write it to cols.csv without index or header.
# NOTE(review): `a = a.T` is not idempotent — re-running this cell transposes
# the table back and writes a differently shaped file.
a = a.T
a.to_csv('/main/tobaccoFree/data/cols.csv', index=False, sep=',', encoding='utf-8', header=False)
# NOTE(review): as rendered here `a` is not the cell's last expression, so it is not displayed.
a
df.info(verbose=True,show_counts=True)

Replacing the dataset's column names with the exported names

In [None]:
# read_csv uses the first line of cols.csv as the header, so `cols.columns`
# carries the names in that line; they replace df's column labels positionally.
# Assigning a label list of a different length than df's columns raises ValueError.
cols = pd.read_csv('/main/tobaccoFree/data/cols.csv')
df.columns = cols.columns
df.info(verbose=True,show_counts=True)

Adding tobaccoFree Column

In [None]:
# Binary target stored as float: 1.0 where totalCriteria exceeds 10, else 0.0.
# NOTE(review): with columns Q1..Q11 this presumably means "all 11 criteria met" — confirm.
# NOTE(review): tobaccoFree is a deterministic function of totalCriteria, so
# totalCriteria would leak the target if it is kept as a model feature.
df['tobaccoFree'] = (df['totalCriteria'] > 10).astype(float)
df.info(verbose=True,show_counts=True)


Writing base dataset to file

In [None]:
# DataFrame.to_csv returns None when it is given a path, so the result must not
# be assigned back to `df`: the cells below still need the DataFrame.
df.to_csv('/main/tobaccoFree/data/base-dataset.csv', index=False, sep=',', encoding='utf-8')

Dropping columns with too many null values or with untrainable values

In [None]:
# Inspect dtypes and non-null counts to decide which columns to drop.
df.info(verbose=True,show_counts=True)

In [2]:
# Remove the columns judged unusable for training (too sparse or untrainable),
# then list what remains.
df = df.drop(
    columns=[
        'distPerfEdu',
        'prcplIncome',
        'MEnroll1',
        'MEnroll2',
        'MEnroll3',
        'FEnroll1',
        'FEnroll2',
        'FEnroll3',
        'totEnroll1',
        'totEnroll2',
        'totEnroll3',
        'POTOC',
        'numMTeach',
        'numFTeach',
        'teachStudRat',
        'MtoFTeachRat',
        'percHH_Has_Toilet',
        'percPassHSC',
        'prcplSchemeName',
        'schoolName',
        'prcplSchemeOtherName',
        'percCriteria',
    ]
)
df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 611 entries, 0 to 610
Data columns (total 152 columns):
 #    Column                     Non-Null Count  Dtype  
---   ------                     --------------  -----  
 0    Dist                       611 non-null    object 
 1    Q1                         611 non-null    int64  
 2    Q2                         611 non-null    int64  
 3    Q3                         611 non-null    int64  
 4    Q4                         611 non-null    int64  
 5    Q5A                        611 non-null    int64  
 6    Q6                         611 non-null    int64  
 7    Q7                         611 non-null    int64  
 8    Q8                         611 non-null    int64  
 9    Q9                         611 non-null    int64  
 10   Q10                        611 non-null    int64  
 11   Q11                        611 non-null    int64  
 12   totalCriteria              611 non-null    int64  
 13   totalPop2011               587 no

First correlation Heatmap

In [None]:

# Annotated correlation heatmap of the numeric columns, saved as 'Heat Map'.
fig, ax = plt.subplots(figsize=(70, 50))
corr_matrix = df.corr(numeric_only=True)
sns.heatmap(corr_matrix, annot=True, cmap="YlGnBu", ax=ax)
save_fig('Heat Map')


## Feature Engineering

In [None]:
# Review column dtypes and non-null counts before feature engineering.
df.info(verbose=True,show_counts=True)

In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

def label_encode_object_columns(df):
    """
    Label-encodes every object-dtype column of a DataFrame.

    Each object column is replaced by the integer codes of a fresh
    ``LabelEncoder`` fitted on that column alone. The input frame is left
    untouched: a copy is encoded and returned.

    Args:
        df (pd.DataFrame): The DataFrame to encode.

    Returns:
        pd.DataFrame: A copy of ``df`` with its object columns encoded.
    """
    # Work on a copy so the caller's DataFrame is not modified as a side effect.
    encoded = df.copy()
    for col in encoded.select_dtypes(include=['object']).columns:
        # No imputation happens here: missing values reach the encoder as-is.
        # NOTE(review): depending on the scikit-learn version this either raises
        # or gives NaN its own integer code, which would hide it from any later
        # imputation step — confirm which behaviour is intended.
        encoded[col] = LabelEncoder().fit_transform(encoded[col])
    return encoded

# Encode the object columns, rebind df to the returned frame, and re-check dtypes.
df = label_encode_object_columns(df)
df.info(verbose=True,show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 611 entries, 0 to 610
Data columns (total 152 columns):
 #    Column                     Non-Null Count  Dtype  
---   ------                     --------------  -----  
 0    Dist                       611 non-null    int64  
 1    Q1                         611 non-null    int64  
 2    Q2                         611 non-null    int64  
 3    Q3                         611 non-null    int64  
 4    Q4                         611 non-null    int64  
 5    Q5A                        611 non-null    int64  
 6    Q6                         611 non-null    int64  
 7    Q7                         611 non-null    int64  
 8    Q8                         611 non-null    int64  
 9    Q9                         611 non-null    int64  
 10   Q10                        611 non-null    int64  
 11   Q11                        611 non-null    int64  
 12   totalCriteria              611 non-null    int64  
 13   totalPop2011               587 no

In [4]:
# NOTE(review): the saved output of this cell lists `Dist` as object, while the
# label-encoding cell's saved output lists it as int64 — the two outputs come
# from different kernel states (cells run out of order). Re-run top to bottom.
df.info(verbose=True,show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 611 entries, 0 to 610
Data columns (total 152 columns):
 #    Column                     Non-Null Count  Dtype  
---   ------                     --------------  -----  
 0    Dist                       611 non-null    object 
 1    Q1                         611 non-null    int64  
 2    Q2                         611 non-null    int64  
 3    Q3                         611 non-null    int64  
 4    Q4                         611 non-null    int64  
 5    Q5A                        611 non-null    int64  
 6    Q6                         611 non-null    int64  
 7    Q7                         611 non-null    int64  
 8    Q8                         611 non-null    int64  
 9    Q9                         611 non-null    int64  
 10   Q10                        611 non-null    int64  
 11   Q11                        611 non-null    int64  
 12   totalCriteria              611 non-null    int64  
 13   totalPop2011               587 no

In [3]:
# Peek at one raw categorical column; the saved output shows string levels and NaN entries.
df['sports']

0         Block Level
1      District Level
2         State Level
3      District Level
4                 NaN
            ...      
606    District Level
607     Cluster Level
608     Cluster Level
609     Cluster Level
610    District Level
Name: sports, Length: 611, dtype: object

#### Getting Dummy Variables for Object data types

In [None]:
def get_dummies_and_reposition(df, prefix_sep='_'):
    """
    Replaces each object-dtype column with its dummy (indicator) columns, in place of the original.

    The indicator columns take the position the original column occupied, so
    the relative order of all other columns is preserved.

    Args:
      df: The pandas DataFrame.
      prefix_sep: Separator placed between the source column name and the
        category name when naming the indicator columns.

    Returns:
      A DataFrame in which every object-dtype column has been swapped for
      float indicator columns. If there are no object columns, ``df`` itself
      is returned.
    """
    for name in df.select_dtypes(include=['object']).columns.tolist():
        position = df.columns.get_loc(name)
        indicators = pd.get_dummies(df[name], dtype=float, prefix=name, prefix_sep=prefix_sep)
        # Everything left of the source column, the indicators, then everything
        # right of it: the source column itself is skipped.
        before = df.iloc[:, :position]
        after = df.iloc[:, position + 1:]
        df = pd.concat([before, indicators, after], axis=1)
    return df

# Replace every object column of df with its dummy columns (rebinding df),
# then list the resulting columns.
df = get_dummies_and_reposition(df)
df.info(verbose=True,show_counts=True)

Encoded data correlation heatmap

In [None]:
# Correlation heatmap of the encoded data.
plt.figure(figsize=(70,60))
sns.heatmap(df.corr(numeric_only=True), annot=True, annot_kws={"size": 6}, cmap="YlGnBu")
# Saved under its own name: reusing 'Heat Map' would overwrite the file written
# by the first correlation heatmap cell.
save_fig('Heat Map_encoded')

Writing encoded dataset to file

In [None]:
# DataFrame.to_csv returns None when writing to a path; binding that result to
# `df_numeric` only created a misleading None placeholder, so just write the file.
df.to_csv('/main/tobaccoFree/data/l_encoded_data.csv', index=False, sep=',', encoding='utf-8')

Sweetviz library plots

In [None]:
# Build a report for the current df and write it to analyze.html without opening a browser.
# NOTE(review): `sv` is not imported anywhere in the visible notebook —
# presumably `import sweetviz as sv`; confirm and add it to the imports cell,
# otherwise this cell raises NameError on a fresh kernel.
report = sv.analyze(df)
report.show_html('analyze.html', open_browser = False)

In [2]:
# Check the column layout before imputation; the saved output shows 247 float dummy-encoded columns.
df.info(verbose=True,show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 611 entries, 0 to 610
Data columns (total 247 columns):
 #    Column                              Non-Null Count  Dtype  
---   ------                              --------------  -----  
 0    Dist_Akola                          611 non-null    float64
 1    Dist_Amravati                       611 non-null    float64
 2    Dist_Chandrapur                     611 non-null    float64
 3    Dist_Dhule                          611 non-null    float64
 4    Dist_Gondia                         611 non-null    float64
 5    Dist_Hingoli                        611 non-null    float64
 6    Dist_Jalgaon                        611 non-null    float64
 7    Dist_Kolhapur                       611 non-null    float64
 8    Dist_Latur                          611 non-null    float64
 9    Dist_Nagpur                         611 non-null    float64
 10   Dist_Nanded                         611 non-null    float64
 11   Dist_Nandurbar                

#### KNN Imputing Null Values

In [2]:
from sklearn.impute import KNNImputer

# Fill missing numeric values from the 3 nearest rows.
# NOTE(review): the imputer sees every numeric column unscaled, including the
# target if it is present in df — confirm that is intended.
iknn = KNNImputer(n_neighbors=3)

# Select the numeric columns once, and carry the row index over so the imputed
# frame stays aligned with df (a bare pd.DataFrame(array) would reset it).
numeric_part = df.select_dtypes('number')
df_numeric = pd.DataFrame(
    iknn.fit_transform(numeric_part),
    columns=numeric_part.columns,
    index=numeric_part.index,
)

df_numeric.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 611 entries, 0 to 610
Data columns (total 247 columns):
 #    Column                              Non-Null Count  Dtype  
---   ------                              --------------  -----  
 0    Dist_Akola                          611 non-null    float64
 1    Dist_Amravati                       611 non-null    float64
 2    Dist_Chandrapur                     611 non-null    float64
 3    Dist_Dhule                          611 non-null    float64
 4    Dist_Gondia                         611 non-null    float64
 5    Dist_Hingoli                        611 non-null    float64
 6    Dist_Jalgaon                        611 non-null    float64
 7    Dist_Kolhapur                       611 non-null    float64
 8    Dist_Latur                          611 non-null    float64
 9    Dist_Nagpur                         611 non-null    float64
 10   Dist_Nanded                         611 non-null    float64
 11   Dist_Nandurbar                

In [2]:
import pandas as pd
from sklearn.impute import SimpleImputer

def fill_na_with_mean(df):
    """
    Fills NaN values in each column of a DataFrame with the column's mean.

    The imputation is done by ``SimpleImputer(strategy='mean')`` over the whole
    frame, so every column must be numeric. The result is rebuilt from the
    imputer's array output; the saved notebook output shows all columns as
    float64 afterwards.

    Args:
        df (pd.DataFrame): The DataFrame containing NaN values.

    Returns:
        pd.DataFrame: A new DataFrame with NaN values filled, carrying the same
        column labels and row index as ``df``.
    """
    imputer = SimpleImputer(strategy='mean')
    filled_values = imputer.fit_transform(df)
    # Pass the index through as well: without it the result gets a fresh
    # RangeIndex and no longer lines up with `df` when its index is not default.
    return pd.DataFrame(filled_values, columns=df.columns, index=df.index)

# Mean-impute df and rebind it to the result, then verify the non-null counts.
# NOTE(review): the saved output below has 152 columns, whereas the KNN cell's
# output has 247 — the two imputation cells were run on different frames.
df = fill_na_with_mean(df)
df.info(verbose=True,show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 611 entries, 0 to 610
Data columns (total 152 columns):
 #    Column                     Non-Null Count  Dtype  
---   ------                     --------------  -----  
 0    Dist                       611 non-null    float64
 1    Q1                         611 non-null    float64
 2    Q2                         611 non-null    float64
 3    Q3                         611 non-null    float64
 4    Q4                         611 non-null    float64
 5    Q5A                        611 non-null    float64
 6    Q6                         611 non-null    float64
 7    Q7                         611 non-null    float64
 8    Q8                         611 non-null    float64
 9    Q9                         611 non-null    float64
 10   Q10                        611 non-null    float64
 11   Q11                        611 non-null    float64
 12   totalCriteria              611 non-null    float64
 13   totalPop2011               611 no

Imputed sweetviz plots

In [4]:
# Remove percCriteria from the imputed frame. errors='ignore' keeps the cell
# re-runnable: without it a second run (or a frame from which the column was
# already dropped earlier) raises KeyError.
df_numeric = df_numeric.drop(columns=['percCriteria'], errors='ignore')

In [None]:
# Build a report for the current df and write it to analyze1.html without opening a browser.
# NOTE(review): this analyses `df`, not `df_numeric` — confirm which imputed frame is meant.
report_analyze = sv.analyze(df)
report_analyze.show_html('analyze1.html', open_browser = False)

Sweetviz target variable plots

In [None]:
# Report on df with 'tobaccoFree' as the target feature; written to analyze2.html.
# NOTE(review): the frame is labelled 'Original' although this section follows
# imputation — confirm the label.
report = sv.analyze([df, 'Original'], target_feat='tobaccoFree')

report.show_html('analyze2.html', open_browser = False)

Imputed data heatmap with absolute value correlation 

In [None]:
# Heatmap of absolute correlations, so strong negative and strong positive
# relationships share the same end of the colour scale.
fig, ax = plt.subplots(figsize=(80, 60))
abs_corr = df.corr(numeric_only=True).abs()
sns.heatmap(abs_corr, annot=True, annot_kws={"size": 6}, cmap="YlGnBu", ax=ax)
save_fig('Heat Map_imputed')

Frequency histogram for imputed data

In [None]:
# Draw a grid of per-column histograms for df.
df.hist(figsize = (30,20))
# NOTE(review): the file stem says "bar_plot" but the figure is a histogram grid.
save_fig("tfs_bar_plot")  
plt.show()

Writing imputed dataset to file

In [4]:
# DataFrame.to_csv returns None when writing to a path; assigning it back would
# replace the imputed frame with None, so write the file without rebinding.
df_numeric.to_csv('/main/tobaccoFree/data/imputed_data.csv', index=False, sep=',', encoding='utf-8')