<a href="https://colab.research.google.com/github/nilanahar/Binary_Classification_Using_Decision_Tree_Model/blob/main/869_ML_AI_A1_(3_2)_SimpleImputation_OrdinalEncoding_CV_RandomForest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import packages
import datetime
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, make_scorer
import os

#### DATA PREP ####

# The training data is fetched straight from a Google Drive direct-download
# link; pandas reads the CSV over HTTP, so this cell needs network access.
url = "https://drive.google.com/uc?export=download&id=1eYCKuqJda4bpzXBVnqXylg0qQwvpUuum"
df = pd.read_csv(filepath_or_buffer=url)

In [None]:
#### EXPLORATORY DATA ANALYSIS ####

target_column = 'h1n1_vaccine'


def _summarize_dtype_group(frame, kind):
    """Return one summary row per column of ``frame``, labelled with ``kind``.

    Parameters
    ----------
    frame : pd.DataFrame
        Columns that all belong to the same dtype group.
    kind : str
        Label written to the ``type`` column (e.g. ``'numerical'``).

    Returns
    -------
    pd.DataFrame
        Columns ``features``, ``count`` (non-null values), ``unique``
        (distinct non-null values), ``nulls``, ``has_nulls`` and ``type``.
    """
    group_summary = pd.DataFrame({
        'features': list(frame.columns),
        'count': frame.count().to_numpy(),
        'unique': frame.nunique().to_numpy(),
        'nulls': frame.isnull().sum().to_numpy(),
    })
    group_summary['has_nulls'] = group_summary['nulls'] > 0
    group_summary['type'] = kind
    return group_summary


# Function to generate summary of the dataset
def generate_summary(df, target=None):
    """Summarise every numeric and object-dtype column of ``df``.

    Numeric columns are labelled ``'numerical'``, object columns
    ``'categorical'``, and the row whose feature name equals the target is
    relabelled ``'target'``. Columns of any other dtype are not reported.

    The statistics are computed directly from ``select_dtypes`` rather than
    from ``DataFrame.describe``: ``describe(include='object')`` raises when
    the frame has no object columns, and plain ``describe()`` silently
    describes the object columns when there are no numeric ones, which would
    mislabel them as numerical. Counts are therefore plain integers.

    Parameters
    ----------
    df : pd.DataFrame
        Dataset to summarise.
    target : str, optional
        Name of the target column. Defaults to the notebook-level
        ``target_column``.

    Returns
    -------
    pd.DataFrame
        One row per reported column with the columns ``features``, ``count``,
        ``unique``, ``nulls``, ``has_nulls`` and ``type``; numeric columns
        come first, followed by object columns.
    """
    if target is None:
        target = target_column

    dtype_groups = (
        ('numerical', df.select_dtypes(include=[np.number])),
        ('categorical', df.select_dtypes(include=['object'])),
    )
    # Skip dtype groups with no columns so a frame that is all-numeric or
    # all-categorical is summarised instead of raising.
    parts = [
        _summarize_dtype_group(frame, kind)
        for kind, frame in dtype_groups
        if frame.shape[1] > 0
    ]

    if parts:
        combined_summary = pd.concat(parts, ignore_index=True)
    else:
        combined_summary = pd.DataFrame(
            columns=['features', 'count', 'unique', 'nulls', 'has_nulls', 'type']
        )

    combined_summary.loc[combined_summary['features'] == target, 'type'] = 'target'
    return combined_summary

summary = generate_summary(df)

# Build the feature lists consumed by the preprocessing step from the
# summary's 'type' label. The target is excluded by comparing against
# `target_column` instead of repeating its name as a string literal, so the
# target only has to be changed in one place.
is_target = summary['features'] == target_column
categorical_features = summary.loc[(summary['type'] == 'categorical') & ~is_target, 'features'].tolist()
numerical_features = summary.loc[(summary['type'] == 'numerical') & ~is_target, 'features'].tolist()
summary

Unnamed: 0,features,count,unique,nulls,has_nulls,type
0,h1n1_concern,21292,4,73,True,numerical
1,h1n1_knowledge,21274,3,91,True,numerical
2,behavioral_antiviral_meds,21306,2,59,True,numerical
3,behavioral_avoidance,21202,2,163,True,numerical
4,behavioral_face_mask,21351,2,14,True,numerical
5,behavioral_wash_hands,21329,2,36,True,numerical
6,behavioral_large_gatherings,21293,2,72,True,numerical
7,behavioral_outside_home,21306,2,59,True,numerical
8,behavioral_touch_face,21263,2,102,True,numerical
9,doctor_recc_h1n1,19629,2,1736,True,numerical


In [None]:
#### SPLIT INTO FEATURES AND TARGET VARIABLE ####

# Separate the label column from the predictors.
y = df[target_column]
X = df.drop(columns=target_column)

# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=20170406,
)

In [None]:
## DATA CLEANING & PREPROCESSING ##

# Numeric columns: fill missing values with the column mean learned from the
# training split.
numerical_pipeline = Pipeline([('imputer', SimpleImputer(strategy='mean'))])

# Categorical columns: fill missing values with the most frequent category,
# then map each category to an integer code. Categories that were not seen
# while fitting are encoded as -1 instead of raising, so transforming the test
# or competition data cannot fail on an unexpected value.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features),
])

# The ColumnTransformer emits the numeric block first, then the categorical one.
feature_columns = numerical_features + categorical_features

# Apply imputation and encoding. The preprocessor is fitted on the training
# split only and reused for the test split. The original row index is kept so
# the transformed frames stay label-aligned with y_train / y_test.
# NOTE: X_train / X_test are rebound to their transformed versions here, so
# re-running this cell without re-running the split cell transforms them twice.
X_train = pd.DataFrame(preprocessor.fit_transform(X_train), columns=feature_columns, index=X_train.index)
X_test = pd.DataFrame(preprocessor.transform(X_test), columns=feature_columns, index=X_test.index)

In [None]:
#### MODEL BUILDING AND EVALUATION ####

# Random Forest estimator: seeded for reproducibility, trained on all cores.
rf_model = RandomForestClassifier(n_jobs=-1, random_state=42)

# Two-step pipeline: standardise the features, then classify.
# (Scaling is optional here; whether it helps depends on the dataset.)
rf_pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('classifier', rf_model),
    ]
)

# Cross-validation metric: macro-averaged F1.
scorer = make_scorer(f1_score, average='macro')

# 5-fold cross-validation on the training split, folds evaluated in parallel.
cv_scores = cross_val_score(
    estimator=rf_pipeline,
    X=X_train,
    y=y_train,
    scoring=scorer,
    cv=5,
    n_jobs=-1,
)

# Report the per-fold scores and their average.
print(f"Cross-validation F1 scores: {cv_scores}")
print(f"Mean cross-validation F1 score: {np.mean(cv_scores)}")

Cross-validation F1 scores: [0.74702915 0.72820278 0.72881321 0.72257622 0.73683849]
Mean cross-validation F1 score: 0.7326919679412938


In [None]:
# Train the pipeline on the complete training split
rf_pipeline.fit(X_train, y_train)

# Predict the held-out test split and score it with macro-averaged F1
y_pred_test = rf_pipeline.predict(X_test)
test_f1_score = f1_score(y_true=y_test, y_pred=y_pred_test, average='macro')

# Report the test (leaderboard) score: label first, value on the next line
print("The test F1 score with the Random Forest model is:", test_f1_score, sep="\n")


The test F1 score with the Random Forest model is:
0.7398295431805806


In [None]:
#### PREDICTIONS ON COMPETITION DATA ####

# Load the competition dataset
X_comp = pd.read_csv("https://drive.google.com/uc?export=download&id=1SmFBoNh7segI1Ky92mfeIe6TpscclMwQ")

# Perform the same cleaning/transformation steps on the competition data,
# reusing the preprocessor that was already fitted on the training split
X_comp = pd.DataFrame(preprocessor.transform(X_comp), columns=numerical_features + categorical_features)

# Use the trained pipeline to make predictions
pred_comp = rf_pipeline.predict(X_comp)

my_submission = pd.DataFrame({'predicted': pred_comp})

# Let's take a peek at the results (as a sanity check)
print(my_submission.head(10))

# Output location: taken from the SUBMISSION_DIR environment variable when it
# is set, otherwise the current user's Downloads folder. Expanding '~' avoids
# hard-coding one machine's absolute home path, so the notebook also runs on
# Colab or another computer.
directory = os.environ.get('SUBMISSION_DIR', os.path.join(os.path.expanduser('~'), 'Downloads'))
filename = 'my_submission_(SN_3).csv'

# Create the directory if needed; exist_ok makes this safe to re-run
os.makedirs(directory, exist_ok=True)

# Save the submission to CSV
my_submission.to_csv(os.path.join(directory, filename), index=False)

   predicted
0          0
1          0
2          0
3          0
4          0
5          0
6          0
7          0
8          0
9          1
