<a href="https://colab.research.google.com/github/makhmudov-khondamir/Machine-Learning-Projects/blob/main/16.%20Tabular%20vs%20Scikit-Learn%3A%20Prediction%20project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Comparative Analysis of Fastai (Tabular) and Scikit-Learn for Salary Prediction: From Data Preprocessing to Model Evaluation**

The task is to predict whether a person's salary is at least 50,000 or below 50,000 from a set of demographic and employment features. We solve it with two different machine learning frameworks, Fastai (Tabular) and Scikit-Learn, and compare the results.

**Note!**
For Fastai, a GPU environment is required.
For Scikit-Learn, a CPU environment is enough.

## **Scikit-Learn**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
import pandas as pd
# import fastai
# import torch
# from fastai.tabular.all import *  ! Write these codes, if you are using a GPU environment


# Load and preprocess data
# `untar_data` and `URLs` are fastai helpers. The fastai imports at the top of
# this cell are commented out, so import exactly what is needed here; otherwise
# the next line fails with a NameError.
from fastai.data.external import URLs, untar_data

path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')

# Replace salary labels with a binary integer target (0: '<50k', 1: '>=50k').
# The result is assigned back to the column: calling
# `df['salary'].replace(..., inplace=True)` operates on a column selection,
# which is deprecated in pandas 2.x and leaves `df` untouched under
# Copy-on-Write. `map` also yields a numeric dtype rather than an object
# column holding ints.
df['salary'] = df['salary'].map({'<50k': 0, '>=50k': 1})

# List of columns to apply the strip function
cols_to_strip = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']

# Remove surrounding whitespace from every text column. The vectorised
# `.str.strip()` passes missing values through unchanged, whereas
# `map(lambda x: x.strip())` raises AttributeError on a NaN cell.
for col in cols_to_strip:
    df[col] = df[col].str.strip()

# Define categorical and numerical columns
# (the categorical features are exactly the text columns cleaned above)
categorical = list(cols_to_strip)
numerical = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

# Features and target
x = df.drop('salary', axis=1)
y = df['salary']

# Split data: 80% train / 20% test, seeded so the split is reproducible
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Define pipelines
# Categorical features: dense one-hot encoding; categories unseen during fit
# are encoded as all-zero rows instead of raising.
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, so try the current name first and fall back to the old one
# for older installations.
try:
    onehot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    onehot = OneHotEncoder(handle_unknown='ignore', sparse=False)

pipcat = Pipeline([
    ('ohe', onehot)
])

# Numerical features: fill missing values with the column median, then
# standardise.
pipnum = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std', StandardScaler())
])

# Combine into a full pipeline
fullpipeline = ColumnTransformer([
    ('cat', pipcat, categorical),
    ('num', pipnum, numerical)
])

# Transform data
# Fit on the training split only and reuse the fitted transformer on the test
# split, so no test-set statistics leak into preprocessing.
x_train_transformed = fullpipeline.fit_transform(x_train)
x_test_transformed = fullpipeline.transform(x_test)

# Initialize and fit the RandomForest model
# Seed the forest with the same value used for the train/test split so the
# reported accuracy is reproducible from run to run.
forest = RandomForestClassifier(random_state=42)
RF_model = forest.fit(x_train_transformed, y_train)

# Make predictions on the test set
y_pred = RF_model.predict(x_test_transformed)

# Evaluate the model: fraction of test rows whose predicted label matches
accuracy = accuracy_score(y_test, y_pred)
print("Test set accuracy:", accuracy)

# Testing
# Take the first row of the dataset (kept as a one-row DataFrame), push it
# through the already-fitted preprocessing pipeline and classify it.
new = df.loc[[0]].drop(columns='salary')
test = fullpipeline.transform(new)
a = RF_model.predict(test)

# `a` holds a single label: 0 means below 50k, anything else means 50k or more.
message = (
    'the salary is less than 50k'
    if a == 0
    else 'the salary is more than or equal to 50k'
)
print(message)

## **Fastai, Tabular**

In [None]:
from fastai.tabular.all import *

# Load and preprocess data
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')

# Replace salary labels with a binary integer target (0: '<50k', 1: '>=50k').
# The result is assigned back to the column: the chained
# `df['salary'].replace(..., inplace=True)` form is deprecated in pandas 2.x
# and leaves `df` untouched under Copy-on-Write.
df['salary'] = df['salary'].map({'<50k': 0, '>=50k': 1})

# Define categorical and numerical columns
categorical = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
numerical = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

# Create DataLoaders
# `y_block=CategoryBlock()` is required because the target is now numeric:
# without it fastai infers a regression target, trains with a regression loss
# and the `accuracy` metric below becomes meaningless.
# `skipinitialspace` is a `read_csv` option and has no meaning for `from_df`,
# so it is not passed here.
dataloaders = TabularDataLoaders.from_df(
    df,
    path,
    procs=[Categorify, FillMissing, Normalize],
    cat_names=categorical,
    cont_names=numerical,
    y_names='salary',
    y_block=CategoryBlock(),
    bs=64,
)

# Define and train the model
learn = tabular_learner(dataloaders, metrics=accuracy)
learn.fit_one_cycle(5)

# Testing
# Predict a single row; `pred` is the predicted class and `probs` holds the
# per-class probabilities.
row, pred, probs = learn.predict(df.iloc[4])
pred.item(), probs