In [22]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Data Loading and Preprocessing:

- **Data Preparation:**
  - The dataset `income-data.csv` is loaded and relevant columns are selected.
  - Extra space in column names is deleted.
  - Duplicates are deleated.
  - Categorical variables (`workclass`, `education`, `occupation`) are encoded using `LabelEncoder`.
  
- **Data Splitting:**
  - The dataset is split into training and testing sets using `train_test_split`.
  
- **Feature Scaling:**
  - Features are standardized using `StandardScaler` to ensure all features contribute equally to the model.

In [2]:
data = pd.read_csv('income-data.csv')
data = data[['age', ' workclass',' education',' marital-status', ' occupation',' hours-per-week',' native-country', ' income']]
data.columns = [['age', 'workclass','education','marital-status', 'occupation','hours-per-week','native-country', 'income']]
data.drop_duplicates(inplace=True)

In [3]:
data.head()

Unnamed: 0,age,workclass,education,marital-status,occupation,hours-per-week,native-country,income
0,39,State-gov,Bachelors,Never-married,Adm-clerical,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,13,United-States,<=50K
2,38,Private,HS-grad,Divorced,Handlers-cleaners,40,United-States,<=50K
3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,40,United-States,<=50K
4,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,40,Cuba,<=50K


In [4]:
labelencoder = preprocessing.LabelEncoder()

In [6]:
# Encode categorical variables
data['workclass'] = labelencoder.fit_transform(data['workclass'])
data['education'] = labelencoder.fit_transform(data['education'])
data['marital-status'] = labelencoder.fit_transform(data['marital-status'])
data['occupation'] = labelencoder.fit_transform(data['occupation'])
data['native-country'] = labelencoder.fit_transform(data['native-country'])
data['income'] = labelencoder.fit_transform(data['income'])

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [7]:
data.head()

Unnamed: 0,age,workclass,education,marital-status,occupation,hours-per-week,native-country,income
0,39,7,9,4,1,40,39,0
1,50,6,9,2,4,13,39,0
2,38,4,11,0,6,40,39,0
3,53,4,1,2,6,40,39,0
4,28,4,9,2,10,40,5,0


In [8]:
# Define features (x) and target (y)
x = data[['age', 'workclass','education','marital-status', 'occupation','hours-per-week','native-country']]
y = data [['income']]

In [9]:
# Split data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.30)

In [10]:
# Standardize features
scaler = StandardScaler()
scaled_x_train = scaler.fit_transform(x_train)
scaled_x_test = scaler.fit_transform(x_test)

In [11]:
def modelresults(predictions):
    print(f"Accuracy score of the model is {accuracy_score(y_test,predictions)}.")

# Model Training and Evaluation

## Logistic Regression
 - A `LogisticRegression` model is instantiated.
  - The scaled training data (`scaled_x_train`) and target (`y_train`) are used to fit the model.
  
- **Model Evaluation:**
  - Predictions are made on the scaled test data (`scaled_x_test`).
  - The accuracy score of the model is computed using `accuracy_score` and printed.

In [12]:
# Logistic Regression model
log_model = LogisticRegression()
log_model.fit(scaled_x_train, y_train)

  y = column_or_1d(y, warn=True)


In [15]:
log_predictions = log_model.predict(scaled_x_test)
modelresults(log_predictions)

Accuracy score of the model is 0.7330614392843047


## K Nearest Neighbors (KNN) 
K Nearest Neighbors is a non-parametric model used for classification. The training and evaluation process with grid search:

- **Model Setup:**
  - A `KNeighborsClassifier` is initialized.
  
- **Hyperparameter Tuning:**
  - A pipeline (`Pipeline`) is created with scaling and KNN.
  - Grid search (`GridSearchCV`) is used to find the optimal number of neighbors (`k`) through cross-validation (`cv=5`).
  
- **Grid Search Execution:**
  - The best `k` value is determined based on the highest accuracy score during cross-validation.
  
- **Model Training with Best Parameters:**
  - The optimal KNN model is re-trained using the scaled training data (`scaled_x_train`).
  
- **Model Evaluation:**
  - Predictions are made on the scaled test data (`scaled_x_test`) using the re-trained model.
  - The accuracy score of the model is computed and printed.

In [16]:
# K Nearest Neighbors model with Grid Search

knn = KNeighborsClassifier()
# Define the range of k values
k_values = list(range(1, 30))

# Create a pipeline for KNN with scaling
pipe = Pipeline(steps=[("knn", knn)])

# Define parameters grid for GridSearchCV
param_grid = {"knn__n_neighbors": k_values}

# Perform Grid Search Cross-Validation
cv_classifier = GridSearchCV(pipe, param_grid, cv=5, scoring="accuracy")
cv_classifier.fit(scaled_x_train, y_train.values.ravel())

  _data = np.array(data, dtype=dtype, copy=copy,


In [17]:
pred_gridknn = cv_classifier.predict(scaled_x_test)
modelresults(pred_gridknn)

Accuracy score of the model is 0.7820023681094593


### Support Vector Machine (SVM) Model with Grid Search

Support Vector Machine (SVM) is a powerful classification algorithm that works well for both linear and non-linear data. In this script, we use Grid Search to optimize the hyperparameters of the SVM model. The detailed process:

- **Initialize SVM:**
  - An instance of `SVC` (Support Vector Classifier) is created.

- **Define Hyperparameters:**
  - A parameter grid is defined for Grid Search:
    - `C`: Regularization parameter with values [0.1, 0, 1].
    - `kernel`: Kernel types to be tested: linear, radial basis function (rbf), and polynomial (poly).

- **Perform Grid Search:**
  - Grid Search (`GridSearchCV`) is employed to search for the best combination of hyperparameters.
  - The search is conducted using 5-fold cross-validation to ensure robustness.
  - The Grid Search object is fitted with the scaled training data (`scaled_x_train`, `y_train.values.ravel()`).

- **Model Selection:**
  - The best combination of hyperparameters is selected based on the highest cross-validation accuracy.
  

In [20]:
svm = SVC()
param_grid_svc = {"C":[0.1,0.,1],"kernel":["linear","rbf","poly"]}
gridsvr = GridSearchCV(svm,param_grid_svc)
gridsvr.fit(scaled_x_train,y_train.values.ravel())

15 fits failed out of a total of 45.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\lucia\Desktop\Desktop\Portfolio\Data Science Projects\data-science-projects\pyenv\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\lucia\Desktop\Desktop\Portfolio\Data Science Projects\data-science-projects\pyenv\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Users\lucia\Desktop\Desktop\Portfolio\Data Science Projects\data-science-projects\pyenv\Lib\site-packages\sklearn\base.py", line 666, in _validate_pa

In [21]:
pred_svr = gridsvr.predict(scaled_x_test)
modelresults(pred_svr)

Accuracy score of the model is 0.77831864228391


## Random Forest

Random Forest is an ensemble learning method for classification. It's trained and evaluated with grid search:

- **Model Setup:**
  - A `RandomForestClassifier` is instantiated.
  
- **Hyperparameter Tuning:**
  - Grid search (`GridSearchCV`) is applied to find the optimal combination of `n_estimators`, `max_features`, `bootstrap`, and `oob_score`.
  
- **Grid Search Execution:**
  - The best combination of hyperparameters is determined based on the highest accuracy score during cross-validation.
  
- **Model Training with Best Parameters:**
  - The optimal Random Forest model is re-trained using the scaled training data (`scaled_x_train`).
  
- **Model Evaluation:**
  - Predictions are made on the scaled test data (`scaled_x_test`) using the re-trained model.
  - The accuracy score of the model is computed and printed.

In [24]:
# Random Forest model with Grid Search
rfr_model = RandomForestClassifier()

# Define ranges for hyperparameters
n_estimators = [32,64,128]
max_features = [2,3]
bootstrap = [True, False]
oob_score = [True, False]

# Define parameter grid for GridSearchCV
param_grid_rfr = {"n_estimators":n_estimators, "max_features": max_features, "bootstrap":bootstrap, "oob_score":oob_score}

# Perform Grid Search Cross-Validation
grid_rfr = GridSearchCV(rfr_model, param_grid_rfr)
grid_rfr.fit(scaled_x_train,y_train.values.ravel())

30 fits failed out of a total of 120.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\lucia\Desktop\Desktop\Portfolio\Data Science Projects\data-science-projects\pyenv\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\lucia\Desktop\Desktop\Portfolio\Data Science Projects\data-science-projects\pyenv\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\lucia\Desktop\Desktop\Portfolio\Data Science Projects\data-science-projects

In [25]:
pred_rfc = grid_rfr.predict(scaled_x_test)
modelresults(pred_rfc)

Accuracy score of the model is 0.7634521773450862


#### Predicting New Data Entry

The KNN model was chosen because it achieved the highest accuracy score of 0.78 during the evaluation.

After training the model, it can be used to predict the class of new data entries.

- **Define Prediction Function:**
  - A function `prediction` is defined to predict the class of a new data entry.
  - The function takes a new data entry, uses the trained KNN model to predict its class, and returns a human-readable result.

- **New Entry Vector:**
  - A new data entry (`newentry_vector`) is defined as a NumPy array with the appropriate feature values.
  - The new entry is standardized using the same `StandardScaler` used for the training data.

- **Make Prediction:**
  - The standardized new entry is passed to the `prediction` function.
  - The predicted class of the new entry is printed.


In [28]:
data.mean().round()

age               40.0
workclass          4.0
education         10.0
marital-status     3.0
occupation         7.0
hours-per-week    41.0
native-country    36.0
income             0.0
dtype: float64

In [30]:
def prediction(newentry):
    pred = cv_classifier.predict(newentry)
    if pred == 0:
        return "Below or equal to 50K"
    elif pred == 1:
        return "More than 50K"
    else:
        return "There was an error in the prediction."

In [33]:
newentry_vector = np.array([40,4,10,3,7,41,36]).reshape(1,-1)
newentry = scaler.fit_transform(newentry_vector)
print("Income prediction of the new index is: {}".format(prediction(newentry)))

Income prediction of the new index is: Below or equal to 50K
