In [3]:
import matplotlib.pyplot as plt
from ucimlrepo import fetch_ucirepo 
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd

In [4]:
###Task 1: Import the adult dataset

# Download the dataset with repository id 2 via ucimlrepo (talks to a remote server).
adult = fetch_ucirepo(id=2)

# Feature table and target table, unpacked in one step (y is the income column).
X, y = adult.data.features, adult.data.targets

# Dataset-level metadata.
print(adult.metadata)

# Per-variable description, followed by the raw target values.
print(adult.variables)
print(y)

ConnectionError: Error connecting to server

In [None]:
# Report the feature table dimensions as a (rows, columns) tuple.
n_rows, n_cols = X.shape
print((n_rows, n_cols))

In [None]:
# #concatenate x and y horizontally (along columns)
# concatenated_data = pd.concat([X, y], axis = 1)
# concatenated_data

In [None]:
###Task 2:
# head, info and describe are methods: they must be *called*. Printing the
# bare attribute only shows the bound-method repr, not the intended summary.
separator = "------------------------------------------------------------------------------"

print(X.head())
print(separator)
print(X.shape)
print(separator)
X.info()  # info() writes its report itself and returns None, so no print() here
print(separator)
print(X.describe())
print(separator)


In [None]:
###Task 2.1: Plot a histogram of the data

# Draw the DataFrame's histograms with figsize (24, 16), then render the figure.
hist_figsize = (24, 16)
X.hist(figsize=hist_figsize)
plt.show()


In [None]:
### Task 3: check for the number of these missing values

# NaN entries: count per column first, then add the per-column counts together.
nan_per_column = X.isna().sum()
missing_values_count = nan_per_column.sum()

print(f"Total number of missing values: {missing_values_count}")


def _count_question_marks(column):
    """Return how many entries of ``column`` are equal to the string '?'."""
    return (column == '?').sum()


# '?' placeholders are counted separately, one total per feature.
missing_values_by_feature = X.apply(_count_question_marks)

print(missing_values_by_feature)


In [None]:
###Task 4: Replace the missing values with null.

# Build a new DataFrame in which every '?' placeholder is NaN and rebind X to
# it; the frame X previously referred to is left untouched.
X = X.replace('?', np.nan)

# Inspect the non-null counts per column after the replacement.
X.info()

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [None]:
###Task 5: Create and apply a preprocessing pipeline

# Split the feature names by dtype: numeric columns vs. all remaining columns.
num_cols = X.select_dtypes(include='number').columns.to_list()
cat_cols = X.select_dtypes(exclude='number').columns.to_list()

# Removed: the unused `num_pipeline` / `cat_pipeline` built with make_pipeline.
# They duplicated the two transformers below but were never passed to the
# ColumnTransformer, and the categorical one used a differently configured
# OneHotEncoder (no sparse_output=False), so they only invited confusion.

# Preprocessing for numerical data: mean-impute, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data: impute with the most frequent value,
# then one-hot encode. sparse_output=False requests a dense array from the
# encoder.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False))
])

# Bundle preprocessing for numerical and categorical data; each transformer is
# applied only to its own list of columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ])

# Fit the preprocessor on X and transform X in one step.
X_prepared = preprocessor.fit_transform(X)

# Last expression of the cell: display the fitted preprocessor.
preprocessor


In [None]:
# Display the numeric column names that are routed to the 'num' transformer.
num_cols

In [None]:
# Display the non-numeric column names that are routed to the 'cat' transformer.
cat_cols

In [None]:

# Wrap the preprocessed feature matrix in a DataFrame whose columns carry the
# names generated by the fitted preprocessor.
# The data source is X_prepared (the output of preprocessor.fit_transform(X));
# the previous version read `adult_prepared` before it had ever been assigned,
# which raises NameError on a fresh kernel.
feature_names = preprocessor.get_feature_names_out()
adult_prepared = pd.DataFrame(data=X_prepared, columns=feature_names)
adult_prepared

In [None]:
# Report the dimensions of the preprocessed feature matrix.
print(f"X_prepared.shape: {X_prepared.shape}")

In [None]:
###Task 6: Check the target value_counts. You will notice that the target needs some data cleaning

# Tally how often each distinct target value occurs.
target_value_counts = y.value_counts()

# Heading and tally, each on its own line.
print("Current Target Value Counts:", target_value_counts, sep="\n")


In [None]:
###Task 7: Remove the period at the end of the >50K. and <=50K.

# Derive a new target frame whose 'income' labels have '.' and ' ' characters
# stripped from both ends; `y` itself is not modified.
stripped_income = y['income'].str.strip('. ')
y_cleaned = y.assign(income=stripped_income)

# Recount the labels now that the variants with a trailing period are merged.
target_value_counts = y_cleaned['income'].value_counts()

# Heading (preceded by a blank line) and the cleaned tally.
print("\nCleaned Target Value Counts:")
print(target_value_counts)



In [None]:
###Task 8: Split the data into 80% training set and 20% testing set, print the shape of X_train, X_test, y_train, y_test in one command.

from sklearn.model_selection import train_test_split

# 80/20 split of the preprocessed features and the cleaned target, with a
# fixed random_state so the split is repeatable.
split_parts = train_test_split(X_prepared, y_cleaned, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Shapes of X_train, X_test, y_train, y_test as a single tuple (cell output).
tuple(part.shape for part in split_parts)


In [None]:
###Task 9: Train a svm model (svc) to predict if the income of the adult exceeds 50K on the training set

from sklearn.svm import SVC

# Only the first 10000 training rows are used (presumably to bound the
# training time -- confirm). The target frame is flattened to a 1-D array.
train_features = X_train[:10000]
train_labels = y_train[:10000].values.ravel()

# Polynomial-kernel SVC with C=0.1 and gamma=1, fitted on that subset.
model_svm = SVC(kernel='poly', C=0.1, gamma=1)
model_svm.fit(train_features, train_labels)

In [None]:

###Task 9.1: Test your model on the X_Test, and report the classification_report on the y_test and y_predict.

from sklearn.metrics import classification_report

# Predict a label for every row of the held-out test features.
y_predict = model_svm.predict(X_test)

# Summarise the predictions against the true test labels and show the summary.
report = classification_report(y_true=y_test, y_pred=y_predict)
print(report)

###Task 9.2: Display the confusion matrix of your test results

from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Build and plot the confusion matrix straight from the true and predicted
# test labels; no separate confusion_matrix() call is made.
ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=y_predict)

In [None]:
###Task 10: Use GridSearchCV to find the best value of kernel, gamma, and C.

from sklearn.model_selection import GridSearchCV

# Flatten the target variable y using .values.ravel() (1-D array for fitting)
y_train_flattened = y_train.values.ravel()

# Define the parameter grid to search: every combination of these values is
# evaluated.
param_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': [0.1, 1, 10],
    'C': [0.01, 0.1, 1, 10]
}

# Create an SVM classifier
svm_classifier = SVC()

# Create a GridSearchCV object.
# This statement was missing, so `grid_search.fit` below raised NameError.
grid_search = GridSearchCV(estimator=svm_classifier, param_grid=param_grid)

# Restrict the search to the first 10000 training rows and report the shape of
# exactly that subset (the earlier debug print showed a 100-row slice that did
# not match what was being fitted).
X_search = X_train[:10000]
y_search = y_train_flattened[:10000]
print(X_search.shape)

# Fit the grid search to the data with the flattened y_train
grid_search.fit(X_search, y_search)

# Get the best combination of parameters
best_params = grid_search.best_params_
best_kernel = best_params['kernel']
best_gamma = best_params['gamma']
best_C = best_params['C']


# Print the best parameters
print("Best kernel:", best_kernel)
print("Best gamma:", best_gamma)
print("Best C:", best_C)

In [None]:
###Task 10.1: Split the dataset into 60% training, 20% validation, and 20% testing.

# Use the cleaned target (y_cleaned), as in the Task 8 split. The raw `y`
# still contains the labels with a trailing period, so splitting it would
# bring the duplicate classes back.

# First cut: 60% training, 40% held out.
X_train, X_validation_test, y_train, y_validation_test = train_test_split(
    X_prepared, y_cleaned, test_size=0.4, random_state=42)

# Second cut: halve the held-out part into validation and test (20% each).
X_validation, X_test, y_validation, y_test = train_test_split(
    X_validation_test, y_validation_test, test_size=0.5, random_state=42)

print(X_train.shape,y_train.shape, X_validation.shape, y_validation.shape, X_test.shape, y_test.shape)


In [None]:
###Task 10.2: Use the below code snippet to pass the following hyperparameters for the GridSearchCV to find the best ones.

from sklearn.model_selection import GridSearchCV

# Search space: RBF kernel only, over several C and gamma values.
svm_parameters = {
    'kernel': ['rbf'],
    'C': [0.01, 0.1, 1, 10],
    'gamma': [0.01, 1, 10],
}

# Base classifier and the grid search wrapped around it.
svm = SVC()
svm_gs = GridSearchCV(svm, svm_parameters)

# NOTE(review): the search is fitted on the first 100 training rows only; other
# cells in this notebook use 10000 -- confirm which size is intended.
search_features = X_train[:100]
search_labels = y_train[:100].values.ravel()
svm_gs.fit(search_features, search_labels)

# Take the best estimator found and score it on the validation split
# (the score is the cell output).
svm_winner = svm_gs.best_estimator_
svm_winner.score(X_validation, y_validation)


In [None]:
###Task 10.2: Check the svm winner parameters using svm_winner

# svm_winner is the best estimator of `svm_gs`, so its winning hyperparameters
# must be read from `svm_gs`. The previous version read them from
# `grid_search`, which is the separate Task 10 search object.
svm_winner_params = svm_gs.best_params_

# Print the winning hyperparameters
print("Winning Hyperparameters for SVM:")
print(svm_winner_params)
