In [60]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

df = pd.read_csv('..\\data\\yob2013.csv')

# Check data structure
print(df.head())

# Define X and y
y = df['Gender'].values  # Target variable

# One-hot encode 'Name' as it's categorical
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X = encoder.fit_transform(df[['Name']])

# Print X and y shapes to verify
print("X shape:", X.shape)
print("y shape:", y.shape)


       Name Gender  No. of Occurences
0    Sophia      F              21249
1      Emma      F              20963
2    Olivia      F              18448
3  Isabella      F              17673
4       Ava      F              15270
X shape: (28129, 1)
y shape: (28129,)


In [77]:
gender_counts = df['Gender'].value_counts()
print("Number of rows with gender 'F':", gender_counts['F'])

gender_counts = df['Gender'].value_counts()
print("Number of rows with gender 'M':", gender_counts['M'])


Number of rows with gender 'F': 14065
Number of rows with gender 'M': 14064


In [79]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, 
                                    test_size = 0.40, 
                                    random_state=1)

X_test.shape

(11252, 2)

In [67]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Define the pipeline
steps = [('knn', KNeighborsClassifier())]

knn_pipeline = Pipeline(steps)

# Define the parameter grid
param_grid = {
    'knn__n_neighbors': [10, 11, 12, 15],
    'knn__weights': ['uniform', 'distance'],
    'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

# Initialize GridSearchCV
grid_search_knn = GridSearchCV(knn_pipeline, param_grid, cv=10, scoring='accuracy')

# Fit GridSearchCV
grid_search_knn.fit(X_train, y_train)

# Print the best parameters and the best score
print("Best parameters found: ", grid_search_knn.best_params_)
print("Best cross-validation score: ", grid_search_knn.best_score_)

Best parameters found:  {'knn__algorithm': 'auto', 'knn__n_neighbors': 11, 'knn__weights': 'uniform'}
Best cross-validation score:  0.6294966456622569


In [None]:
# Define the pipeline
steps = [('svc', SVC(kernel='poly', class_weight='balanced'))]
svcPoly_pipeline = Pipeline(steps)

# Define the parameter grid
param_grid_poly = {
    'svc__C': [0.1, 1, 10],
    'svc__degree': [2, 3, 4, 5]
}

# Initialize GridSearchCV
grid_search_poly = GridSearchCV(svcPoly_pipeline, param_grid_poly, cv=5, scoring='accuracy', n_jobs=-1)

# Fit GridSearchCV
grid_search_poly.fit(X_train, y_train)

# Print the best parameters and the best score
print("Best parameters for polynomial kernel: ", grid_search_poly.best_params_)
print("Best cross-validation score for polynomial kernel: ", grid_search_poly.best_score_)


In [9]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV


In [58]:
# Define the pipeline
steps = [('svc', SVC(kernel='rbf', class_weight='balanced', probability=True))]
svcL_pipeline = Pipeline(steps)

# Define the parameter grid
param_grid_rbf = {
    'svc__C': [0.1, 1, 2, 3, 4, 5],
    'svc__gamma': ['scale', 'auto']
}

# Initialize GridSearchCV
grid_search_rbf = GridSearchCV(svcL_pipeline, param_grid_rbf, cv=3, scoring='accuracy', n_jobs=-1)

# Fit GridSearchCV
grid_search_rbf.fit(X_train, y_train)

# Print the best parameters and the best score
print("Best parameters for rbf kernel: ", grid_search_rbf.best_params_)
print("Best cross-validation score for rbf kernel: ", grid_search_rbf.best_score_)

Best parameters for rbf kernel:  {'svc__C': 1, 'svc__gamma': 1}
Best cross-validation score for rbf kernel:  0.5856487976127787


In [63]:
#--------------------------------------------------
## Model Evaluation ## 
#--------------------------------------------------
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score


ypred_test = grid_search_knn.predict(X_test)
mat_clf = confusion_matrix(y_test, ypred_test)
report_clf = classification_report(y_test, ypred_test)

print(mat_clf)
print(report_clf)

ypred_testP = grid_search_knn.predict_proba(X_test)
auc = roc_auc_score(y_test, ypred_testP[:,1])
print(auc)

[[3542 2143]
 [1910 3657]]
              precision    recall  f1-score   support

           F       0.65      0.62      0.64      5685
           M       0.63      0.66      0.64      5567

    accuracy                           0.64     11252
   macro avg       0.64      0.64      0.64     11252
weighted avg       0.64      0.64      0.64     11252

0.6892637525536445


In [64]:
ypred_train = grid_search_knn.predict(X_train)
mat_clf = confusion_matrix(y_train, ypred_train)
report_clf = classification_report(y_train, ypred_train)

print(mat_clf)
print(report_clf)

ypred_trainP = grid_search_knn.predict_proba(X_train)
auc = roc_auc_score(y_train, ypred_trainP[:,1])
print(auc)

[[5811 2569]
 [2338 6159]]
              precision    recall  f1-score   support

           F       0.71      0.69      0.70      8380
           M       0.71      0.72      0.72      8497

    accuracy                           0.71     16877
   macro avg       0.71      0.71      0.71     16877
weighted avg       0.71      0.71      0.71     16877

0.7800508841671764


GaussianNB

In [None]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)

MultinomialNB

In [100]:
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB(alpha=0)
bnb.fit(X_train, y_train)

In [57]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

#--------------------------------------------------
## ------------Logistic Regresion----------------## 
#--------------------------------------------------

from sklearn.linear_model import LogisticRegression

steps = [      
         ('logReg', LogisticRegression(penalty = "l2"))]
LR_pipeline = Pipeline(steps)

# Define the parameter grid for GridSearchCV
param_grid = {
    'logReg__C': [0.01, 0.0001, 1.0, 0.001],  # Regularization strength
    'logReg__solver': ['liblinear', 'saga'],  # Solvers for small and large datasets
    'logReg__max_iter': [100, 200, 300]  # Number of iterations
}

# Initialize GridSearchCV
grid_search_LR = GridSearchCV(LR_pipeline, param_grid, cv=5, scoring='accuracy')

# Fit GridSearchCV on the training data
grid_search_LR.fit(X, y)

# Print the best parameters and the best score
print("Best parameters for logistic regression: ", grid_search_LR.best_params_)
print("Best cross-validation score for logistic regression: ", grid_search_LR.best_score_)

Best parameters for logistic regression:  {'logReg__C': 0.001, 'logReg__max_iter': 100, 'logReg__solver': 'liblinear'}
Best cross-validation score for logistic regression:  0.5353206967650197


In [58]:
#--------------------------------------------------
## Model Evaluation ## 
#--------------------------------------------------
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score


ypred_test = grid_search_LR.predict(X_test)
mat_clf = confusion_matrix(y_test, ypred_test)
report_clf = classification_report(y_test, ypred_test)

print(mat_clf)
print(report_clf)

ypred_testP = grid_search_LR.predict_proba(X_test)
auc = roc_auc_score(y_test, ypred_testP[:,1])
print(auc)

[[1024 4661]
 [ 728 4839]]
              precision    recall  f1-score   support

           F       0.58      0.18      0.28      5685
           M       0.51      0.87      0.64      5567

    accuracy                           0.52     11252
   macro avg       0.55      0.52      0.46     11252
weighted avg       0.55      0.52      0.46     11252

0.5213409084410126


In [59]:
ypred_train = grid_search_LR.predict(X_train)
mat_clf = confusion_matrix(y_train, ypred_train)
report_clf = classification_report(y_train, ypred_train)

print(mat_clf)
print(report_clf)

ypred_trainP = grid_search_LR.predict_proba(X_train)
auc = roc_auc_score(y_train, ypred_trainP[:,1])
print(auc)

[[1615 6765]
 [1101 7396]]
              precision    recall  f1-score   support

           F       0.59      0.19      0.29      8380
           M       0.52      0.87      0.65      8497

    accuracy                           0.53     16877
   macro avg       0.56      0.53      0.47     16877
weighted avg       0.56      0.53      0.47     16877

0.5394556284500805
