In [None]:
import pandas as pd

# Source CSV locations for the two name lists.
female_names_path = '/content/Indian-Female-Names.csv'
male_names_path = '/content/Indian-Male-Names.csv'

# Read each list and set its numeric class label in one step:
# 1 marks the female list, 0 marks the male list.
female_names = pd.read_csv(female_names_path).assign(gender=1)
male_names = pd.read_csv(male_names_path).assign(gender=0)

# Preview the first rows of both frames.
female_names.head(), male_names.head()


(              name  gender    race
 0          shivani       1  indian
 1             isha       1  indian
 2  smt shyani devi       1  indian
 3            divya       1  indian
 4            mansi       1  indian,
               name  gender    race
 0          barjraj       0  indian
 1     ramdin verma       0  indian
 2  sharat chandran       0  indian
 3  birender mandal       0  indian
 4             amit       0  indian)

In [None]:
# Keep only the name and label columns from each list, then stack them
# (female rows first) under a fresh 0..N-1 index.
name_label_frames = [frame[['name', 'gender']] for frame in (female_names, male_names)]
combined_data = pd.concat(name_label_frames, ignore_index=True)

# Normalize case so that features do not depend on capitalization.
# Missing names are left as NaN by the .str accessor.
combined_data['name'] = combined_data['name'].str.lower()

# Preview the merged frame and the class balance.
combined_data.head(), combined_data['gender'].value_counts()


(              name  gender
 0          shivani       1
 1             isha       1
 2  smt shyani devi       1
 3            divya       1
 4            mansi       1,
 gender
 1    15382
 0    14845
 Name: count, dtype: int64)

In [None]:
import nltk
from nltk import ngrams
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Download NLTK's 'punkt' package.
# NOTE(review): the only NLTK function used in the visible notebook is `ngrams`,
# applied to raw characters; that does not look like it needs 'punkt' — confirm
# whether this download can be dropped.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
def extract_char_ngrams(text, n=3):
    """Return the character n-grams of ``text`` as one space-separated string.

    Args:
        text: Value to split into n-grams. Anything that is not a ``str``
            yields an empty string.
        n: Length of each n-gram; defaults to 3.

    Returns:
        The n-grams produced by ``ngrams(text, n)``, each collapsed back into
        a string and joined with single spaces, or ``''`` when ``text`` is
        not a string.
    """
    # Guard clause: non-string cells map to an empty feature string.
    if not isinstance(text, str):
        return ''
    return ' '.join(''.join(chars) for chars in ngrams(text, n))

# Derive the space-joined trigram text for every name; `n=3` is forwarded to
# extract_char_ngrams by Series.apply.
combined_data['name_ngrams'] = combined_data['name'].apply(extract_char_ngrams, n=3)

In [None]:
# Build count features over character n-grams of length 2 to 4, fitted on the
# space-joined trigram strings in `name_ngrams`.
# NOTE(review): `name_ngrams` already holds trigrams separated by spaces, so the
# character analyzer here also yields n-grams that contain those spaces and
# straddle adjacent trigrams — confirm this double n-gramming is intended rather
# than vectorizing `name` directly.
vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 4))
X = vectorizer.fit_transform(combined_data['name_ngrams'])

In [None]:
# Target vector aligned row-for-row with X (1 = female list, 0 = male list).
y = combined_data['gender'].to_numpy()

In [None]:
# Hold out 20% of the rows for evaluation; random_state=42 fixes the shuffle.
# NOTE(review): the split is not stratified and names are not de-duplicated
# beforehand — if the same name occurs more than once, copies can land in both
# train and test and inflate the reported score; confirm against the data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Logistic-regression classifier over the n-gram count features, with the
# solver allowed up to 1000 iterations.
clf = LogisticRegression(max_iter=1000)
# Fit on the training split only; the held-out rows are used for evaluation.
clf.fit(X_train, y_train)

In [None]:
# Score the held-out split.
y_pred = clf.predict(X_test)

# Label 0 is the male list and label 1 the female list, so the display names
# are given in that order.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
classification_rep = classification_report(
    y_true=y_test,
    y_pred=y_pred,
    target_names=['Male', 'Female'],
)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Show overall accuracy, the per-class report and the confusion matrix.
accuracy, classification_rep, conf_matrix

(0.9369831293417136,
 '              precision    recall  f1-score   support\n\n        Male       0.93      0.94      0.94      2952\n      Female       0.94      0.94      0.94      3094\n\n    accuracy                           0.94      6046\n   macro avg       0.94      0.94      0.94      6046\nweighted avg       0.94      0.94      0.94      6046\n',
 array([[2765,  187],
        [ 194, 2900]]))

In [None]:
def predict_gender(name):
    """Predict 'Male' or 'Female' for a single name with the fitted model.

    The name is lower-cased and turned into space-joined trigram text with
    ``extract_char_ngrams``, vectorized with the fitted ``vectorizer`` and
    classified by ``clf``.

    Args:
        name: The name to classify, as a string.

    Returns:
        'Male' when the classifier predicts label 0, otherwise 'Female'.
    """
    trigram_text = extract_char_ngrams(name.lower(), n=3)
    features = vectorizer.transform([trigram_text])
    # predict() returns one label per input row; exactly one row was passed.
    predicted_label = clf.predict(features)[0]
    if predicted_label == 0:
        return 'Male'
    return 'Female'

In [None]:
# Spot-check the classifier on two names; the trailing comments give the labels
# this cell printed when it was run.
print(predict_gender('Shreekansha'))   # Female
print(predict_gender('atul'))  # Male

Female
Male
