Clasificación de Genero por Nombres
Usando machine learning para detectar / predecir el genero en base al nombre
Sklearn
Pandas
Text Extraction

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
df = pd.read_csv('names_dataset.csv')

In [6]:
df.head()

Unnamed: 0,index,name,sex
0,0,Mary,F
1,1,Anna,F
2,2,Emma,F
3,3,Elizabeth,F
4,4,Minnie,F


In [7]:
df.size

285075

In [8]:
#Limpieza de datos
# validamos la consistencia en los nombres de las columnas
df.columns

Index(['index', 'name', 'sex'], dtype='object')

In [10]:
#data types
df.dtypes

index     int64
name     object
sex      object
dtype: object

In [11]:
#validando valores nulos
df.isnull().isnull().sum()

index    0
name     0
sex      0
dtype: int64

In [12]:
#numero de nombres mujeres
df[df.sex == 'F'].size

181800

In [13]:
#numero de nombres hombres
df[df.sex == 'M'].size

103275

In [14]:
df_names = df

In [15]:
# reemplazando todas las F y M con 0 y 1 respectivamente
df_names.sex.replace({'F':0, 'M':1}, inplace=True)


In [16]:
df_names.sex.unique()

array([0, 1], dtype=int64)

In [17]:
df_names.dtypes

index     int64
name     object
sex       int64
dtype: object

In [18]:
Xfeatures = df_names['name']

In [19]:
#extracción de caracteristicas
cv = CountVectorizer()
x = cv.fit_transform(Xfeatures)


In [21]:
cv.get_feature_names_out()

array(['aaban', 'aabha', 'aabid', ..., 'zyyanna', 'zyyon', 'zzyzx'],
      dtype=object)

In [22]:
from sklearn.model_selection import train_test_split

In [25]:
#Caracteristicas
x
#etiquetas
y = df_names.sex

In [28]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [29]:
#clasificador naive bayes
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

0.6398163206734908

In [30]:
# Rendimiento del Modelo
print("Rendimiento del Modelo", clf.score(x_test, y_test)*100,"%")

Rendimiento del Modelo 63.98163206734908 %


In [31]:
print("Rendimiento del Modelo", clf.score(x_train, y_train)*100,"%")

Rendimiento del Modelo 100.0 %


In [32]:
#Ejemplo Predicción
sample_name = ["Mary"]
vect = cv.transform(sample_name).toarray()

In [33]:
vect

array([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [34]:
#Mujer es 0, Hombre es 1
clf.predict(vect)

array([0], dtype=int64)

In [35]:
#Ejemplo 2 predicción
sample_name1 = ["Mark"]
vect1 = cv.transform(sample_name1).toarray()

In [36]:
clf.predict(vect1)

array([1], dtype=int64)

In [37]:
# Ejemplo Predicción de nombres Rusos
sample_name2 = ["Natasha"]
vect2 = cv.transform(sample_name2).toarray()

In [38]:
clf.predict(vect2)


array([0], dtype=int64)

In [40]:
# Ejemplo 3 Predicción de nombres aleatorios
sample_name3 = ["Nefertiti","Nasha","Ama","Ayo","Xhavier","Ovetta","Tathiana","Xia","Joseph","Xianliang"]
vect3 = cv.transform(sample_name3).toarray()

In [41]:
clf.predict(vect3)


array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0], dtype=int64)

In [42]:
# una función para hacer
def genderpredictor(a):
    test_name = [a]
    vector = cv.transform(test_name).toarray()
    if clf.predict(vector) == 0:
        print("Female")
    else:
        print("Male")
    

In [43]:
genderpredictor("Martha")


Female


In [44]:
namelist = ["Yaa","Yaw","Femi","Masha"]
for i in namelist:
    print(genderpredictor(i))

Female
None
Male
None
Female
None
Female
None


In [46]:
# usando una función personalizada para analisis de caracteristicas
# por analogia mas nombres de mujeres comienzan con A y E
def features(name):
    name = name.lower()
    return {
        'first-letter': name[0], # First letter
        'first2-letters': name[0:2], # First 2 letters
        'first3-letters': name[0:3], # First 3 letters
        'last-letter': name[-1],
        'last2-letters': name[-2:],
        'last3-letters': name[-3:],
    }

In [47]:
# Vectorizando las caracteristicas de la función
features = np.vectorize(features)
print(features(["Anna", "Hannah", "Peter","John","Vladmir","Mohammed"]))

[{'first-letter': 'a', 'first2-letters': 'an', 'first3-letters': 'ann', 'last-letter': 'a', 'last2-letters': 'na', 'last3-letters': 'nna'}
 {'first-letter': 'h', 'first2-letters': 'ha', 'first3-letters': 'han', 'last-letter': 'h', 'last2-letters': 'ah', 'last3-letters': 'nah'}
 {'first-letter': 'p', 'first2-letters': 'pe', 'first3-letters': 'pet', 'last-letter': 'r', 'last2-letters': 'er', 'last3-letters': 'ter'}
 {'first-letter': 'j', 'first2-letters': 'jo', 'first3-letters': 'joh', 'last-letter': 'n', 'last2-letters': 'hn', 'last3-letters': 'ohn'}
 {'first-letter': 'v', 'first2-letters': 'vl', 'first3-letters': 'vla', 'last-letter': 'r', 'last2-letters': 'ir', 'last3-letters': 'mir'}
 {'first-letter': 'm', 'first2-letters': 'mo', 'first3-letters': 'moh', 'last-letter': 'd', 'last2-letters': 'ed', 'last3-letters': 'med'}]


In [48]:
# Extrayendo las caracteristicas para el conjunto de datos
df_X = features(df_names['name'])

In [49]:
df_y = df_names['sex']


In [50]:
from sklearn.feature_extraction import DictVectorizer
 
corpus = features(["Mike", "Julia"])
dv = DictVectorizer()
dv.fit(corpus)
transformed = dv.transform(corpus)
print(transformed)
 

  (0, 1)	1.0
  (0, 3)	1.0
  (0, 5)	1.0
  (0, 7)	1.0
  (0, 9)	1.0
  (0, 10)	1.0
  (1, 0)	1.0
  (1, 2)	1.0
  (1, 4)	1.0
  (1, 6)	1.0
  (1, 8)	1.0
  (1, 11)	1.0


In [52]:
dv.get_feature_names_out()


array(['first-letter=j', 'first-letter=m', 'first2-letters=ju',
       'first2-letters=mi', 'first3-letters=jul', 'first3-letters=mik',
       'last-letter=a', 'last-letter=e', 'last2-letters=ia',
       'last2-letters=ke', 'last3-letters=ike', 'last3-letters=lia'],
      dtype=object)

In [53]:
# Entrenando la muestra de datos
dfX_train, dfX_test, dfy_train, dfy_test = train_test_split(df_X, df_y, test_size=0.33, random_state=42)

In [54]:
dfX_train

array([{'first-letter': 'e', 'first2-letters': 'el', 'first3-letters': 'ele', 'last-letter': 'a', 'last2-letters': 'ia', 'last3-letters': 'nia'},
       {'first-letter': 'a', 'first2-letters': 'ad', 'first3-letters': 'adi', 'last-letter': 'l', 'last2-letters': 'il', 'last3-letters': 'dil'},
       {'first-letter': 'k', 'first2-letters': 'ka', 'first3-letters': 'kad', 'last-letter': 'e', 'last2-letters': 'ze', 'last3-letters': 'nze'},
       ...,
       {'first-letter': 'j', 'first2-letters': 'ja', 'first3-letters': 'jaz', 'last-letter': 'y', 'last2-letters': 'ly', 'last3-letters': 'zly'},
       {'first-letter': 'e', 'first2-letters': 'el', 'first3-letters': 'elv', 'last-letter': 'a', 'last2-letters': 'na', 'last3-letters': 'ina'},
       {'first-letter': 'l', 'first2-letters': 'le', 'first3-letters': 'led', 'last-letter': 'r', 'last2-letters': 'er', 'last3-letters': 'ger'}],
      dtype=object)

In [55]:
dv = DictVectorizer()
dv.fit_transform(dfX_train)

<63666x8194 sparse matrix of type '<class 'numpy.float64'>'
	with 381996 stored elements in Compressed Sparse Row format>

In [56]:
# Construyendo el modelo usando arbol de decisión

from sklearn.tree import DecisionTreeClassifier
 
dclf = DecisionTreeClassifier()
my_xfeatures =dv.transform(dfX_train)
dclf.fit(my_xfeatures, dfy_train)

In [58]:
# Construyendo caracteristicas y transformando los mismos
sample_name_eg = ["Alex"]
transform_dv =dv.transform(features(sample_name_eg))

In [59]:
vect3 = transform_dv.toarray()


In [60]:
# Prediciendo el genero
# hombre is 1,mujer = 0
dclf.predict(vect3)

array([1], dtype=int64)

In [61]:
if dclf.predict(vect3) == 0:
    print("Female")
else:
    print("Male")

Male


In [62]:
# Segunda predicción con nombres nigerianos
name_eg1 = ["Chioma"]
transform_dv =dv.transform(features(name_eg1))
vect4 = transform_dv.toarray()
if dclf.predict(vect4) == 0:
    print("Female")
else:
    print("Male")

Female


In [63]:
# Una función para crear
def genderpredictor1(a):
    test_name1 = [a]
    transform_dv =dv.transform(features(test_name1))
    vector = transform_dv.toarray()
    if dclf.predict(vector) == 0:
        print("Female")
    else:
        print("Male")
    

In [64]:
random_name_list = ["Alex","Alice","Chioma","Vitalic","Clairese","Chan"]


In [65]:
for n in random_name_list:
    print(genderpredictor1(n))

Male
None
Female
None
Female
None
Female
None
Female
None
Male
None


In [66]:
## Rendimiento de el modelo de clasficación con arbol de decisión, clasificando mejor el trabajo que con naive bayes
# Rendimiento del conjunto de datos de entrenamiento
print(dclf.score(dv.transform(dfX_train), dfy_train)) 

0.9888951716771903


In [67]:
# Rendimiento del conjunto de datos de entrenamiento
print(dclf.score(dv.transform(dfX_test), dfy_test))

0.8670238209126566


In [70]:
#guardando nuestro modelo
import joblib

In [71]:
decisiontreModel = open("decisiontreemodel.pkl","wb")


In [72]:
joblib.dump(dclf,decisiontreModel)


In [73]:
decisiontreModel.close


<function BufferedWriter.close>

In [74]:
#alternativa para guardar el modelo
import pickle
dctreeModel = open("namesdetectormodel.pkl","wb")

In [76]:
pickle.dump(dclf,dctreeModel)

In [77]:
dctreeModel.close()


In [78]:
#Guardando el modelo Naive Bayes Multinomial 
NaiveBayesModel = open("naivebayesgendermodel.pkl","wb")
joblib.dump(clf,NaiveBayesModel)
NaiveBayesModel.close()