# Problem Statement:
- We need you to build a model that, given Indian names, can predict their gender. 
- You can use any training dataset for this that you can find.

# Suggested Solutions:
- Create LSTM Model to predict gender based on name(Indian name).

# Code: 

In [1]:
# loading the modules:

import torch
import torch.nn as nn
from torchtext.vocab import vocab
from torchvision import transforms, utils
from torch.utils.data import Dataset, DataLoader

import pandas as pd 

from collections import Counter

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import  confusion_matrix

In [2]:
# Loading males names data:
males_names_data = pd.read_csv(r'https://gist.githubusercontent.com/mbejda/7f86ca901fe41bc14a63/raw/38adb475c14a3f44df9999c1541f3a72f472b30d/Indian-Male-Names.csv')

In [3]:
males_names_data.shape

(14845, 3)

In [4]:
males_names_data.columns

Index(['name', 'gender', 'race'], dtype='object')

In [5]:
males_names_data.head()

Unnamed: 0,name,gender,race
0,barjraj,m,indian
1,ramdin verma,m,indian
2,sharat chandran,m,indian
3,birender mandal,m,indian
4,amit,m,indian


In [6]:
# Loading females names data:
females_names_data =pd.read_csv(r'https://gist.githubusercontent.com/mbejda/9b93c7545c9dd93060bd/raw/b582593330765df3ccaae6f641f8cddc16f1e879/Indian-Female-Names.csv')

In [7]:
females_names_data.shape

(15382, 3)

In [8]:
females_names_data.columns

Index(['name', 'gender', 'race'], dtype='object')

In [9]:
females_names_data.head()

Unnamed: 0,name,gender,race
0,shivani,f,indian
1,isha,f,indian
2,smt shyani devi,f,indian
3,divya,f,indian
4,mansi,f,indian


In [10]:
# Lets merge both names and create new dataframe:
data = males_names_data.append(females_names_data)

In [11]:
data.shape

(30227, 3)

In [12]:
data.head()

Unnamed: 0,name,gender,race
0,barjraj,m,indian
1,ramdin verma,m,indian
2,sharat chandran,m,indian
3,birender mandal,m,indian
4,amit,m,indian


In [13]:
data.columns

Index(['name', 'gender', 'race'], dtype='object')

In [14]:
data.head()

Unnamed: 0,name,gender,race
0,barjraj,m,indian
1,ramdin verma,m,indian
2,sharat chandran,m,indian
3,birender mandal,m,indian
4,amit,m,indian


In [15]:
data.tail()

Unnamed: 0,name,gender,race
15377,saroj devi,f,indian
15378,naina @ geeta,f,indian
15379,manju d/0 baboo lal jatav,f,indian
15380,shivani,f,indian
15381,nayna,f,indian


In [16]:
# Checking for null values:
data.isnull().sum()

name      55
gender     0
race       0
dtype: int64

- since we have null values we need to remove them.

In [17]:
# Removing null values:
data.dropna(inplace=True)

In [18]:
data.isnull().sum()

name      0
gender    0
race      0
dtype: int64

# Small EDA

In [19]:
import plotly.express as px
gender = data.gender.value_counts()
fig = px.pie(data, values=gender.values, names=gender.index, title='Distribution of Gender')
fig.show()

In [20]:
name = data.name.value_counts()
name

pooja                         353
jyoti                         201
poonam                        148
deepak                        137
sunita                        136
                             ... 
roshan kumar s/o rajkumar,      1
rakesh chander sharma           1
narsa ram mghwal                1
pannu                           1
nayna                           1
Name: name, Length: 15034, dtype: int64

In [21]:
import plotly.graph_objects as go
fig = go.Figure([go.Bar(x=name.index[:50], y=name.values[:50])])
fig.update_layout(title_text="Top 50 Repeated Names and their count")
fig.show()

In [22]:
# Splitting data into training and testing:
data=data.sample(frac=1)
train_size = int(len(data)*.8)
train_data = data[:train_size]
test_data = data[train_size:]

In [23]:
# # Creating a vocabulary of the characters from all the given names:
all_chars = [t for text  in data['name'] for t in text if text is not None] 
char_count = Counter(all_chars)
name_char_vocab = vocab(char_count)

### Creating a Dataset class for returning name and gender Tensors from the Dattaframe:

In [24]:
class NamesDataset(Dataset):
    
    def __init__(self,data,name_char_vocab):
        self.data=data
        self.name_char_vocab=name_char_vocab
        self.gender_dict = {'m':0, 'f':1}
        self.rev_gender_dict = {v:k for k,v in self.gender_dict.items()}
        
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = self.data.iloc[idx, :]
        label = torch.zeros(2)
        label[self.gender_dict[item['gender']]] = 1
        name = self.get_names_tensor(item['name'])
        return name, torch.tensor(self.gender_dict[item['gender']])
    
    def get_names_tensor(self, name ):
        name_ids = self.name_char_vocab.lookup_indices([t for t in  name])
        name_tensor = torch.as_tensor(name_ids, dtype = int)
        return name_tensor
    
    def get_category_from_idx(self, idx ):
        
        return self.rev_gender_dict[idx]
    
train_ds = NamesDataset(train_data,name_char_vocab)

# Demo:
# train_ds.get_names_tensor('meet')

### Creating the LSTM based model using character embedding:

In [25]:
class NamesClassifier(nn.Module):
    
    def __init__(self, size):
        super(NamesClassifier, self).__init__()
        self.embedding = nn.Embedding(size,128)
        self.rnn = nn.LSTM(128,256)
        self.linear1 = nn.Linear(256,256)
        self.relu1= nn.ReLU()
        self.linear2 = nn.Linear(256,2)
    
    def forward(self, ip):
        op= self.embedding(ip)
        op, hi = self.rnn(op)
        output = self.linear1(hi[0])
        output = self.relu1(output)
        output = self.linear2(output)
        return output

### Define a "predict" Function to predict gender from The name using our trainned LSTM model

In [26]:
def predict(name, model1):
    names_tensor = train_ds.get_names_tensor(name)
    output = model1(names_tensor)
    category_idx = output.topk(1)[1].item()
    category = train_ds.get_category_from_idx(category_idx)
    return category
model = NamesClassifier(len(train_ds.name_char_vocab))

### Train the Model

In [27]:
# We need a loss function as criteria and an optimizer to train our model:

criteria = nn.CrossEntropyLoss()
optimizer =  torch.optim.Adam(model.parameters())
num_step= len(train_ds)
step =0
total_loss=0
for  i in range(0, train_size):
    name_ip, label = train_ds[i]
    step=step+1
    optimizer.zero_grad()
    op= model(name_ip)
    loss = criteria(op.squeeze(), label)
    loss.backward()
    optimizer.step()
    total_loss=loss+total_loss
    if step%1000==0:
        print(total_loss)
        total_loss=0

tensor(534.4256, grad_fn=<AddBackward0>)
tensor(465.8978, grad_fn=<AddBackward0>)
tensor(429.9753, grad_fn=<AddBackward0>)
tensor(360.5829, grad_fn=<AddBackward0>)
tensor(318.8640, grad_fn=<AddBackward0>)
tensor(334.5739, grad_fn=<AddBackward0>)
tensor(304.3307, grad_fn=<AddBackward0>)
tensor(302.2365, grad_fn=<AddBackward0>)
tensor(300.0525, grad_fn=<AddBackward0>)
tensor(262.6337, grad_fn=<AddBackward0>)
tensor(269.7578, grad_fn=<AddBackward0>)
tensor(279.7076, grad_fn=<AddBackward0>)
tensor(290.8575, grad_fn=<AddBackward0>)
tensor(268.1062, grad_fn=<AddBackward0>)
tensor(216.9987, grad_fn=<AddBackward0>)
tensor(289.7292, grad_fn=<AddBackward0>)
tensor(222.4249, grad_fn=<AddBackward0>)
tensor(241.7623, grad_fn=<AddBackward0>)
tensor(246.2922, grad_fn=<AddBackward0>)
tensor(231.8720, grad_fn=<AddBackward0>)
tensor(218.5475, grad_fn=<AddBackward0>)
tensor(231.6078, grad_fn=<AddBackward0>)
tensor(203.1116, grad_fn=<AddBackward0>)
tensor(220.0393, grad_fn=<AddBackward0>)


In [28]:
# Saving Model for later use:
# Specify a path
PATH = "state_dict_model.pt"

# Save
torch.save(model.state_dict(), PATH)

# Evaluate the LSTM model

In [29]:
predicted = [predict(n, model) for n in test_data.name]

In [30]:
test_data['predicted'] = test_data.name.apply(lambda x: predict(x, model))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [31]:
test_data['predicted'] = test_data.name.apply(lambda x: predict(x, model))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [32]:
train_data['predicted'] = train_data.name.apply(lambda x: predict(x, model))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [33]:
train_data['predicted'] = train_data.name.apply(lambda x: predict(x, model))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [34]:
confusion_matrix(test_data.predicted, test_data.gender)

array([[2838,  141],
       [ 284, 2772]])

In [35]:
accuracy_score( test_data.predicted, test_data.gender )

0.9295774647887324

Demo:

In [36]:
predict('mahesh', model)

'm'

In [37]:
predict('riddhi', model)

'f'

In [38]:
predict('yash', model)

'm'

In [39]:
predict('gauri', model)

'f'

# Future Work:

- Remove the repeated names.
- Take only first name instead of the full name.
- Add more names.
- fix null values instead of removing them.
- Try other models.