## Gender classification

In [1]:
def gender_features(word):
    return {'last_letter': word[-1]}

In [3]:
from nltk.corpus import names
import random
import nltk
names = ([(name, 'male') for name in names.words('male.txt')] + [(name,'female') for name in names.words('female.txt')])
random.shuffle(names)

## Split the data into training and test set

In [4]:
featuresets = [(gender_features(n), g) for (n,g) in names]
train_set, test_set = featuresets[500:], featuresets[:500]


## Train using naive bayes classifier

In [7]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [9]:
classifier.classify(gender_features('Niketan'))

'male'

In [10]:
classifier.classify(gender_features('Maya'))

'female'

## Dialogue Act types

In [11]:
posts = nltk.corpus.nps_chat.xml_posts()[:10000]

In [12]:
def dialogue_act_features(post):
    features = {}
    for word in nltk.word_tokenize(post):
        features[f'contains{word.lower()}'] = True
    return features

In [13]:
# training and test set
featuresets = [(dialogue_act_features(post.text), post.get('class')) for post in posts]
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.667

- Accuracy of 66.7 is still good

## Entropy

In [14]:
import math
def entropy(labels):
    freqdist = nltk.FreqDist(labels)
    probs = [freqdist.freq(l) for l in freqdist]
    return -sum(p * math.log(p,2) for p in probs)

In [15]:
entropy(['male', 'male', 'male', 'male'])

-0.0

In [17]:
entropy(['male', 'female', 'male', 'male'])


0.8112781244591328