# Text classification is a very useful technique in text analysis: it can be used for spam filtering, language identification, sentiment analysis, and genre classification.

#### The most basic requirement for a supervised text classifier is labeled category data, which can be used as training data. As an example, we use the NLTK Names corpus to train a gender identification classifier.

In [1]:
from nltk.corpus import names

In [2]:
import random

In [3]:
# Pair every name in the corpus with its gender label, male names first.
# NOTE: this rebinds `names` from the corpus reader to a plain list, so the
# cell cannot be re-run without importing `names` from nltk.corpus again.
names = [
    (name, label)
    for label, fileid in (('male', 'male.txt'), ('female', 'female.txt'))
    for name in names.words(fileid)
]

In [4]:
names

[('Aamir', 'male'),
 ('Aaron', 'male'),
 ('Abbey', 'male'),
 ('Abbie', 'male'),
 ('Abbot', 'male'),
 ('Abbott', 'male'),
 ('Abby', 'male'),
 ('Abdel', 'male'),
 ('Abdul', 'male'),
 ('Abdulkarim', 'male'),
 ('Abdullah', 'male'),
 ('Abe', 'male'),
 ('Abel', 'male'),
 ('Abelard', 'male'),
 ('Abner', 'male'),
 ('Abraham', 'male'),
 ('Abram', 'male'),
 ('Ace', 'male'),
 ('Adair', 'male'),
 ('Adam', 'male'),
 ('Adams', 'male'),
 ('Addie', 'male'),
 ('Adger', 'male'),
 ('Aditya', 'male'),
 ('Adlai', 'male'),
 ('Adnan', 'male'),
 ('Adolf', 'male'),
 ('Adolfo', 'male'),
 ('Adolph', 'male'),
 ('Adolphe', 'male'),
 ('Adolpho', 'male'),
 ('Adolphus', 'male'),
 ('Adrian', 'male'),
 ('Adrick', 'male'),
 ('Adrien', 'male'),
 ('Agamemnon', 'male'),
 ('Aguinaldo', 'male'),
 ('Aguste', 'male'),
 ('Agustin', 'male'),
 ('Aharon', 'male'),
 ('Ahmad', 'male'),
 ('Ahmed', 'male'),
 ('Ahmet', 'male'),
 ('Ajai', 'male'),
 ('Ajay', 'male'),
 ('Al', 'male'),
 ('Alaa', 'male'),
 ('Alain', 'male'),
 ('Alan', 'male

In [5]:
random.shuffle(names)

In [6]:
len(names)

7944

In [7]:
names[0:10]

[('Filip', 'male'),
 ('Maurits', 'male'),
 ('Gipsy', 'female'),
 ('Bennett', 'male'),
 ('Alston', 'male'),
 ('Trenna', 'female'),
 ('Mona', 'female'),
 ('Goldina', 'female'),
 ('Jock', 'male'),
 ('Jade', 'female')]

The dictionary returned by a feature extractor function (such as `gender_features` below) is called a feature set; it maps feature names to their values. Feature sets are the core input to an NLTK classifier: we use the feature extractor to build a feature set for every labeled name and then split the results into a training set and a test set:

In [9]:
def gender_features(word):
    """Return the feature set for *word*: its final character.

    Args:
        word: The name to extract features from.

    Returns:
        A dict with the single key ``'last_letter'`` mapped to the last
        character of ``word`` (case is left untouched), or to ``''`` when
        ``word`` is empty.
    """
    # Slice instead of indexing so an empty string yields '' rather than
    # raising IndexError.
    return {'last_letter': word[-1:]}

gender_features('Lohith')

{'last_letter': 'h'}

In [10]:
featuresets = [(gender_features(n), g) for (n, g) in names]

In [11]:
len(featuresets)

7944

In [12]:
featuresets[0:20]

[({'last_letter': 'p'}, 'male'),
 ({'last_letter': 's'}, 'male'),
 ({'last_letter': 'y'}, 'female'),
 ({'last_letter': 't'}, 'male'),
 ({'last_letter': 'n'}, 'male'),
 ({'last_letter': 'a'}, 'female'),
 ({'last_letter': 'a'}, 'female'),
 ({'last_letter': 'a'}, 'female'),
 ({'last_letter': 'k'}, 'male'),
 ({'last_letter': 'e'}, 'female'),
 ({'last_letter': 'y'}, 'female'),
 ({'last_letter': 'y'}, 'male'),
 ({'last_letter': 'e'}, 'female'),
 ({'last_letter': 'l'}, 'female'),
 ({'last_letter': 'a'}, 'female'),
 ({'last_letter': 'a'}, 'female'),
 ({'last_letter': 'l'}, 'male'),
 ({'last_letter': 's'}, 'female'),
 ({'last_letter': 'r'}, 'male'),
 ({'last_letter': 'a'}, 'female')]

In [13]:
train_set, test_set = featuresets[500:], featuresets[:500]

In [14]:
len(train_set)

7444

In [15]:
len(test_set)

500

## A classifier needs a learning algorithm. Here we show how to train a Naive Bayes classifier and a Maximum Entropy (Maxent) classifier; Naive Bayes is a generative model, whereas Maxent is a discriminative model.

#### Here is how to train a Naive Bayes classifier for Gender Identification:

In [16]:
from nltk import NaiveBayesClassifier

In [17]:
nb_classifier = NaiveBayesClassifier.train(train_set)

In [18]:
nb_classifier.classify(gender_features('Gary'))

'female'

In [19]:
nb_classifier.classify(gender_features('Grace'))

'female'

In [20]:
from nltk import classify

In [21]:
classify.accuracy(nb_classifier, test_set)

0.756

In [22]:
nb_classifier.show_most_informative_features(5)

Most Informative Features
             last_letter = 'a'            female : male   =     34.5 : 1.0
             last_letter = 'k'              male : female =     31.2 : 1.0
             last_letter = 'f'              male : female =     17.3 : 1.0
             last_letter = 'v'              male : female =     11.2 : 1.0
             last_letter = 'p'              male : female =     11.2 : 1.0


#### Here is how to train a Maximum Entropy Classifier for Gender Identification:

In [23]:
from nltk import MaxentClassifier

In [24]:
me_classifier = MaxentClassifier.train(train_set)

  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -0.69315        0.371
             2          -0.37438        0.763
             3          -0.37396        0.763
             4          -0.37371        0.763
             5          -0.37354        0.763
             6          -0.37342        0.763
             7          -0.37332        0.763
             8          -0.37325        0.763
             9          -0.37320        0.763
            10          -0.37315        0.763
            11          -0.37311        0.763
            12          -0.37308        0.763
            13          -0.37305        0.763
            14          -0.37303        0.763
            15          -0.37301        0.763
            16          -0.37299        0.763
            17          -0.37297        0.763
            18          -0.37296        0.763
            19          -0.37294        0.763
 

In [25]:
me_classifier.classify(gender_features('Gary'))

'female'

In [26]:
me_classifier.classify(gender_features('Grace'))

'female'

In [27]:
classify.accuracy(me_classifier, test_set)

0.756

In [28]:
me_classifier.show_most_informative_features(5)

   6.644 last_letter==' ' and label is 'female'
   6.644 last_letter=='c' and label is 'male'
  -4.918 last_letter=='a' and label is 'male'
  -3.481 last_letter=='k' and label is 'female'
  -2.755 last_letter=='f' and label is 'female'


## It seems that the Naive Bayes and Maxent models give the same result on this gender task, but that does not hold in general. Choosing the right features and deciding how to encode them for the task has a big impact on performance. Here we define a more complex feature extractor function and train the models again:

In [29]:
def gender_features2(name):
    """Return a letter-based feature set for *name*.

    All features are computed case-insensitively.

    Args:
        name: The name to extract features from.

    Returns:
        A dict containing:
          * ``'firstletter'`` / ``'lastletter'``: the lower-cased first and
            last character of ``name`` (``''`` when ``name`` is empty);
          * ``'count(x)'`` for every letter a-z: how many times the letter
            occurs in the lower-cased name;
          * ``'has(x)'`` for every letter a-z: ``True`` if the letter occurs
            at least once, otherwise ``False``.
    """
    # Lower-case once up front instead of on every loop iteration.
    lowered = name.lower()

    features = {}
    # Slice (rather than index) so an empty name gives '' instead of raising
    # IndexError.
    features["firstletter"] = name[:1].lower()
    features["lastletter"] = name[-1:].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        occurrences = lowered.count(letter)
        features["count(%s)" % letter] = occurrences
        # A letter is present exactly when its count is non-zero, so the
        # string does not need to be scanned a second time.
        features["has(%s)" % letter] = occurrences > 0
    return features

In [30]:
gender_features2('Gary')

{'count(a)': 1,
 'count(b)': 0,
 'count(c)': 0,
 'count(d)': 0,
 'count(e)': 0,
 'count(f)': 0,
 'count(g)': 1,
 'count(h)': 0,
 'count(i)': 0,
 'count(j)': 0,
 'count(k)': 0,
 'count(l)': 0,
 'count(m)': 0,
 'count(n)': 0,
 'count(o)': 0,
 'count(p)': 0,
 'count(q)': 0,
 'count(r)': 1,
 'count(s)': 0,
 'count(t)': 0,
 'count(u)': 0,
 'count(v)': 0,
 'count(w)': 0,
 'count(x)': 0,
 'count(y)': 1,
 'count(z)': 0,
 'firstletter': 'g',
 'has(a)': True,
 'has(b)': False,
 'has(c)': False,
 'has(d)': False,
 'has(e)': False,
 'has(f)': False,
 'has(g)': True,
 'has(h)': False,
 'has(i)': False,
 'has(j)': False,
 'has(k)': False,
 'has(l)': False,
 'has(m)': False,
 'has(n)': False,
 'has(o)': False,
 'has(p)': False,
 'has(q)': False,
 'has(r)': True,
 'has(s)': False,
 'has(t)': False,
 'has(u)': False,
 'has(v)': False,
 'has(w)': False,
 'has(x)': False,
 'has(y)': True,
 'has(z)': False,
 'lastletter': 'y'}

In [31]:
featuresets = [(gender_features2(n), g) for (n, g) in names]

In [32]:
train_set, test_set = featuresets[500:], featuresets[:500]

In [33]:
nb2_classifier = NaiveBayesClassifier.train(train_set)

In [34]:
classify.accuracy(nb2_classifier, test_set)

0.792

In [35]:
me2_classifier = MaxentClassifier.train(train_set)  ### This takes a while

  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -0.69315        0.371
             2          -0.61247        0.629
             3          -0.59850        0.629
             4          -0.58534        0.630
             5          -0.57296        0.636
             6          -0.56133        0.651
             7          -0.55041        0.669
             8          -0.54017        0.687
             9          -0.53056        0.702
            10          -0.52153        0.713
            11          -0.51306        0.724
            12          -0.50509        0.735
            13          -0.49759        0.742
            14          -0.49054        0.749
            15          -0.48388        0.753
            16          -0.47760        0.757
            17          -0.47168        0.762
            18          -0.46607        0.764
            19          -0.46076        0.768
 

In [36]:
classify.accuracy(me2_classifier, test_set)

0.82

## It seems that more features make the Maximum Entropy model more accurate, but also slower to train. We can define a third feature extractor function and train the Naive Bayes and Maxent classifiers again:

In [38]:
def gender_features3(name):
    """Return a compact prefix/suffix feature set for *name*.

    Args:
        name: The name to extract features from.

    Returns:
        A dict with four lower-cased string features:
          * ``'fl'``: the first letter;
          * ``'ll'``: the last letter;
          * ``'fw'``: the first two letters (prefix);
          * ``'lw'``: the last two letters (suffix).
        Names shorter than two characters simply produce shorter values, and
        an empty name maps every feature to ``''``.
    """
    features = {}
    # Slices are used for all four features so that short or empty names
    # never raise IndexError.
    features["fl"] = name[:1].lower()   # first letter
    features["ll"] = name[-1:].lower()  # last letter
    features["fw"] = name[:2].lower()   # two-letter prefix
    features["lw"] = name[-2:].lower()  # two-letter suffix
    return features

In [39]:
gender_features3('Gary')

{'fl': 'g', 'fw': 'ga', 'll': 'y', 'lw': 'ry'}

In [40]:
gender_features3('G')

{'fl': 'g', 'fw': 'g', 'll': 'g', 'lw': 'g'}

In [41]:
featuresets = [(gender_features3(n), g) for (n, g) in names]

In [42]:
featuresets[0]

({'fl': 'f', 'fw': 'fi', 'll': 'p', 'lw': 'ip'}, 'male')

In [43]:
len(featuresets)

7944

In [44]:
train_set, test_set = featuresets[500:], featuresets[:500]

In [45]:
nb3_classifier = NaiveBayesClassifier.train(train_set)

In [46]:
classify.accuracy(nb3_classifier, test_set)

0.83

In [47]:
me3_classifier = MaxentClassifier.train(train_set)

  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -0.69315        0.371
             2          -0.40759        0.798
             3          -0.35145        0.817
             4          -0.32621        0.824
             5          -0.31201        0.826
             6          -0.30291        0.828
             7          -0.29655        0.831
             8          -0.29183        0.832
             9          -0.28818        0.832
            10          -0.28525        0.832
            11          -0.28285        0.832
            12          -0.28083        0.832
            13          -0.27911        0.833
            14          -0.27763        0.834
            15          -0.27634        0.834
            16          -0.27521        0.834
            17          -0.27420        0.834
            18          -0.27330        0.835
            19          -0.27250        0.835
 

In [48]:
classify.accuracy(me3_classifier, test_set)

0.826