In [2]:
import html
import pprint
import re
from html.parser import HTMLParser

In [22]:
class A(object):
    def __init__(self):
        self.x = 'Hello'
        self.y = 'Howdy'
        self.z = 'how'

    def method_a(self, foo):
        print self.x + ' ' + foo
    
    def method_b(self, foo, foo2):
        print foo + ', ' + foo2 + ' ' + self.z

In [23]:
asdf = A()
asdf.method_b('Silar', 'Saylor')

Silar, Saylor how


In [6]:
a = A() 
a.method_a('Sailor!')

Hello Sailor!


In [75]:
class ReutersParser(HTMLParser):
    """
    ReutersParser subclasses HTMLParser and is used to open the SGML
    files associated with the Reuters-21578 categorised test collection.

    The parser is a generator and will yield a single document at a time.
    Since the data will be chunked on parsing, it is necessary to keep 
    some internal state of when tags have been "entered" and "exited".
    Hence the in_body, in_topics and in_topic_d boolean members.
    """
    def __init__(self, encoding='latin-1'):
        """
        Initialise the superclass (HTMLParser) and reset the parser.
        Sets the encoding of the SGML files by default to latin-1.
        """
        html.parser.HTMLParser.__init__(self)
        self._reset()
        self.encoding = encoding

    def _reset(self):
        """
        This is called only on initialisation of the parser class
        and when a new topic-body tuple has been generated. It
        resets all off the state so that a new tuple can be subsequently
        generated.
        """
        self.in_body = False
        self.in_topics = False
        self.in_topic_d = False
        self.body = ""
        self.topics = []
        self.topic_d = ""

    def parse(self, fd):
        """
        parse accepts a file descriptor and loads the data in chunks
        in order to minimise memory usage. It then yields new documents
        as they are parsed.
        """
        self.docs = []
        for chunk in fd:
            self.feed(chunk.decode(self.encoding))
            for doc in self.docs:
                yield doc
            self.docs = []
        self.close()

    def handle_starttag(self, tag, attrs):
        """
        This method is used to determine what to do when the parser
        comes across a particular tag of type "tag". In this instance
        we simply set the internal state booleans to True if that particular
        tag has been found.
        """
        if tag == "reuters":
            pass
        elif tag == "body":
            self.in_body = True
        elif tag == "topics":
            self.in_topics = True
        elif tag == "d":
            self.in_topic_d = True 

    def handle_endtag(self, tag):
        """
        This method is used to determine what to do when the parser
        finishes with a particular tag of type "tag". 

        If the tag is a <REUTERS> tag, then we remove all 
        white-space with a regular expression and then append the 
        topic-body tuple.

        If the tag is a <BODY> or <TOPICS> tag then we simply set
        the internal state to False for these booleans, respectively.

        If the tag is a <D> tag (found within a <TOPICS> tag), then we
        append the particular topic to the "topics" list and 
        finally reset it.
        """
        if tag == "reuters":
            self.body = re.sub(r'\s+', r' ', self.body)
            self.docs.append( (self.topics, self.body) )
            self._reset()
        elif tag == "body":
            self.in_body = False
        elif tag == "topics":
            self.in_topics = False
        elif tag == "d":
            self.in_topic_d = False
            self.topics.append(self.topic_d)
            self.topic_d = ""  

    def handle_data(self, data):
        """
        The data is simply appended to the appropriate member state
        for that particular tag, up until the end closing tag appears.
        """
        if self.in_body:
            self.body += data
        elif self.in_topic_d:
            self.topic_d += data
            
def obtain_topic_tags():
    """
    Open the topic list file and import all of the topic names
    taking care to strip the trailing "\n" from each word.
    """
    topics = open(
        "data/all-topics-strings.lc.txt", "r"
    ).readlines()
    topics = [t.strip() for t in topics]
    return topics

def filter_doc_list_through_topics(topics, docs):
    """
    Reads all of the documents and creates a new list of two-tuples
    that contain a single feature entry and the body text, instead of
    a list of topics. It removes all geographic features and only 
    retains those documents which have at least one non-geographic
    topic.
    """
    ref_docs = []
    for d in docs:
        if d[0] == [] or d[0] == "":
            continue
        for t in d[0]:
            if t in topics:
                d_tup = (t, d[1])
                ref_docs.append(d_tup)
                break
    return ref_docs



if __name__ == "__main__":
    # Create the list of Reuters data and create the parser
    files = ["data/reut2-%03d.sgm" % r for r in range(0, 6)]
    parser = ReutersParser()

    # Parse the document and force all generated docs into
    # a list so that it can be printed out to the console
    docs = []
    for fn in files:
        for d in parser.parse(open(fn, 'rb')):
            docs.append(d)

    # Obtain the topic tags and filter docs through it 
    topics = obtain_topic_tags()
    ref_docs = filter_doc_list_through_topics(topics, docs)
        

In [76]:
import pandas as pd

df = pd.DataFrame(ref_docs)
df.head()



Unnamed: 0,0,1
0,cocoa,Showers continued throughout the week in the B...
1,grain,The U.S. Agriculture Department reported the f...
2,veg-oil,Argentine grain board figures show crop regist...
3,earn,Champion Products Inc said its board of direct...
4,acq,Computer Terminal Systems Inc said it has comp...


In [110]:
df[0].value_counts()

earn               1254
acq                 705
grain               179
crude               173
money-fx            153
trade               118
interest             77
money-supply         71
ship                 69
coffee               47
sugar                44
gold                 42
gnp                  38
veg-oil              30
cpi                  25
bop                  25
livestock            25
oilseed              24
reserves             21
jobs                 18
rubber               17
copper               15
ipi                  14
cocoa                13
carcass              12
nat-gas              11
housing              11
wpi                  11
retail               11
alum                 11
orange               10
cotton                8
iron-steel            8
meal-feed             8
lei                   7
pet-chem              6
zinc                  6
gas                   6
hog                   6
strategic-metal       5
tin                   5
heat            

In [102]:
df

Unnamed: 0,0,1
0,cocoa,Showers continued throughout the week in the B...
1,grain,The U.S. Agriculture Department reported the f...
2,veg-oil,Argentine grain board figures show crop regist...
3,earn,Champion Products Inc said its board of direct...
4,acq,Computer Terminal Systems Inc said it has comp...
5,earn,"Shr 34 cts vs 1.19 dlrs Net 807,000 vs 2,858,0..."
6,earn,"Ohio Mattress Co said its first quarter, endin..."
7,earn,Oper shr loss two cts vs profit seven cts Oper...
8,earn,Shr one dlr vs 73 cts Net 12.6 mln vs 15.8 mln...
9,earn,Dean Foods Co expects earnings for the fourth ...


In [91]:
1254.0/3373

0.37177586718055144

In [97]:
df[0].value_counts() == 1

11

In [77]:
from sklearn.feature_extraction.text import TfidfVectorizer


def create_tfidf_training_data(docs):
    """
    Creates a document corpus list (by stripping out the
    class labels), then applies the TF-IDF transform to this
    list. 

    The function returns both the class label vector (y) and 
    the corpus token/feature matrix (X).
    """
    # Create the training data class labels
    y = df[0]

    # Create the document corpus list
    corpus = df[1]

    # Create the TF-IDF vectoriser and transform the corpus
    vectorizer = TfidfVectorizer(min_df=1)
    X = vectorizer.fit_transform(corpus)
    return X, y



X, y = create_tfidf_training_data(df)

In [78]:
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
  X, y, test_size=0.2, random_state=42
)

In [94]:
from sklearn.svm import SVC

def train_svm(X, y):
    """
    Create and train the Support Vector Machine.
    """
    svm = SVC(C=1000000.0, gamma='auto', kernel='rbf')
    svm.fit(X, y)
    return svm

svm = train_svm(X_train, y_train)

In [95]:
from sklearn.metrics import confusion_matrix

pred = svm.predict(X_test)

print(svm.score(X_test, y_test))
print(confusion_matrix(pred, y_test))

0.805925925926
[[137   0   0 ...,   0   1   1]
 [  0   1   0 ...,   0   0   0]
 [  0   0   0 ...,   0   0   0]
 ..., 
 [  0   0   0 ...,   3   0   0]
 [  0   0   0 ...,   0   0   0]
 [  0   0   0 ...,   0   0   0]]


In [109]:
(confusion_matrix(pred, y_test)).shape

(47, 47)