In [1]:
%pylab inline
import codecs
import os
import re
import xml.sax.saxutils as saxutils

import pandas as pd
from bs4 import BeautifulSoup


Populating the interactive namespace from numpy and matplotlib


In [2]:
# General constants

# Folder holding the Reuters-21578 SGML files. Set the REUTERS_DATA_DIR
# environment variable to point at another copy; when it is unset (or empty)
# the original local path is used, so existing runs behave as before.
# os.path.join(..., '') guarantees a trailing separator, which the loading
# code relies on when it concatenates data_folder + file_name.
data_folder = os.path.join(
    os.environ.get('REUTERS_DATA_DIR') or '/Users/xiuli/Downloads/reuters21578/',
    '')

# Number of SGML files to read; they are indexed from 0.
sgml_number_of_files = 22
# 'NNN' is replaced with the zero-padded file index when a file is opened.
sgml_file_name_template = 'reut2-NNN.sgm'


In [3]:
def strip_tags(text):
    """Delete every ``<...>`` tag from ``text`` and trim surrounding whitespace.

    A tag is a ``<``, followed by one or more characters other than ``<``
    (matched lazily), followed by ``>``. Text between tags is kept as is.
    """
    tag_pattern = re.compile('<[^<]+?>')
    without_tags = tag_pattern.sub('', text)
    return without_tags.strip()

def unescape(text):
    """Decode XML entities in ``text`` using ``xml.sax.saxutils.unescape``.

    Called with no extra entity table, so only the entities that function
    handles by default are decoded.
    """
    decoded = saxutils.unescape(text)
    return decoded


### Parse the SGML files and load every document into a DataFrame

In [4]:
# Walk every SGML file and collect one entry per <reuters> element, in file
# order, into four parallel lists; the lists are then assembled into the
# document_X DataFrame.

document_Y = {}  # not used by the visible cells; kept so the name still exists

document_id = []
document_body = []
document_train_test = []
category = []

for i in range(sgml_number_of_files):
    # File names carry a three-digit, zero-padded index.
    seq = str(i).zfill(3)
    file_name = sgml_file_name_template.replace('NNN', seq)

    # errors='ignore' makes the decoder skip undecodable bytes instead of raising.
    with codecs.open(data_folder + file_name, 'r', encoding='utf-8',
                     errors='ignore') as file:
        # The whole file is lower-cased before parsing, so tag names,
        # attribute names and the article text are all lower case.
        # NOTE(review): no parser is passed to BeautifulSoup, so the result
        # depends on which parser is installed -- consider pinning one after
        # confirming it still exposes <body> inside <text>.
        content = BeautifulSoup(file.read().lower())

    for newsline in content('reuters'):
        # News-line id
        document_id.append(newsline['newid'])

        # News-line text: body markup stripped, then XML entities decoded.
        # When the article has no <body>, str(None) yields the string 'None',
        # which the filtering cell uses to drop the row.
        # NOTE(review): the replace() below looks like it never matches (the
        # displayed bodies still end in "reuter") -- presumably the parser has
        # already decoded '&#3;'; confirm before changing the pattern.
        raw_body = str(newsline('text')[0].body)
        document_body.append(
            unescape(strip_tags(raw_body).replace('reuter\n&#3;', '')))

        # Train/test marker taken from the element's 'lewissplit' attribute.
        document_train_test.append(
            strip_tags(str(newsline.attrs["lewissplit"])))

        # One list of topic names per document (possibly empty).
        category.append(
            [item.text for item in newsline.find("topics").findAll('d')])

# Build the frame in one call rather than column by column.
document_X = pd.DataFrame({
    'document_id': document_id,
    'document_body': document_body,
    'train_test_label': document_train_test,
    'category': category,
})

document_X.head()

Unnamed: 0,document_id,document_body,train_test_label,category
0,1,showers continued throughout the week in\nthe ...,train,[cocoa]
1,2,standard oil co and bp north america\ninc said...,train,[]
2,3,texas commerce bancshares inc's texas\ncommerc...,train,[]
3,4,bankamerica corp is not under\npressure to act...,train,[]
4,5,the u.s. agriculture department\nreported the ...,train,"[grain, wheat, corn, barley, oat, sorghum]"


### Filter out unlabelled or empty documents and split into train/test sets

In [5]:
# Keep only documents that have at least one topic and a real body.
# An empty topic list stringifies to '[]'; a missing <body> was stored as the
# string 'None' when the text was extracted.
has_category = document_X['category'].astype(str) != '[]'
has_body = document_X['document_body'].astype(str) != 'None'
usable = has_category & has_body

# 'train_test_label' holds the lower-cased 'lewissplit' attribute value.
train_rows = document_X[usable & (document_X['train_test_label'] == 'train')]
test_rows = document_X[usable & (document_X['train_test_label'] == 'test')]

# Plain Python lists: document texts (X) and their lists of topics (Y).
train_X = train_rows['document_body'].values.tolist()
test_X = test_rows['document_body'].values.tolist()

train_Y = train_rows['category'].values.tolist()
test_Y = test_rows['category'].values.tolist()


### Prepare for tokenization

In [6]:
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import re
 
cachedStopWords = stopwords.words("english")

# Built once at definition time instead of once per token / per call.
# _STOP_WORD_SET is a snapshot: later edits to cachedStopWords are not seen.
_STOP_WORD_SET = frozenset(cachedStopWords)
_STEMMER = PorterStemmer()
_LETTER_PREFIX = re.compile('[a-zA-Z]+')


def tokenize(text, min_length=3):
    """Split ``text`` into lower-cased, stop-word-free Porter stems.

    Steps, in order: tokenize with ``word_tokenize``, lower-case each token,
    drop tokens found in the English stop-word list, stem what is left, and
    keep only stems that begin with an ASCII letter and are at least
    ``min_length`` characters long.

    Args:
        text: The document to tokenize.
        min_length: Minimum stem length to keep (default 3).

    Returns:
        A list of stems, in document order.
    """
    lowered = (word.lower() for word in word_tokenize(text))
    stems = (_STEMMER.stem(word)
             for word in lowered
             if word not in _STOP_WORD_SET)
    # match() anchors only at the start, so a stem merely has to begin with a
    # letter; digits or punctuation later in the stem are allowed.
    return [stem for stem in stems
            if _LETTER_PREFIX.match(stem) and len(stem) >= min_length]

### Multiclass/multilabel targets - Tokenize, Vectorize (Tf-idf) and Model training (SVM) 

In [7]:
from nltk.corpus import stopwords, reuters
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier

# Train a one-vs-rest linear SVM on TF-IDF features of the training
# documents, then predict topic labels for the test documents.

# Same English stop-word list that tokenize() already filters on.
stop_words = stopwords.words("english")
 
# Tokenisation
# The custom tokenizer lower-cases, removes stop words and stems.
# NOTE(review): stop_words is passed as well, so filtering happens twice;
# presumably the vectorizer compares the unstemmed list against stemmed
# tokens -- confirm this is intended.
vectorizer = TfidfVectorizer(stop_words=stop_words,
                             tokenizer=tokenize)
 
# Learn and transform train documents
# The vocabulary and IDF weights are fitted on the training set only; the
# test set is transformed with that fitted vectorizer.
vectorised_train_documents = vectorizer.fit_transform(train_X)
vectorised_test_documents = vectorizer.transform(test_X)
 
# Transform multilabel labels
# Each document's list of topics becomes one row of a binary indicator matrix.
mlb = MultiLabelBinarizer()

# The label set is fitted on the training topics only.
# NOTE(review): topics that occur only in test_Y are presumably ignored with
# a warning by transform() -- verify against the cell output.
train_labels = mlb.fit_transform(train_Y)
test_labels = mlb.transform(test_Y)

# Classifier
# One LinearSVC per label; random_state is fixed for reproducibility.
classifier = OneVsRestClassifier(LinearSVC(random_state=42))
classifier.fit(vectorised_train_documents, train_labels)
 
# Binary indicator predictions for the test documents.
predictions = classifier.predict(vectorised_test_documents)

  .format(sorted(unknown, key=str)))


### Model Evaluation

In [8]:
from sklearn.metrics import f1_score,precision_score,recall_score
 
# Report precision, recall and F1 on the test set, first micro-averaged and
# then macro-averaged. After the loop, precision / recall / f1 hold the
# macro-averaged values.
for heading, average in (("Micro-average quality numbers", 'micro'),
                         ("Macro-average quality numbers", 'macro')):
    precision = precision_score(test_labels, predictions, average=average)
    recall = recall_score(test_labels, predictions, average=average)
    f1 = f1_score(test_labels, predictions, average=average)

    print(heading)
    print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

Micro-average quality numbers
Precision: 0.9416, Recall: 0.7838, F1-measure: 0.8555
Macro-average quality numbers
Precision: 0.4775, Recall: 0.2697, F1-measure: 0.3293


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  average, "true nor predicted", 'F-score is', len(true_sum)


### Please ignore below

### Compare test results visually 

In [9]:
# Put the true topics and the predicted topics side by side, one row per
# test document, and print the resulting frame.

# Turn the binary prediction matrix back into label names per document.
pred_inverse = mlb.inverse_transform(predictions)

row_count = len(predictions)
true = [test_Y[row] for row in range(row_count)]
pred = [pred_inverse[row] for row in range(row_count)]

compare = pd.DataFrame()
compare['true'] = true
compare['pred'] = pred
print(compare)

                                                   true  \
0                                               [trade]   
1                                               [grain]   
2                                      [crude, nat-gas]   
3        [trade, grain, rice, corn, sugar, tin, rubber]   
4                                   [veg-oil, palm-oil]   
5                                                [ship]   
6           [veg-oil, palm-oil, lumber, coffee, rubber]   
7                                        [grain, wheat]   
8                                                [gold]   
9                                                 [acq]   
10                                                [tin]   
11                                 [money-fx, interest]   
12                                        [acq, copper]   
13                                                [ipi]   
14    [trade, rice, livestock, carcass, grain, corn,...   
15                                               [earn] 

In [22]:
# Scratch comparison: show the second training document as extracted by this
# notebook, followed by a differently pre-processed version of the same text.
print('"Example with pre-processor1:"')
print(train_X[1])

# Blank lines to separate the two examples in the output.
print('\n')
print('\n')

# The text printed after this label is a hard-coded string pasted into the
# cell; it is not computed by any code in this notebook.
print('"Example with pre-processor2:"')
print(' u  agriculture department reported  farmer owned reserve national five day average price  february    follows dlrs bu sorghum cwt   natl loan release call avge rate x level price price wheat         iv       v       vi       corn         iv         v       x     rates natl loan release call avge rate x level price price oats         v       barley n      iv         v       sorghum         iv         v       reserves  ii  iii  matured level iv reflects grain entered  oct      feedgrain   july      wheat level v wheat barley        corn sorghum        level vi covers wheat entered  january     x  rates  dlrs per cwt   lbs n    available reuter')

"Example with pre-processor1:"
the u.s. agriculture department
reported the farmer-owned reserve national five-day average
price through february 25 as follows (dlrs/bu-sorghum cwt) -
         natl   loan           release   call
         avge   rate-x  level    price  price
 wheat   2.55   2.40       iv     4.65     --
                            v     4.65     --
                           vi     4.45     --
 corn    1.35   1.92       iv     3.15   3.15
                            v     3.25     --
 x - 1986 rates.

          natl   loan          release   call
          avge   rate-x  level   price  price
 oats     1.24   0.99        v    1.65    -- 
 barley   n.a.   1.56       iv    2.55   2.55
                             v    2.65    -- 
 sorghum  2.34   3.25-y     iv    5.36   5.36
                             v    5.54    -- 
    reserves i, ii and iii have matured. level iv reflects
grain entered after oct 6, 1981 for feedgrain and after july
23, 1981 for wheat. level v wheat/

In [23]:
# Scratch comparison: show the first training document as extracted by this
# notebook, followed by a differently pre-processed version of the same text.
print('"Example with pre-processor1:"')
print(train_X[0])

# Blank lines to separate the two examples in the output.
print('\n')
print('\n')

# The text printed after this label is a hard-coded string pasted into the
# cell; it is not computed by any code in this notebook.
print('"Example with pre-processor2:"')
print('showers continued throughout  week   bahia cocoa zone alleviating  drought since early january  improving prospects   coming temporao although normal humidity levels    restored comissaria smith said   weekly review  dry period means  temporao   late  year arrivals   week ended february        bags    kilos making  cumulative total   season      mln         stage last year   seems  cocoa delivered earlier  consignment  included   arrivals figures comissaria smith said   still  doubt    much old crop cocoa  still available  harvesting  practically come   end  total bahia crop estimates around     mln bags  sales standing  almost     mln     hundred thousand bags still   hands  farmers middlemen exporters  processors   doubts    much   cocoa would  fit  export  shippers   experiencing dificulties  obtaining  bahia superior  certificates  view   lower quality  recent weeks farmers  sold  good part   cocoa held  consignment comissaria smith said spot bean prices rose       cruzados per arroba    kilos bean shippers  reluctant  offer nearby shipment   limited sales  booked  march shipment           dlrs per tonne  ports   named new crop sales  also light    open ports  june july going           dlrs        dlrs  new york july aug sept               dlrs per tonne fob routine sales  butter  made march april sold               dlrs april may butter went      times new york may june july           dlrs aug sept           dlrs            times new york sept  oct dec      dlrs      times new york dec comissaria smith said destinations   u  covertible currency areas uruguay  open ports cake sales  registered       dlrs  march april   dlrs  may   dlrs  aug      times new york dec  oct dec buyers   u  argentina uruguay  convertible currency areas liquor sales  limited  march april selling           dlrs june july      dlrs       times new york july aug sept      dlrs       times new york sept  oct dec      times new york dec comissaria smith said total bahia sales  
currently estimated      mln bags       crop      mln bags       crop final figures   period  february    expected   published   brazilian cocoa trade commission  carnival  ends midday  february   reuter')

"Example with pre-processor1:"
showers continued throughout the week in
the bahia cocoa zone, alleviating the drought since early
january and improving prospects for the coming temporao,
although normal humidity levels have not been restored,
comissaria smith said in its weekly review.
    the dry period means the temporao will be late this year.
    arrivals for the week ended february 22 were 155,221 bags
of 60 kilos making a cumulative total for the season of 5.93
mln against 5.81 at the same stage last year. again it seems
that cocoa delivered earlier on consignment was included in the
arrivals figures.
    comissaria smith said there is still some doubt as to how
much old crop cocoa is still available as harvesting has
practically come to an end. with total bahia crop estimates
around 6.4 mln bags and sales standing at almost 6.2 mln there
are a few hundred thousand bags still in the hands of farmers,
middlemen, exporters and processors.
    there are doubts as to how much of this

In [30]:
# Last extracted body, taken from the unfiltered list built by the parsing cell.
document_body[-1:]

["the american stock exchange said it has\nintroduced options with expirations of up to three years on the\ninstitutional index.\n    with the ticker symbol <xii>, the index is a guage of the\ncore equity holdings of the nation's largest institutions, the\nexchange explained.\n    the new listings represent the first long-term options to\nbe traded by the amex, it added.\n    it said the long-term institutional index options began\ntrading monday with expirations of december 1988 <xiv> and\ndecember 1989 <xix>.\n   \n    the amex said a third long-term option with an expiration\nof december 1990 will begin trading following the december 1987\nexpiration.\n    it said strike prices on the long-term options have been\nset at 50 point intervals with initial strikes of 250, 300 and\n350. to avoid conflicting strike price codes, the 350 stike\nprices will carry the ticker symbols <xvv> for the option\nexpiring in december 1988 and <xvx> for the option expiring in\ndecember 1989.\n reuter\n\

In [28]:
# NOTE(review): `newsline` is the loop variable left over from the parsing
# cell, so this shows the <body> of the last <reuters> element that was
# parsed; it only works after that cell has been run.
newsline('text')[0].body

<body>the american stock exchange said it has
introduced options with expirations of up to three years on the
institutional index.
    with the ticker symbol &lt;xii&gt;, the index is a guage of the
core equity holdings of the nation's largest institutions, the
exchange explained.
    the new listings represent the first long-term options to
be traded by the amex, it added.
    it said the long-term institutional index options began
trading monday with expirations of december 1988 &lt;xiv&gt; and
december 1989 &lt;xix&gt;.
   
    the amex said a third long-term option with an expiration
of december 1990 will begin trading following the december 1987
expiration.
    it said strike prices on the long-term options have been
set at 50 point intervals with initial strikes of 250, 300 and
350. to avoid conflicting strike price codes, the 350 stike
prices will carry the ticker symbols &lt;xvv&gt; for the option
expiring in december 1988 and &lt;xvx&gt; for the option expiring in
december 198

In [31]:
# Last training document that survived the category/body filter.
train_X[-1:]

["australian trade unions said they have\nlaunched week-long strikes and other industrial action in new\nsouth wales (nsw) to protest against new laws that would reduce\ninjury compensation payments.\n    union sources said talks with the state government broke\ndown last night, but the two sides are scheduled to meet later\ntoday in an attempt to find a compromise.\n    rail freight and shipping cargo movements in the country's\nmost populous state were the first to be affected, and union\nofficials said almost every business sector will be hit unless\nthere is a quick settlement.\n    the state government recently introduced a new workers'\ncompensation act which would cut the cash benefits to injured\nworkers by up to a third. the act is now awaiting parliamentary\nratification.\n    nsw state premier barrie unsworth has said workers'\ncompensation has risen steeply in recent years and the proposed\ncuts would save hundreds of mlns of dollars a year.\n    union officials said indust

In [32]:
# Number of training documents kept after filtering.
len(train_X)

7068

In [33]:
# Number of test documents kept after filtering.
len(test_X)

2745