In [205]:
import nltk
import os, sys, re, collections, string
from tqdm import tqdm
from operator import itemgetter as at
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from tqdm import tqdm
sys.path.append("../python")
from gensim.models import Word2Vec
import data
%matplotlib inline

In [206]:
from sklearn.feature_extraction import text
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
# Load the corpus through the project-local `data` module (found via the "../python" path entry).
# NOTE(review): presumably X holds the stemmed document texts and y the matching class labels — confirm against data.stemmed().
X,y = data.stemmed()

In [207]:
# 70/30 split with a fixed seed. The file names go through the same call so
# every train/test row keeps its source file name alongside it.
# NOTE(review): assumes data.listFiles() is ordered like X and y — confirm.
X_train, X_test, y_train, y_test, fname_train, fname_test = train_test_split(
    X,
    y,
    data.listFiles(),
    test_size=0.3,
    random_state=20180301,
)

Vectorization
---

In [208]:
class AvgWordVectors:
    """Vectorizer that represents a document as the weighted mean of its word vectors.

    Tokens without an embedding are ignored. A document in which no token has
    an embedding (including an empty document) is mapped to a zero vector.
    """
    def __init__(self, weights=None):
        """Create the vectorizer.

        weights: optional mapping token -> weight used when averaging;
                 tokens missing from the mapping get a weight of 1.0.
        """
        # Build the dict per instance: a `{}` default argument would be one
        # object shared (and mutated) across every instance.
        self.weights = {} if weights is None else weights
    def fit(self, w2v_file):
        """Load a saved gensim Word2Vec model from `w2v_file` and keep its word vectors.

        Returns self so that the call can be chained.
        """
        # NOTE: gensim model files are unpickled on load; only load trusted files.
        self.wv=Word2Vec.load(w2v_file).wv
        return self
    def transform(self, docs):
        """Return a list with one averaged vector (numpy array) per document in `docs`."""
        ret = []
        for doc in tqdm(docs):
            # Accumulators are reset for every document so nothing leaks
            # from the previous one.
            acc = None
            total_weight = 0.0
            for w in nltk.word_tokenize(doc):
                try:
                    vec = self.wv[w]
                except KeyError:
                    # Out-of-vocabulary token: it has no vector to contribute.
                    continue
                weight = self.weights.get(w, 1.0)
                if acc is None:
                    # The product is a new array, so the stored embedding is never modified.
                    acc = weight*np.asarray(vec)
                else:
                    acc += weight*vec
                total_weight += weight
            if acc is None or total_weight == 0:
                # Nothing to average (or weights cancel out): fall back to a
                # zero vector instead of raising or dividing by zero.
                ret.append(np.zeros(self.wv.vector_size))
            else:
                ret.append(acc/total_weight)
        return ret

In [209]:
## BOW sparse vectors: TF-IDF restricted to 1000 features, ignoring terms
## that appear in more than 5% of the training documents.
vectorizer = text.TfidfVectorizer(max_features=1000, max_df=0.05)
# Fit on the training split only, so the vocabulary is not learned from the test split.
vectorizer.fit(X_train)
## Word vectors:
#vectorizer = AvgWordVectors()
#vectorizer.fit("../data/w2v.pickle")
# NOTE: X_train / X_test are replaced by their vectorized form here, so this
# cell must not be re-run without re-running the train/test split first.
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)
X_test

<1329x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 30124 stored elements in Compressed Sparse Row format>

In [210]:
# Display the vocabulary kept by the fitted vectorizer.
# NOTE(review): this shows the whole vocabulary at once; newer scikit-learn releases replace
# get_feature_names() with get_feature_names_out() — check the installed version before changing it.
vectorizer.get_feature_names()

['aaa',
 'ab',
 'abandon',
 'abt',
 'abus',
 'acacia',
 'accredit',
 'accrual',
 'accuraci',
 'acquiror',
 'add',
 'addendum',
 'addresse',
 'adequaci',
 'adher',
 'adjudg',
 'admiss',
 'admit',
 'advertis',
 'advisori',
 'affidavit',
 'aforement',
 'aftertax',
 'agil',
 'agrinatur',
 'aid',
 'air',
 'akam',
 'alcohol',
 'allianc',
 'allot',
 'along',
 'alphabet',
 'alway',
 'am',
 'ambigu',
 'ameren',
 'ameripath',
 'amiworld',
 'amort',
 'amzg',
 'analysi',
 'angel',
 'annex',
 'answer',
 'antoni',
 'apart',
 'appendix',
 'appreci',
 'approxim',
 'arc',
 'arizona',
 'arrear',
 'arrow',
 'ascertain',
 'asid',
 'assent',
 'assignor',
 'atlanta',
 'atwil',
 'auditor',
 'austin',
 'authent',
 'authorship',
 'autom',
 'automot',
 'ave',
 'awarde',
 'back',
 'background',
 'bad',
 'baker',
 'ballot',
 'bancorp',
 'bancshar',
 'bar',
 'bargain',
 'basic',
 'bbt',
 'bdc',
 'beach',
 'becam',
 'beij',
 'beneath',
 'bequest',
 'berg',
 'bid',
 'bill',
 'biotim',
 'birth',
 'biweekli',
 'blackb

Fitting Logistic Regression
---
$$p(class|doc)=\frac{1}{1+e^{-(w_1f_1+w_2f_2+\dots+w_nf_n)}}$$
where $f_i$ are the TF-IDF-weighted word frequencies of the document and $w_i$ are the learned weights (one weight vector per class).

In [211]:
# Baseline classifier with default hyper-parameters, trained on the vectorized training split.
model = LogisticRegression().fit(X_train, y_train)
# Predict both splits; only the held-out predictions are reported below.
yh_test = model.predict(X_test)
yh_train = model.predict(X_train)
print(classification_report(y_test, yh_test))

             precision    recall  f1-score   support

        SPA       0.90      0.68      0.78        41
     bylaws       0.96      0.81      0.88       148
     credit       1.00      0.74      0.85        43
 employment       0.91      0.92      0.92       437
        rra       1.00      0.88      0.94        51
        rsu       0.89      0.95      0.92       609

avg / total       0.91      0.91      0.91      1329



## Understanding the model

### Important words

In [212]:
# The feature names are the same for every class, so fetch them once
# instead of once per loop iteration.
feature_names = vectorizer.get_feature_names()
for cls, coef in zip(model.classes_, model.coef_):
    # (word, coefficient) pairs ordered from the most positive to the most negative coefficient.
    weights = sorted(zip(feature_names, coef), key=at(1), reverse=True)
    ranked_words = [word for word, weight in weights]
    print ("==============\nTop ten words for {c}\n-------------------".format(c=cls))
    # First ten: strongest evidence for the class; last ten: strongest evidence against it.
    print ("(+) POSITIVE: "+",".join(ranked_words[:10]))
    print ("(-) NEGATIVE: "+",".join(ranked_words[-10:]))

Top ten words for SPA
-------------------
(+) POSITIVE: seller,buyer,accredit,accuraci,zbb,issuer,finder,insolv,moratorium,sophist
(-) NEGATIVE: bancorp,uncertif,ceii,posteffect,appendix,nonstatutori,untru,nonforfeit,underwritten,borrow
Top ten words for bylaws
-------------------
(+) POSITIVE: redempt,inspector,disinterest,unanim,thereat,stolen,wind,ballot,pleasur,uncertif
(-) NEGATIVE: grossup,unrestrict,iso,nonstatutori,untru,appendix,david,borrow,nonforfeit,seller
Top ten words for credit
-------------------
(+) POSITIVE: borrow,guarantor,matur,rmb,promissori,et,worth,ameren,revolv,lend
(-) NEGATIVE: iso,taxrel,buyer,bancorp,nonstatutori,uncertif,appendix,virginia,pennsylvania,nonforfeit
Top ten words for employment
-------------------
(+) POSITIVE: nonrenew,mr,biweekli,thencurr,semimonthli,car,inkind,club,grossup,discrimin
(-) NEGATIVE: unissu,splitup,nonstatutori,unrestrict,uncertif,bookentri,seller,untru,borrow,nonforfeit
Top ten words for rra
-------------------
(+) POSITIVE: u

### Where were we wrong?

In [213]:
# Collect every misclassified test document. "index" is the row position inside
# the test split (the row of X_test / y_test / yh_test), not a position in X,
# so it is derived from the test split itself rather than from len(X).
errors = [
    (i, f, actual, predicted)
    for i, (f, actual, predicted) in enumerate(zip(fname_test, y_test, yh_test))
    if actual != predicted
]
pd.DataFrame(errors, columns=["index", "file_name", "actual", "predicted"]).set_index("index")

Unnamed: 0_level_0,file_name,actual,predicted
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9,rra-38370.txt,rra,SPA
15,rsu-28510.txt,rsu,employment
33,SPA-00960.txt,SPA,rsu
46,employment-05320.txt,employment,rsu
58,bylaws-39990.txt,bylaws,rsu
69,SPA-00060.txt,SPA,rsu
77,rra-38950.txt,rra,rsu
81,rsu-33690.txt,rsu,employment
90,rsu-37200.txt,rsu,employment
121,rsu-34190.txt,rsu,employment


In [214]:
# Print what data.readFile returns for one document, for manual inspection.
# NOTE(review): the file name is hard-coded; presumably it was picked by hand from the misclassified documents — confirm.
print (data.readFile("rsu-28360.txt"))

##### 

Exhibit 10.23(B) 

YAHOO! INC.  
1995 STOCK PLAN  
(AS AMENDED AND RESTATED JUNE 12, 2007)  


THIS RESTRICTED STOCK AWARD AGREEMENT, (the “Agreement”), dated as of ___, 2007 (the “Date of Grant”), is made by and between Yahoo! Inc., a Delaware corporation (the “Company”), and ___(the “Grantee”). 

WHEREAS, the Company has adopted the Yahoo! Inc. 1995 Stock Plan, as amended (the “Plan”), pursuant to which the Company may grant Restricted Stock; 

WHEREAS, the Company desires to grant to the Grantee the number of shares of Restricted Stock provided for herein; 

NOW, THEREFORE, in consideration of the recitals and the mutual agreements herein contained, the parties hereto agree as follows: 

Section 1. Grant of Restricted Stock Award

(a) Grant of Restricted Stock. The Company hereby grants to the Grantee ___shares of Restricted Stock (the “Award”) on the terms and conditions set forth in this Agreement and as otherwise provided in the Plan. 

(b) Incorporation of Plan; Capitali

Testing various models
---

In [215]:
# Classifier comparison: enable exactly one `model = ...` line, then re-run this cell.
# References:
#   http://scikit-learn.org/stable/supervised_learning.html
#   http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html
#model = MultinomialNB()
model = LogisticRegression()
#model = SGDClassifier(loss="log")
#model = DecisionTreeClassifier()
#model = RandomForestClassifier()
#model = LinearSVC()
model.fit(X_train, y_train)
yh_train = model.predict(X_train)
yh_test = model.predict(X_test)
# Comparing train and test accuracy side by side gives a quick over-fitting check.
train_acc = accuracy_score(y_train, yh_train)
test_acc = accuracy_score(y_test, yh_test)
print ("Train Accuracy: {train}\nTest Accuracy: {test}".format(train=train_acc, test=test_acc))

Train Accuracy: 0.9125242091672047
Test Accuracy: 0.90895410082769


In [216]:
# Per-class precision / recall / F1 on the test split, using the yh_test predictions currently in scope.
print(classification_report(y_test, yh_test))

             precision    recall  f1-score   support

        SPA       0.90      0.68      0.78        41
     bylaws       0.96      0.81      0.88       148
     credit       1.00      0.74      0.85        43
 employment       0.91      0.92      0.92       437
        rra       1.00      0.88      0.94        51
        rsu       0.89      0.95      0.92       609

avg / total       0.91      0.91      0.91      1329

