In [1]:
import re
import random
import codecs
import string
import gensim
import unicodedata
import copy as cp
import numpy as np
%matplotlib inline
import matplotlib
import matplotlib.pyplot as pl
import mpl_toolkits.mplot3d.axes3d as p3
import sklearn.feature_selection as fs
from time import time
from gensim.models import doc2vec
from collections import namedtuple
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Binarizer
from sklearn.pipeline import Pipeline
from sklearn import naive_bayes
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, accuracy_score
from sklearn.cross_validation import KFold, LeaveOneOut, StratifiedKFold
from sklearn.grid_search import GridSearchCV
from sklearn.dummy import DummyClassifier
from nltk import word_tokenize          
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.cross_validation import train_test_split
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from scipy.sparse import vstack
from itertools import cycle
from sklearn.cluster import KMeans, AffinityPropagation, AgglomerativeClustering
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, f1_score
from sklearn.decomposition import PCA, SparsePCA, TruncatedSVD, RandomizedPCA, NMF
from sklearn.preprocessing import scale, Normalizer, Binarizer 
from sklearn.datasets.samples_generator import make_swiss_roll

In [2]:
all_ne = "juri-all-non-empty.csv"
all_e = "juri.csv"
train2_8 = "court_rulings_task2_8classes_train.csv"
test2_8 = "court_rulings_task2_8classes_test.csv"

In [3]:
stop = stopwords.words('french')
stop.append(u'dun')
stop.append(u'dune')
stop.append(u'les')

In [4]:
class LemmaTokenizer(object):
    def __init__(self):
        self.stemmer = SnowballStemmer("french")

    def __call__(self, doc):
        return [self.stemmer.stem(t) for t in word_tokenize(doc)]


def remove_accents(input_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    only_ascii = nfkd_form.encode('ASCII', 'ignore')
    return only_ascii


def get_preprocessor(suffix=''):
    def preprocess(unicode_text):
        return unicode_text.strip().lower() + suffix
    return preprocess

def preprocess_data(X, n, suffix='', binarize=True):
    vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 1),
                                 preprocessor=get_preprocessor(suffix), tokenizer=LemmaTokenizer())
    X = vectorizer.fit_transform(X)
    X = Binarizer(copy=False).fit_transform(X) if binarize else X
    return X

In [5]:
def load_data_all(filename=all_e):
    text, rule, area, date, claw = [], [], [], [], []
    with codecs.open(filename, 'r', encoding="utf-8") as f:
        for line in f:
            a = line.split("\t")
            if (line != "") and (len(a)==7):
                idx, loc, dec, dt, dsc, art, law  = line.split("\t") 
                if (loc != "") and (dsc != "") and (dec!="") and (dt!="") and (law!=""):
                    area.append(loc)
                    rule.append(dec)
                    d = str(int(dt.split("-")[0])/10)
                    date.append(d)
                    text.append(dsc)
                    claw.append(law)
                
    print len(rule), len(text)
#     text = np.array(text)
#     rule = np.array(rule)
    
#     rule = reduce_classes(rule)
    return text, area, date, rule, claw

def load_data(fname):
    text, label = [], []
    with codecs.open(fname, 'r', encoding="utf-8") as f:
        for line in f:
            a = line.split("\t")
            if (line != "") and (len(a)==2):
                text.append(a[0])
                label.append(a[1])
#     text, label = np.array(text), np.array(label)
    return text, label

In [6]:
Xa, ya = load_data("court_rulings_task1_8classes_test.csv")
Xr, yr = load_data("court_rulings_task2_8classes_test.csv")

In [7]:
def train_d2v(fin, fout):
    print "entered d2v training"
    sentences = doc2vec.TaggedLineDocument(fin)
    model_court = gensim.models.Doc2Vec(sentences, size=200, workers =10, window=20)

    model_court.save(fout)
    
def train_d2v2(fin, fout):
    analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')
    docs = []
    with codecs.open(fin, encoding='utf-8') as f:
        for line_no, line in enumerate(f):
            doc = gensim.utils.to_unicode(line).split()
            words = doc[:-1]
            tags = [line_no]
            docs.append(analyzedDocument(words, tags))
    print "len(docs)", len(docs)
            
    model = doc2vec.Doc2Vec(docs, size = 200, window = 50, min_count = 1, workers =10)
    model.save(fout)
    
    

In [8]:
# train_d2v2("court_rulings_task2_8classes_train.csv", "court-task2-8_train.d2v2")

In [9]:
# train_d2v2("court_rulings_task1_8classes_test.csv", "court-task1-8_test.d2v2")

In [10]:
model = doc2vec.Doc2Vec.load("court-task1-8_test.d2v2")
model2 = doc2vec.Doc2Vec.load("court-task2-8_test.d2v2")


In [11]:
def extract_features(words, n, count=True, reduced=True, n_labels=8):
    vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, n), binary=count, preprocessor=get_preprocessor())
    transformed_words = vectorizer.fit_transform(words)
#     transformed_words = np.array(transformed_words, dtype=np.float)
    
    if reduced:
        svd = TruncatedSVD(n_labels)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)
        reduced_X = lsa.fit_transform(transformed_words)
        return reduced_X, svd
    else:
        return transformed_words
    
def get_docvecs(vecs):
    np_vecs = np.zeros((len(vecs), len(vecs[0])), dtype=vecs[0].dtype)
    for i in range(len(vecs)):
        np_vecs[i] = vecs[i]
    return np_vecs

In [25]:
def affinity(X, labels, extract=False):
    if extract == True:
        print "Extracting features..."
        X, _ = extract_features(articles, 1, False)
    X_norms = np.sum(X * X, axis=1)
    S = -X_norms[:, np.newaxis] - X_norms[np.newaxis, :] + 2 * np.dot(X, X.T)
    p = 10 * np.median(S)

    print "Fitting affinity propagation clustering with unknown no of clusters..."
    af = AffinityPropagation().fit(S, p)
    indices = af.cluster_centers_indices_
#     for i, idx in enumerate(indices):
#         print i, articles[idx].encode("utf8")

    n_clusters_ = len(indices)

    print "Fitting PCA..."
    X = RandomizedPCA(2).fit(X).transform(X)    
    
    print "Plotting..."
    pl.figure(1)
    pl.clf()
    
    colors = cycle('bgrcmyk')
    for k, col in zip(range(n_clusters_), colors):
        class_members = af.labels_ == k
        cluster_center = X[indices[k]]
        pl.plot(X[class_members,0], X[class_members,1], col+'.')
        pl.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
                                         markeredgecolor='k', markersize=14)
        for x in X[class_members]:
            pl.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col) 

    pl.title('Estimated number of clusters: %d' % n_clusters_)
    pl.show()
    pl.savefig("affinity_cluster.png")

In [26]:
# X_new, _ = extract_features(X, 1, False, True, 8)
# affinity(X_new, y)

In [12]:
np_docvecs = get_docvecs(model.docvecs)
# affinity(np_docvecs, y)

In [13]:
len(model.docvecs), len(model2.docvecs)

(63214, 61362)

In [16]:
type(model.docvecs), type(np_docvecs)

(gensim.models.doc2vec.DocvecsArray, numpy.ndarray)

In [30]:
# Xa[500]

In [31]:
# maxim = 0
# for i in range(model.docvecs.count):
#     aux = model.docvecs.most_similar(i, topn=1)[0][1]
#     if maxim < aux:
#         maxim = aux
# maxim

In [32]:
a = model.docvecs.most_similar(30, topn=20)
r = model.docvecs.most_similar(30, topn=20)

In [45]:
a = []
print type(model.docvecs[0])
for i in range(len(model.docvecs)):
    a.append(model.docvecs[i])
    
print type(

<type 'numpy.ndarray'>
<type 'list'>


In [47]:
type(model.docvecs), type(model.docvecs[0])

(gensim.models.doc2vec.DocvecsArray, numpy.ndarray)

In [33]:
print "label of target document", ya[30]
print "-----------------"
print "label of top n most similar documents:"
print "-----------------"
for i in a:
    print ya[i[0]]

label of target document CHAMBRE_CIVILE_1

-----------------
label of top n most similar documents:
-----------------
CHAMBRE_CIVILE_3

CHAMBRE_CIVILE_1

CHAMBRE_CIVILE_1

CHAMBRE_SOCIALE

CHAMBRE_CIVILE_1

CHAMBRE_CIVILE_3

CHAMBRE_COMMERCIALE

CHAMBRE_SOCIALE

CHAMBRE_CIVILE_3

CHAMBRE_SOCIALE

CHAMBRE_CIVILE_1

CHAMBRE_SOCIALE

CHAMBRE_COMMERCIALE

CHAMBRE_SOCIALE

CHAMBRE_CIVILE_3

CHAMBRE_SOCIALE

CHAMBRE_CIVILE_1

CHAMBRE_CIVILE_2

CHAMBRE_CIVILE_1

CHAMBRE_SOCIALE



--------------------------

Cross val

In [17]:

skf = StratifiedKFold(y, n_folds=10)

NameError: name 'y' is not defined