In [1]:
import pandas as pd
import string 
import nltk
import re 
import numpy as np
import warnings

pd.options.display.max_rows = 10

In [2]:
# have a look on the labels' file
mapping = pd.read_csv('data/Interview_Mapping.csv')
mapping

Unnamed: 0,Judgements,Area.of.Law
0,LNIND_1988_CAL_114,To be Tested
1,LNIND_1956_CAL_163,To be Tested
2,LNIND_1976_CAL_277,To be Tested
3,LNIND_1980_CAL_52,To be Tested
4,LNIND_1955_CAL_124,To be Tested
...,...,...
994,LNIND_1993_DEL_112,Criminal Laws
995,LNIND_1988_CAL_83,Service Law
996,LNIND_1993_DEL_16,Criminal Laws
997,LNIND_1957_CAL_46,Succession Laws


In [3]:
# create labels
unlabeled = []
labeled = []
labels = []

for index,row in mapping.iterrows():
    if row['Area.of.Law'] == 'To be Tested':
        unlabeled.append(row['Judgements'])
    else: 
        labeled.append(row['Judgements'])
        labels.append(row['Area.of.Law'])
        
# how much unique area of law        
print(len(set(labels)))

41


In [4]:
# load files
import os

unlabeled_text=[]
labeled_text=[]

for name in unlabeled:
    path = os.path.join('data/',name+'.txt')
    with open(path,'r',errors = 'ignore') as f:
        unlabeled_text.append(f.read())
for name in labeled:
    path = os.path.join('data/',name+'.txt')
    with open(path,'r',errors = 'ignore') as f:
        labeled_text.append(f.read())


In [5]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

stop = stopwords.words('english')
punct = string.punctuation

labeled_cleaned=[]
unlabeled_cleaned = []

for passage in labeled_text:
    # remove links 
    passage= re.sub(r'http(s)?:\/\/\S*', "", str(passage))
    # remove \n
    passage = ''.join([elem.replace('\n',' ') for elem in passage])
    # normalization and remove stopwords
    passage = ' '.join([elem for elem in passage.lower().split() if elem not in stop])
    #remove punctuation 
    passage = ''.join([elem.replace('[^\w\s]',' ') for elem in passage if elem not in punct])
    #remove digits
    passage = ''.join([elem for elem in passage if not elem.isdigit()])
    #lemmatization
    lemmatizer = WordNetLemmatizer()
    passage = ' '.join(lemmatizer.lemmatize(elem) for elem in passage.split())
    
    labeled_cleaned.append(passage)

    
for passage in unlabeled_text:
        
    # remove links 
    passage= re.sub(r'http(s)?:\/\/\S*', "", str(passage))
    # remove \n
    passage = ''.join([elem.replace('\n',' ') for elem in passage])
    # normalization and remove stopwords
    passage = ' '.join([elem for elem in passage.lower().split() if elem not in stop])
    #remove punctuation 
    passage = ''.join([elem.replace('[^\w\s]',' ') for elem in passage if elem not in punct])
    #remove digits
    passage = ''.join([elem for elem in passage if not elem.isdigit()])
    #lemmatization
    lemmatizer = WordNetLemmatizer()
    passage = ' '.join(lemmatizer.lemmatize(elem) for elem in passage.split())
    unlabeled_cleaned.append(passage)
    # it has to be a string so it could be processed later 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Flora\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Flora\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
# split the labeled data into training and validation set 
# use 7-3 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

train_text,val_text,train_labels,val_labels =train_test_split(labeled_cleaned,labels,test_size = 0.25,random_state = 0)

# do tfidf to get X_train and X_val
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_text)
X_val = vectorizer.transform(val_text)

# do tfidf to get X_test (unlabeled text that needed to be predicted)
X_test = vectorizer.transform(unlabeled_cleaned) #transform on test set, not fit_transform

# do label encoding to get y_train and y_val
encoder = LabelEncoder()
encoder.fit(labels)
y_train = encoder.transform(train_labels)
y_val = encoder.transform(val_labels)
all_labels = encoder.classes_

In [10]:
from sklearn.metrics import f1_score,accuracy_score,precision_score,recall_score,classification_report

def get_metrics(y_val, y_predicted,yHat_train,y_train):  
    # true positives / (true positives+false positives)
    precision = precision_score(y_val, y_predicted, pos_label=None,
                                    average='weighted')             
    # true positives / (true positives + false negatives)
    recall = recall_score(y_val, y_predicted, pos_label=None,
                              average='weighted')
    # harmonic mean of precision and recall
    f1 = f1_score(y_val, y_predicted, pos_label=None, average='weighted')
    # true positives + true negatives/ total
    accuracyTest = accuracy_score(y_val, y_predicted)
    accuracyTrain = accuracy_score(y_train,yHat_train)
    return accuracyTest,accuracyTrain, precision, recall, f1


In [8]:
# 1st model: Naive Bayes
warnings.filterwarnings('ignore')
from sklearn import naive_bayes

def modelNB(X_train,y_train,X_val,X_test):
    modelNB = naive_bayes.MultinomialNB()
    modelNB.fit(X_train,y_train)
    predicted_labels_ = modelNB.predict(X_val)
    result_ = modelNB.predict(X_test)
    print(result_)
    yHat_train_ = modelNB.predict(X_train)
    return predicted_labels_,result_,yHat_train_

predicted_labels,result,yHat_train=modelNB(X_train,y_train,X_val,X_test)
accuracyTest,accuracyTrain, precision, recall, f1 = get_metrics(y_val, predicted_labels,yHat_train,y_train)
print("Test accuracy = %.3f, Train accuracy = %.3f,precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracyTest,accuracyTrain, precision, recall, f1))


[ 6  6  6  6  6  6  6  6 23  6 23  6 23 39  6  6  6 39  6  6 23 23  6  6
 39  6 23  6 39 23  6 39  6 23  6 14  6  6  6  6  6  6  6 23  6  6  6  6
 39  6  6 39 39  6  6  6  6  6  6  6 23  6  6  6  6  6  6  6  6  6  6 23
  6  6  6  6 23  6  6  6  6 23  6 39  6  6  6  6  6 23 39 23 23  6  6  6
 23  6  6  6]
Test accuracy = 0.329, Train accuracy = 0.359,precision = 0.250, recall = 0.329, f1 = 0.214


In [14]:
# 2nd model: Logistic Regression
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression

def modelLR(X_train,y_train,X_val,X_test):
    modelLR = LogisticRegression(C=3.0, class_weight='balanced', solver='newton-cg', 
                         multi_class='multinomial', n_jobs=-1, random_state=0)
    modelLR.fit(X_train,y_train)
    predicted_labels_ = modelLR.predict(X_val)
    result_ = modelLR.predict(X_test)
    print(result_)
    yHat_train_ = modelLR.predict(X_train)
    return predicted_labels_,result_,yHat_train_

predicted_labels,result,yHat_train=modelLR(X_train,y_train,X_val,X_test)
accuracyTest,accuracyTrain, precision, recall, f1 = get_metrics(y_val, predicted_labels,yHat_train,y_train)
print("Test accuracy = %.3f, Train accuracy = %.3f,precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracyTest,accuracyTrain, precision, recall, f1))


[34 28  6  8 14  1  7 28 23  6 23 39 23 39  6 14 18 39 37 11 23 23 16  7
 39 25 23  8 39 36 36 39 34 23 34 14  2 11 28 15 16  6 13 36 34 34 14 38
 39 30 13 39 39 27  1 17  1 18  4 11 34 15 34 38 38 35 14 36  6  7 18 36
  6 11 37  8 23 13 20 36  6  7  1 39  5 39 34 37 36 23 39 23 23 14 27 16
 23 21 13  6]
Test accuracy = 0.631, Train accuracy = 0.917,precision = 0.641, recall = 0.631, f1 = 0.623


In [10]:
# 3nd model: SVM
# before applying SVMs,  standardize the data first.
# Apply SVD, I chose 120 components. 120-200 components are good enough for SVM model.
from sklearn import decomposition,preprocessing
from sklearn.decomposition import PCA, TruncatedSVD

svd = decomposition.TruncatedSVD(n_components=120)
svd.fit(X_train)
X_train_svd = svd.transform(X_train)
X_val_svd = svd.transform(X_val)
X_test_svd=svd.transform(X_test)
# Scale the data obtained from SVD. Renaming variable to reuse without scaling.
scl = preprocessing.StandardScaler()
scl.fit(X_train_svd)
X_train_svd_scl = scl.transform(X_train_svd)
X_val_svd_scl = scl.transform(X_val_svd)
X_test_svd_scl = scl.transform(X_test_svd)


In [11]:
# use SVM
import warnings
warnings.filterwarnings('ignore')

from sklearn import svm

def modelSVM(X_train,y_train,X_val,X_test):
    modelSVM = svm.SVC(C=1.0,probability=True)
    modelSVM.fit(X_train,y_train)
    predicted_labels_ = modelSVM.predict(X_val)
    result_ = modelSVM.predict(X_test)
    print(result_)
    yHat_train_ = modelSVM.predict(X_train)
    return predicted_labels_,result_,yHat_train_

predicted_labels,result,yHat_train=modelSVM(X_train,y_train,X_val,X_test)
accuracyTest,accuracyTrain, precision, recall, f1 = get_metrics(y_val, predicted_labels,yHat_train,y_train)
print("Before SVD: \nTest accuracy = %.3f, Train accuracy = %.3f,precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracyTest,accuracyTrain, precision, recall, f1))


predicted_labels_SVD,result_SVD,yHat_train_SVD =modelSVM(X_train_svd_scl,y_train,X_val_svd_scl,X_test_svd_scl)
accuracyTest_SVD,accuracyTrain_SVD, precision_SVD, recall_SVD, f1_SVD = get_metrics(y_val, predicted_labels_SVD,yHat_train_SVD,y_train)
print("After SVD: \nTest accuracy = %.3f, Train accuracy = %.3f,precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracyTest_SVD,accuracyTrain_SVD, precision_SVD, recall_SVD, f1_SVD))

# without truncatedSVD, the classifier was somehow underfit.
# with truncatedSVD, the accuracy was raised dramatically 

[6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6]
Before SVD: 
Test accuracy = 0.182, Train accuracy = 0.141,precision = 0.033, recall = 0.182, f1 = 0.056
[34  6  6 14  6  6  6 28 23  6 23 39 23 39  6 14  6 39 37  6 23 23  6  6
 39  6 23  6 39 36 36 39 34 23 34 14  1 23 14 15 16  6 13 36  6 34 14  6
 39 13 13 39 39  6  1 28  1  6  6  6  6  6 34 38 38  6  6 36  6  7  6 36
  6  6 37  6 23 13 23 36  6  7  1 39  6 39 34 37 36 23 39 23 23 13  6  6
 23 38 13  6]
After SVD: 
Test accuracy = 0.573, Train accuracy = 0.866,precision = 0.521, recall = 0.573, f1 = 0.513


In [42]:
# 4th model: XgBoost

import xgboost as xgb
def modelxgb(X_train,y_train,X_val,X_test):
    modelxgb = xgb.XGBClassifier(max_depth=6, n_estimators=200, colsample_bytree=0.5, 
                        subsample=0.5, nthread=10, learning_rate=0.01)
    modelxgb.fit(X_train,y_train)
    predicted_labels_ = modelxgb.predict(X_val)
    result_ = modelxgb.predict(X_test)
    print(result_)
    yHat_train_ = modelxgb.predict(X_train)
    return predicted_labels_,result_,yHat_train_

# see the result on tfidf data 
predicted_labels,result,yHat_train = modelxgb(X_train,y_train,X_val,X_test)
accuracyTest,accuracyTrain, precision, recall, f1 = get_metrics(y_val, predicted_labels,yHat_train,y_train)
print("1\nTest accuracy = %.3f, Train accuracy = %.3f,precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracyTest,accuracyTrain, precision, recall, f1))


[ 7 28  6  8 28  1 34 28 23  6 23 39 23 39  6 14 37 39 37 34 23 23 37  6
 39  6 23  7 39  8 36 39  6 23 34 14  1 34 14 15 37  6 13 36  6 39 14  6
 39 14 13 39 39  6  1  6  1 37  6  6 23 14 39 38 34 38 14 36 39 14 37 36
  6  6 37  6 23 13  8 14  6  7  1 39 36 39 34 37 36 23 39 23 23 14  6 16
 23 21 13  6]
1
Test accuracy = 0.613, Train accuracy = 0.929,precision = 0.552, recall = 0.613, f1 = 0.558


In [11]:
#try bag of word then tfidf

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def cv(data):
    count_vectorizer = CountVectorizer()
    emb = count_vectorizer.fit_transform(data)
    return emb, count_vectorizer

train_text,val_text,train_labels,val_labels = train_test_split(labeled_cleaned,labels,test_size = 0.25,random_state = 0)
X_train_count, count_vectorizer = cv(train_text)
X_val_count = count_vectorizer.transform(val_text)

# do tfidf to get X_test (unlabeled text that needed to be predicted)
X_test_count = count_vectorizer.transform(unlabeled_cleaned) #transform on test set, not fit_transform


In [12]:
#then do tfidf transformer to make training set and valid. set from occurences to freq.
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer()
X_train_tf = tfidf_transformer.fit_transform(X_train_count)
X_val_tf = tfidf_transformer.transform(X_val_count)
X_test_tf = tfidf_transformer.transform(X_test_count)


In [15]:
# see the result in model LR
predicted_labels,result,yHat_train=modelLR(X_train_tf,y_train,X_val_tf,X_test_tf)
accuracyTest,accuracyTrain, precision, recall, f1 = get_metrics(y_val, predicted_labels,yHat_train,y_train)
print("Test accuracy = %.3f, Train accuracy = %.3f,precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracyTest,accuracyTrain, precision, recall, f1))


[34 28  6  8 14  1  7 28 23  6 23 39 23 39  6 14 18 39 37 11 23 23 16  7
 39 25 23  8 39 36 36 39 34 23 34 14  2 11 28 15 16  6 13 36 34 34 14 38
 39 30 13 39 39 27  1 17  1 18  4 11 34 15 34 38 38 35 14 36  6  7 18 36
  6 11 37  8 23 13 20 36  6  7  1 39  5 39 34 37 36 23 39 23 23 14 27 16
 23 21 13  6]
Test accuracy = 0.631, Train accuracy = 0.917,precision = 0.641, recall = 0.631, f1 = 0.623


In [None]:
# see the result in model XGBoost
predicted_labels,result,yHat_train=modelxgb(X_train_tf,y_train,X_val_tf,X_test_tf)
accuracyTest,accuracyTrain, precision, recall, f1 = get_metrics(y_val, predicted_labels,yHat_train,y_train)
print("Test accuracy = %.3f, Train accuracy = %.3f,precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracyTest,accuracyTrain, precision, recall, f1))


In [16]:
# so we setup a pipeline and do some grid search on LR

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
text_clf_LR = Pipeline([('vect', CountVectorizer()),
                     ('tfidf',TfidfTransformer()),
                     ('clf', LogisticRegression( random_state=0) ),
 ])

In [19]:
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    #'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    #'tfidf__use_idf': (True, False),
    #'tfidf__norm': ('l1', 'l2'),
    'clf__penalty': ('l1','l2'),
    'clf__C': (0.01,0.1,1,3,10,100),
    }

In [20]:
from pprint import pprint
from time import time
import logging


grid_search = GridSearchCV(text_clf_LR, parameters, n_jobs=-1, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in text_clf_LR.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(train_text, train_labels)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()  
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
        


Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'clf__C': (0.01, 0.1, 1, 3, 10, 100),
 'clf__penalty': ('l1', 'l2'),
 'vect__max_df': (0.5, 0.75, 1.0),
 'vect__ngram_range': ((1, 1), (1, 2))}
Fitting 3 folds for each of 72 candidates, totalling 216 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 20.1min
[Parallel(n_jobs=-1)]: Done 216 out of 216 | elapsed: 24.4min finished


done in 1501.434s

Best score: 0.620
Best parameters set:
	clf__C: 100
	clf__penalty: 'l2'
	vect__max_df: 1.0
	vect__ngram_range: (1, 2)


In [22]:
text_clf_LR = Pipeline([('vect', CountVectorizer(max_df=1.0,ngram_range=(1,2))),
                     ('tfidf',TfidfTransformer()),
                     ('clf', LogisticRegression(C=100,penalty='l2',random_state=0)),
 ])

In [44]:
text_clf_LR.fit(train_text, y_train)  
predicted_ed2 = text_clf_LR.predict(val_text)
np.mean(predicted_ed2 == y_val)    

# which doesn't improve alot

0.6222222222222222

In [48]:
# try grid search on xgboost
text_clf_xgb = Pipeline([('vect', CountVectorizer()),
                     ('tfidf',TfidfTransformer()),
                     ('clf', xgb.XGBClassifier(random_state=0) ),
 ])
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    #'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    #'tfidf__use_idf': (True, False),
    #'tfidf__norm': ('l1', 'l2'),
    'clf__max_depth': (3,9,15),
    'clf__alpha': (0,0.1,0.5,1),
    'clf__Eta':(0.01,0.015,0.05,0.1)
    }

In [None]:
grid_search = GridSearchCV(text_clf_xgb, parameters, n_jobs=-1, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in text_clf_xgb.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(train_text, train_labels)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()  
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))


Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'clf__Eta': (0.01, 0.015, 0.05, 0.1),
 'clf__alpha': (0, 0.1, 0.5, 1),
 'clf__max_depth': (3, 9, 15),
 'vect__max_df': (0.5, 0.75, 1.0),
 'vect__ngram_range': ((1, 1), (1, 2))}
Fitting 3 folds for each of 288 candidates, totalling 864 fits


In [None]:
"""# write result in csv
with open('predictions.csv','w') as f:
    f.write('Judgements' + '\t' + 'Area of Law' + '\n')
    predictionList = all_labels[result]
    for i in range(0, len(result)):
        f.write(unlabeled[i] + '\t' + predictionList[i] + '\n')
        
sss = pd.read_csv('predictions.csv')
print(sss)"""