### Legal Text Summarization

#### Import libraries/packages

In [27]:
import os
import shutil
import textract
import numpy as np

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import corpus, pos_tag 
import gensim 
import enchant
import re
import math
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import logging
import pprint

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

#### Configure pretty print

In [8]:
pp = pprint.PrettyPrinter( indent = 4, width = 150 )

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

#### Get the english dictionary words

In [9]:
dictionary = enchant.Dict( "en_US" )

#### Define all required methods

In [45]:
def get_files_in_path( path ) :
    
    for root, dirs, files in os.walk( path ) :
        
        for filename in files :
            
            yield root + "/" + filename
            
            
def load_text_from_file( filename ) :
    
    contents = textract.process( filename, encoding='ascii' )
    contents = re.sub( '\\n',' ', str( contents ) )
    
    return contents


def delete_dir_contents( path ) :
    
    shutil.rmtree( path )
    
    os.makedirs( path )
    
    
def number_of_files_in_path( path ) :
    
    return len( next( os.walk( path ) )[ 2 ] )


def get_file_extension( filename ) :
    
    return os.path.splitext( filename )[ 1 ]


def get_file_basename( filename ) :
    
    return os.path.basename( filename )


def is_supported_file_type( filename ) :
    
    # A list of all supported extensions
    valid_extensions = [ ".doc", ".docx", ".pdf", ".rtf" ]
    
    return valid_extensions.count( get_file_extension( filename ) ) == 1


def generate_next_filename( path, extension ) :
    
    return path + "/" + str( number_of_files_in_path( path ) + 1 ) + extension


def write_data_to_file( filename, data ) :
    
    # Open the file for writing
    file = open( filename, 'w' )
    
    # Write the data to the file
    file.write( data )
    
    # Close the file
    file.close()
    
    
def generate_raw_files( source_dir, target_dir ) :
    
    # Clear all existing raw cases
    delete_dir_contents( target_dir + "/cases" )

    # Clear all existing raw summaries
    delete_dir_contents( target_dir + "/summaries" )
    
    for filename in get_files_in_path( source_dir ) :
    
        # Get the file extension
        extension = get_file_extension( filename )

        if is_supported_file_type( filename ) :

            # Load the file contents
            contents = load_text_from_file( filename )

            # Generate a new filename based on a numerical sequence
            new_filename = generate_next_filename( target_dir + "/" + ( "summaries" if "1.0" in filename else "cases" ), ".txt" )

            # Write the new filename
            write_data_to_file( new_filename, contents )
    
    
def tokenize_file_contents( filename ) :
    
    # Load the stop words
    stop_words = corpus.stopwords.words('english')
    
    # Load the file contents
    contents = load_text_from_file( filename )
    
    # Define the filename tokens
    content_tokens = []
    
    # Tokenize the sentences from the file contents
    sentence_tokens = sent_tokenize( contents )
    
    for sentence in sentence_tokens :
        
        # Tokenize the words from the sentence
        word_tokens = word_tokenize( sentence )
        
        # Remove the punctuation and convert the words to lowercase
        words = [ word.lower() for word in word_tokens if word.isalpha() and dictionary.check( word ) ]
        
        # Remove the stop words
        words = [ word for word in words if not word in stop_words ]
        
        # Append the sentence and tokens to the list
        content_tokens.append( { 'sentence': sentence, 'words': words } )
        
    return content_tokens


def load_files_in_path( path ) :
    
    # Create a list of tokens
    contents = []
    
    for filename in get_files_in_path( path ) :
        
        # Append a dictionary to the contents
        contents.append( load_text_from_file( filename ) )
        
    return contents



def tokenize_files_in_path( path ) :
    
    # Create a list of tokens
    tokens = []
    
    for filename in get_files_in_path( path ) :
        
        # Append a dictionary to the tokens
        tokens.append( { "filename": get_file_basename( filename ), "tokens": tokenize_file_contents( filename ) } )
        
    return tokens


def create_word2vec_model( entries ) :
    
    # Create documents list
    documents = []
    
    for entry in entries :
        
        # Create a file token
        file_tokens = []
        
        for sentence in entry[ "tokens" ] :
            
            # Add the word tokens in the document
            file_tokens =  file_tokens + sentence[ "words" ]
            
        # Add the file tokens to the document
        documents.append( file_tokens )
    
    model = gensim.models.Word2Vec ( documents, size = 50, window = 5, min_count = 1, workers = 100 )
    
    model.train( documents, total_examples = len( documents ), epochs = 10 )
    
    return model


def get_synonyms( model, word, min_similarity = 0.85 ) :
    
    # Define the similar words
    synonyms = []
    
    # Get related word2vec words
    suggested_words = model.wv.most_similar( positive = word, topn = 5 )
    
    # Limit words to a given similarity
    for word in suggested_words :
        
        if word[ 1 ] > min_similarity :
        
            # Add word to 
            synonyms.append( word[ 0 ] )
            
    return synonyms


def get_tfidf( file_entries ) :
    
    corpus = []
    
    for file_index, file_entry in enumerate( file_entries ) :
        
        document = []
        
        for token_index, token in enumerate( file_entry[ 'tokens' ] ) :
            
            document.append( token[ 'sentence' ] )
            
        corpus.append( " ".join( document ) )
    
    tf = TfidfVectorizer( smooth_idf = False, sublinear_tf = False, norm = None, analyzer = 'word' )
    
    tfidf = ( tf, tf.fit( corpus ) )
    
    return dict( zip( tfidf[ 1 ].get_feature_names(), tfidf[ 0 ].idf_ ) )


def generate_score( weight, sentence_position, total_sentences, sentence_length, sentence_length_mean, sentence_length_std ) :
    
    position_factor = math.sin( math.radians( ( float( sentence_position ) / total_sentences ) * 90 ) )
    
    length_factor = sentence_length_mean / np.square( sentence_length  - ( sentence_length_std * 2 ) )

    return position_factor * length_factor * weight


def score_tokenized_sentences( file_entries, word2vec_model ):
    
    tfidf = get_tfidf( file_entries )
    
    # Iterate through the files
    for file_index, file_entry in enumerate( file_entries ) :
        
        #if file_index < 10 :

            # Define the cleaned sentences
            cleaned_sentences = []
            sentences_lengths = []

            # Iterate through the tokens
            for token_index, token in enumerate( file_entry['tokens' ] ):

                # Add the words to the cleaned sentences
                cleaned_sentences.append( " ".join( str( word ) for word in token['words'] ) )
                sentences_lengths.append( len( token[ 'words' ] ) )

            for token_index, cleaned_sentence in enumerate( cleaned_sentences ) :
                
                # Words
                words = file_entries[ file_index ][ 'tokens' ][ token_index ][ 'words' ]

                # Define the sentence score
                sentence_score = 0

                # Iterate through the bag of words
                for word_index, word in enumerate( words ) :

                    if tfidf.has_key( word ) :
                        
                        # Increment the sentence score by the score
                        sentence_score = sentence_score + tfidf[ word ]

                # Get the sentence tags
                pos_tags = pos_tag( file_entries[ file_index ][ 'tokens' ][ token_index ][ 'words' ] )

                # Define the tags
                tags = []

                for tag in enumerate( pos_tags ) :

                    tags.append( tag[ 1 ][ 1 ] )

                weight = 0 if len( words ) < 1 else float( sentence_score ) / len( words )
                total_sentences = len( cleaned_sentences )
                sentence_position = token_index + 1
                sentence_length = len( file_entries[ file_index ][ 'tokens' ][ token_index ][ 'words' ] )
                sentence_length_mean = np.mean( sentences_lengths )
                sentence_length_std = np.std( sentences_lengths )

                # Create the store dictionary
                score = { 
                    'weight': weight, 
                    'total_sentences': total_sentences, 
                    'sentence_position': sentence_position, 
                    'tense': 'past' if 'VBD' in tags else 'present' if 'VBG' in tags else 'uncategorized',
                    'sentence': file_entries[ file_index ][ 'tokens' ][ token_index ][ 'sentence' ],
                    'rating': generate_score( weight, sentence_position, total_sentences, sentence_length, sentence_length_mean, sentence_length_std ),
                    'sentence_length': sentence_length
                }

                file_entries[ file_index ][ "tokens" ][ token_index ][ "score" ] = score
                

    return file_entries


def get_tokens_minimum_ratings( scored_tokens, number_of_sentences = 5 ) :
    
    token_ratings = []
    
    for entry_index, scored_token in enumerate( scored_tokens ) :
        
        ratings = []
        
        for sentence_index, sentence in enumerate( scored_token[ "tokens" ] ) :
            
            if sentence.has_key( "score" ) :
            
                ratings.append( sentence[ "score" ][ "rating" ] )
                
            else :
                
                ratings.append( 0.0 )
            
        total_ratings = len( ratings )
        
        index = ( number_of_sentences if number_of_sentences < total_ratings else total_ratings ) - 1
        
        token_ratings.append( sorted( ratings, reverse = True )[ index ] )
        
    return token_ratings


def create_summaries( entries, number_of_sentences ) :
    
    summaries = []
    
    # Declare the output directory
    #output_dir = "output"
    
    # Clear all existing outputs
    #delete_dir_contents( output_dir )
    
    # Get minimum ratings
    token_minimum_ratings = get_tokens_minimum_ratings( entries, number_of_sentences )
    
    # Iterate through the tokens
    for entry_index, entry in enumerate( entries ) :
        
        #if entry_index < 10 :

            # Declare the summary sentences list
            #sentences = [ entry[ "tokens" ][ 0 ][ "score" ][ "sentence" ] ] if len( entry[ "tokens" ] ) > 0 else [] 
            sentences = []

            # Iterate through the tokens
            for token_index, token in enumerate( entry[ "tokens" ] ) :

                # Only use sentences that score above the minimum token
                
                if token.has_key( "score" ) :
                    
                    if token[ "score" ][ "rating" ] >= token_minimum_ratings[ entry_index ] :

                        #Add the sentence to the sentences
                        sentences.append( token[ "score" ][ "sentence"] )

            # Generate a new filename based on a numerical sequence
            #new_filename = generate_next_filename( output_dir, ".txt" )

            # Write the new filename
            # write_data_to_file( new_filename, " ".join( sentences ) )

            summaries.append( " ".join( sentences ) )
            
    return summaries

#### Create the raw files

In [11]:
# Generate the raw files
generate_raw_files( "./source", "./target" )

#### Generate tokens

In [13]:
# Cases tokens
tokenized_cases = tokenize_files_in_path( "./target/cases" )

In [14]:
# Summaries tokens
tokenized_summaries = tokenize_files_in_path( "./target/summaries" )

#### Create the word2vec models

In [16]:
# Generate cases word2vec model
cases_word2vec_model = create_word2vec_model( tokenized_cases )

In [17]:
# Generate summaries word2vec model
summaries_word2vec_model = create_word2vec_model( tokenized_summaries )

In [18]:
# Generate general (cases + summaries ) word2vec model
general_word2vec_model = create_word2vec_model( tokenized_cases + tokenized_summaries )

#### Test word2vec models (check similarities)

In [21]:
# Get synonyms based on the cases model
print get_synonyms( cases_word2vec_model, "murder", 0.8 )

['manslaughter', 'kidnapping', 'convicted', 'audiotapes', 'defilement']


In [20]:
# Get synonyms based on the summaries model
print get_synonyms( summaries_word2vec_model, "murder", 0.8 )

['robbery', 'counts', 'violence', 'implying', 'convicted']


In [22]:
# Get synonyms based on the general model
print get_synonyms( general_word2vec_model, "murder", 0.8 )

['manslaughter', 'incarcerated', 'kidnapping']


#### Score tokens

In [46]:
# Score the cases
scored_case_tokens = score_tokenized_sentences( tokenized_cases, general_word2vec_model )

In [47]:
# Score the summaries
scored_summary_tokens = score_tokenized_sentences( tokenized_summaries, general_word2vec_model )

#### Created the summaries

In [48]:
case_summaries = create_summaries( scored_case_tokens, number_of_sentences = 15 )

In [49]:
summary_summaries = create_summaries( scored_summary_tokens, number_of_sentences = 15 )

In [50]:
case_summaries[ 0 : 2 ]

["Soon after that appointment,  he  went  for  a  course  which resulted in another appointment in April 1981 as a  clerk/machine  operator. He  worked  in  the above Division from August 1996 up-to December 1999 when he was  transferred to the Real Estate Department of  the  same  bank. So he reported on duty on Tuesday  where  he  learnt from the key holders that the money he had given them had  got  stolen  from the safe. He also suffered embarrassment at the hand of his  neighbours who were present during the house search because he was  believed  to  be  a honest person. After that he continued working in Real Estates until 12th  June  2000 when he received a letter terminating his  services  (exhibit  P10). The  defendant then suspected the plaintiff and the two key holders  in  the  loss  of  the said money. He stated that he was  involved  in  the  termination  exercise since he was the Chairman Staff Commendation and Disciplinary Committee. The position of the law in  regard  

### Evaluate summaries

In [52]:
summaries = load_files_in_path( './target/summaries' )

In [53]:
model = gensim.models.Doc2Vec( vector_size = 250, window = 2, min_count = 5, workers = 11, alpha = 0.025, min_alpha = 0.025 )

In [54]:
tagged_data = [ TaggedDocument( words = word_tokenize( _d.lower() ), tags=[ str( i ) ] ) for i, _d in enumerate( case_summaries[0:250])]

In [56]:
max_epochs = 100
vec_size = 20
alpha = 0.025

model = Doc2Vec(vector_size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)
  
model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

model.save("models/cases.model")
print("Model Saved")

Model Saved


In [57]:
test_data = word_tokenize(summaries[ 2 ].lower())
v1 = model.infer_vector(test_data)

In [58]:
model.docvecs.most_similar([v1])

[('96', 0.7527245879173279),
 ('2', 0.7477607727050781),
 ('53', 0.6964675188064575),
 ('28', 0.6769248247146606),
 ('60', 0.6654722094535828),
 ('62', 0.6597374677658081),
 ('201', 0.6585685610771179),
 ('46', 0.6467065215110779),
 ('70', 0.6435794830322266),
 ('104', 0.6376673579216003)]