### Legal Text Summarization

#### Import libraries/packages

In [260]:
import os
import shutil
import textract
import numpy as np

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import corpus, pos_tag 
import gensim 
import enchant
import re
import math

import logging
import pprint

#### Configure pretty print

In [261]:
pp = pprint.PrettyPrinter( indent = 4, width = 150 )

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

#### Define english dictionary

In [262]:
dictionary = enchant.Dict("en_US")

#### Define all required methods

In [379]:
def get_files_in_path( path ) :
    
    for root, dirs, files in os.walk( path ) :
        
        for filename in files :
            
            yield root + "/" + filename
            
            
def load_text_from_file( filename ) :
    
    # Load the data
    contents = textract.process( filename, encoding='ascii' )
    
    # Clean the data
    contents = contents
    #contents = re.sub( '\[\d+\]', '', contents.strip() )
    #contents = re.sub( '\d+\.', '', contents.strip() )
    #contents = contents.replace("\n", " .")

    return contents


def delete_dir_contents( path ) :
    
    shutil.rmtree( path )
    
    os.makedirs( path )
    
    
def number_of_files_in_path( path ) :
    
    return len( next( os.walk( path ) )[ 2 ] )


def get_file_extension( filename ) :
    
    return os.path.splitext( filename )[ 1 ]


def get_file_basename( filename ) :
    
    return os.path.basename( filename )


def is_supported_file_type( filename ) :
    
    # A list of all supported extensions
    valid_extensions = [ ".doc", ".docx", ".pdf", ".rtf" ]
    
    return valid_extensions.count( get_file_extension( filename ) ) == 1


def generate_next_filename( path, extension ) :
    
    return path + "/" + str( number_of_files_in_path( path ) + 1 ) + extension


def write_data_to_file( filename, data ) :
    
    # Open the file for writing
    file = open( filename, 'w' )
    
    # Write the data to the file
    file.write( data )
    
    # Close the file
    file.close()
    
    
def generate_raw_files( source_dir, target_dir ) :
    
    # Clear all existing raw cases
    delete_dir_contents( target_dir + "/cases" )

    # Clear all existing raw summaries
    delete_dir_contents( target_dir + "/summaries" )
    
    for filename in get_files_in_path( source_dir ) :
    
        # Get the file extension
        extension = get_file_extension( filename )

        if is_supported_file_type( filename ) :

            # Load the file contents
            contents = load_text_from_file( filename )

            # Generate a new filename based on a numerical sequence
            new_filename = generate_next_filename( target_dir + "/" + ( "summaries" if "1.0" in filename else "cases" ), ".txt" )

            # Write the new filename
            write_data_to_file( new_filename, contents )
    
    
def tokenize_file_contents( filename ) :
    
    # Load the stop words
    stop_words = corpus.stopwords.words('english')
    
    # Load the file contents
    contents = load_text_from_file( filename )
    
    # Define the filename tokens
    content_tokens = []
    
    # Tokenize the sentences from the file contents
    sentence_tokens = sent_tokenize( contents )
    
    for sentence in sentence_tokens :
        
        # Tokenize the words from the sentence
        word_tokens = word_tokenize( sentence )
        
        # Remove the punctuation and convert the words to lowercase
        words = [ word.lower() for word in word_tokens if word.isalpha() and dictionary.check( word ) ]
        
        # Remove the stop words
        words = [ word for word in words if not word in stop_words ]
        
        # Append the sentence and tokens to the list
        content_tokens.append( { 'sentence': sentence, 'words': words } )
        
    return content_tokens


def tokenize_files_in_path( path ) :
    
    # Create a list of tokens
    tokens = []
    
    for filename in get_files_in_path( path ) :
        
        # Append a dictionary to the tokens
        tokens.append( { "filename": get_file_basename( filename ), "tokens": tokenize_file_contents( filename ) } )
        
    return tokens


def create_word2vec_model( entries ) :
    
    # Create documents list
    documents = []
    
    for entry in entries :
        
        # Create a file token
        file_tokens = []
        
        for sentence in entry[ "tokens" ] :
            
            # Add the word tokens in the document
            file_tokens =  file_tokens + sentence[ "words" ]
            
        # Add the file tokens to the document
        documents.append( file_tokens )
    
    model = gensim.models.Word2Vec ( documents, size = 50, window = 5, min_count = 1, workers = 100 )
    
    model.train( documents, total_examples = len( documents ), epochs = 10 )
    
    return model


def get_synonyms( model, word, min_similarity = 0.85 ) :
    
    # Define the similar words
    synonyms = []
    
    # Get related word2vec words
    suggested_words = model.wv.most_similar( positive = word, topn = 5 )
    
    # Limit words to a given similarity
    for word in suggested_words :
        
        if word[ 1 ] > min_similarity :
        
            # Add word to 
            synonyms.append( word[ 0 ] )
            
    return synonyms


def generate_score( weight, sentence_position, total_sentences ) :
    
    new_value = float( sentence_position ) / total_sentences
    
    return math.sin( math.radians( math.radians( new_value * 90 ) ) ) * weight


def score_tokenized_sentences( file_entries ):
    
    # Iterate through the files
    for file_index, file_entry in enumerate( file_entries ) :
        
        # Define the cleaned sentences
        cleaned_sentences = []
        
        # Iterate through the tokens
        for token_index, token in enumerate( file_entry['tokens' ] ):
            
            # Add the words to the cleaned sentences
            cleaned_sentences.append( " ".join( str( word ) for word in token['words'] ) )
            
        # Create a word vectorizer
        vectorizer = CountVectorizer(analyzer = "word", tokenizer = None, preprocessor = None, stop_words = None, max_features = 5000) 

        # Train the vectorizer
        train_data_features = vectorizer.fit_transform( cleaned_sentences )
    
        for token_index, cleaned_sentence in enumerate( cleaned_sentences ) :
            
            # Create the bag of words
            bag_of_words = vectorizer.transform( [ cleaned_sentence ] ).toarray()[ 0 ]
            
            # Define the sentence score
            sentence_score = 0
            
            # Iterate through the bag of words
            for score_index, score in enumerate( bag_of_words ) :
                
                # Increment the sentence score by the score
                sentence_score = sentence_score + score
                
            # Get the sentence tags
            pos_tags = pos_tag( file_entries[ file_index ][ 'tokens' ][ token_index ][ 'words' ] )
            
            # Define the tags
            tags = []
            
            for tag in enumerate( pos_tags ) :
                
                tags.append( tag[ 1 ][ 1 ] )
                
            weight = float( sentence_score ) / len( bag_of_words )
            total_sentences = len( cleaned_sentences )
            sentence_position = token_index + 1
                
            # Create the store dictionary
            score = { 
                'weight': weight, 
                'total_sentences': total_sentences, 
                'sentence_position': sentence_position, 
                'tense': 'past' if 'VBD' in tags else 'present' if 'VBG' in tags else 'uncategorized',
                'sentence': file_entries[ file_index ][ 'tokens' ][ token_index ][ 'sentence' ],
                'rating': generate_score( weight, sentence_position, total_sentences )
            }
        
            file_entries[ file_index ][ "tokens" ][ token_index ][ "score" ] = score
            
    return file_entries


def get_tokens_minimum_ratings( scored_tokens, number_of_sentences = 5 ) :
    
    token_ratings = []
    
    for entry_index, scored_token in enumerate( scored_tokens ) :
        
        ratings = []
        
        for sentence_index, sentence in enumerate( scored_token[ "tokens" ] ) :
            
            ratings.append( sentence[ "score" ][ "rating" ] )
            
        total_ratings = len( ratings )
        
        index = ( number_of_sentences if number_of_sentences < total_ratings else total_ratings ) - 1
        
        token_ratings.append( sorted( ratings, reverse = True )[ index ] )
        
    return token_ratings


def create_summaries( entries, number_of_sentences ) :
    
    # Clear all existing outputs
    delete_dir_contents( "output" )
    
    # Get minimum ratings
    token_minimum_ratings = get_tokens_minimum_ratings( entries, number_of_sentences )
    
    # Iterate through the tokens
    for entry_index, entry in enumerate( entries ) :
        
        # Declare the summary sentences list
        #sentences = [ entry[ "tokens" ][ 0 ][ "score" ][ "sentence" ] ] if len( entry[ "tokens" ] ) > 0 else [] 
        sentences = []
        
        for token_index, token in enumerate( entry[ "tokens" ] ) :
    
            if token[ "score" ][ "rating" ] >= token_minimum_ratings[ token_index ] :
            
                sentences.append( token[ "score" ][ "sentence"].replace( "\n\n", "\n" ) )
            
        if entry_index < 50 :

            print " ".join( sentences )
            
            print "\n\n\n\n==================================\n"

#### Create the raw files

In [264]:
# Generate the raw files
generate_raw_files( "./source", "./target" )

#### Generate tokens

In [265]:
# Cases tokens
tokenized_cases = tokenize_files_in_path( "./target/cases" )

In [266]:
# Summaries tokens
tokenized_summaries = tokenize_files_in_path( "./target/summaries" )

#### Create the word2vec models

In [267]:
# Generate cases word2vec model
cases_word2vec_model = create_word2vec_model( tokenized_cases )

In [268]:
# Generate summaries word2vec model
summaries_word2vec_model = create_word2vec_model( tokenized_summaries )

In [269]:
# Generate general (cases + summaries ) word2vec model
general_word2vec_model = create_word2vec_model( tokenized_cases + tokenized_summaries )

#### Test word2vec models (check similarities)

In [270]:
# Get synonyms based on the cases model
print get_synonyms( cases_word2vec_model, "murder", 0.8 )

['manslaughter', 'kidnapping', 'indicted', 'convicted', 'robbery']


In [271]:
# Get synonyms based on the summaries model
print get_synonyms( summaries_word2vec_model, "murder", 0.8 )

['robbery', 'count', 'sentenced', 'violence', 'convicted']


In [272]:
# Get synonyms based on the general model
print get_synonyms( general_word2vec_model, "murder", 0.8 )

['kidnapping', 'manslaughter', 'rape']


#### Score tokens

In [273]:
# Score the cases
scored_case_tokens = score_tokenized_sentences( tokenized_cases )

In [274]:
# Score the summaries
scored_summary_tokens = score_tokenized_sentences( tokenized_summaries )

#### Write the finished data

In [382]:
create_summaries( scored_case_tokens, 5 )







Before dismissing the objection, the Court cited that section and concluded its ruling with the following words:
In the instant case and on its own facts, we hold that the petitioners ought to have perceived of the breach of the Constitution allegedly posed by the Political Parties and Organisations Act on 17 July 2002. We are unable to find any fault with this conclusion and we find no sound foundation upon which Mr Matsiko contended that the Court erred when it held that the petitioners ought to have perceived the breach on 17 July 2002. Therefore, the Constitutional Court was absolutely right in holding that the petition, which by virtue of rule 4(1), was expected to be filed within 30 days from 18 July 2002 was filed within time because it was lodged in Court on 31 July 2002 that is to say 14 days after the 30 days begun running. We will, however, discuss ground 3 which is framed this way:
The Learned Judges of the Constitutional Court erred in law and in fact in holding that

IndexError: list index out of range