In [1]:
import os
import json

def get_data(folder_name):
    x = []
    y = []
    positions = []
    file_names = []

    for file in os.listdir(folder_name):
        if file.endswith(".txt"):
            file_name = os.path.join(folder_name, file[:-4])

            file_text = open(file_name + '.txt', encoding='utf8')
            try:
                file_truth = open(file_name + '.truth', encoding='utf8')

                try:
                    text = file_text.read()
                    truth = json.load(file_truth)
                    truth_changes = truth['changes']
                    truth_positions = truth['positions']

                    x.append(text)
                    y.append(truth_changes)
                    positions.append(truth_positions)
                    file_names.append(file[:-4])
                finally:
                    file_truth.close()
            finally:
                file_text.close()

    return x, y, positions, file_names

In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize
def custom_sent_tokenize(text):
    sentences = []
    paragraphs = [p for p in text.split('\n') if p]
    for paragraph in paragraphs:
        sentences.extend(sent_tokenize(paragraph))
        
    return sentences

In [3]:
import pandas as pd

TRAINING_DIR = '../data/training'

X, y, positions, file_names = get_data(
    TRAINING_DIR)
df = pd.DataFrame(data={'text': X, 'label': y, 'filename': file_names, 'pos':positions})
df['num_sent'] = [len(custom_sent_tokenize(d)) for d in df.text]
df['num_splits'] = [len(x) for x in df.pos]

In [4]:
from nltk.corpus import stopwords
def only_stop_words(text):
    result = ''
    words = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    for w in words:
        if w in stop_words:
            result=result+' '+ w
        
    return result

In [5]:
def segment_sliding_tokenize(text, n=5):
    segments = []
    sentences = custom_sent_tokenize(text)
    x = len(sentences)
    n = min(n, x)
    for i in range(0, x-n+1):
        segments.append(''.join(sentences[i:i+n]))
    
    return segments

In [6]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
import numpy as np
from sklearn.decomposition import TruncatedSVD
from mpl_toolkits.mplot3d import Axes3D
def stop_word_vectors(row, vect):
    scaler = StandardScaler(with_mean=False)

    segments = segment_sliding_tokenize(row.text)
    train_x = vect.transform([only_stop_words(s) for s in segments])
    train_x = scaler.fit_transform(train_x).toarray()
    #train_x = TruncatedSVD(n_components=3).fit_transform(train_x)    
#     fig = plt.figure(i)  
#     ax = Axes3D(fig)
#     ax.scatter(train_x[:, 0], train_x[:, 1], train_x[:, 2])
    n_components = np.arange(1, 4)
    models = [GaussianMixture(n, covariance_type='full', random_state=0).fit(train_x)
             for n in n_components]
    bics = [m.bic(train_x) for m in models]
    best = bics.index(min(bics))
    title = 'Clusters: ' + str(row.num_splits + 1) + ' Predicted: ' + str(best + 1)
    #plt.title(title)
    #print(title)
    correct = row.num_splits == best
    correct_adj = (row.num_splits == 0 and best == 0) or (row.num_splits > 0 and best > 0)
    
    return correct, correct_adj
    

In [7]:
vect = TfidfVectorizer(max_features=50)
vect.fit([only_stop_words(d) for d in df.text])
    
correct_sum = 0
correct_adj_sum = 0
n=len(df)
for i in range(n):
    correct, correct_adj = stop_word_vectors(df.iloc[i], vect)
    if correct:
        correct_sum=correct_sum+1
    if correct_adj:
        correct_adj_sum=correct_adj_sum+1

print('Accuracy: ', correct_sum/n)
print('Adj Accuracy: ', correct_adj_sum/n)

Accuracy:  0.4006711409395973
Adj Accuracy:  0.6214765100671141
