In [16]:
import os

import pandas as pd
import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English # Import the English language class
import nltk
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import reuters

nlp = spacy.load('en_core_web_sm')
#nlp = English()
#nlp = spacy.laod('en')

path = 'data/'
arr = os.listdir(path)
for filename in arr:
    if filename.endswith('.txt'):
        xpath = os.path.join(path,filename)
        with open(xpath, 'r') as file:
            contents = file.read()
            doc = nlp(contents)
            entities = [(ent.label_, ent.text, filename) for ent in doc.ents]
            df = pd.DataFrame(entities, columns =['type', 'text', 'filename']) 

In [17]:
tokenizer = PunktSentenceTokenizer()
path = 'data/'
arr = os.listdir(path)
data = []
for filename in arr:
    if filename.endswith('.txt'):
        xpath = os.path.join(path,filename)
        with open(xpath, 'r') as file:
            contents = file.read()
            data.extend(tokenizer.tokenize(contents))
            

In [18]:
df = pd.DataFrame(data, columns =['text']) 
df.to_excel('data/punktsentences.xlsx')
df.to_csv('data/punktsenteces.csv',index=False)

In [19]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

In [20]:
data = pd.read_csv('data/markedsentences.csv', )

data.head()

Unnamed: 0,relevant,type,sentence
0,0,0,monson.
1,0,0,"maine history 1822 - 1972 history of monson, m..."
2,0,0,this history could not have been compiled with...
3,0,0,the main sources of information for this book ...
4,0,0,may this history bring pleasure to the readers...


In [21]:
data.shape

(1350, 3)

In [22]:
data.relevant.value_counts()

0    1320
1      30
Name: relevant, dtype: int64

In [31]:
# Count vectorizer, we get the occurence count of words.
# Count does not account for word importance.
# To fix this use the tfidf algorithm, which will downscale the score for the words that appear often, and 
#    therefore will give more importance to the words that have significance but occur in small portions.
# The description column will be used with the tfidf vectorizer

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn_pandas import DataFrameMapper, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

#categories = data['type']
categories = data['relevant']
desc = data['sentence']

vectorizer = TfidfVectorizer()
features = vectorizer.fit_transform(desc)
 
# features.shape

x, x_test, y, y_test = train_test_split(features,categories,test_size=0.2,train_size=0.8, random_state = 0)

clf = MultinomialNB().fit(x, y)
predicted = clf.predict(x_test)
 
def printreport(exp, pred):
    print(pd.crosstab(exp, pred, rownames=['Actual'], colnames=['Predicted']))
 
    print('\n \n')
    print(classification_report(exp, pred))
 
printreport(y_test, predicted)

Predicted    0
Actual        
0          265
1            5

 

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       265
           1       0.00      0.00      0.00         5

    accuracy                           0.98       270
   macro avg       0.49      0.50      0.50       270
weighted avg       0.96      0.98      0.97       270



  _warn_prf(average, modifier, msg_start, len(result))


# Run against unseen document

In [52]:
from sklearn.base import TransformerMixin
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction import DictVectorizer

In [64]:
test = pd.read_csv('data/test.csv', )
test['Relevant'] = 0
test

Unnamed: 0,Sentence,Relevant
0,"history of veazie, maine by jean hamilton veaz...",0
1,"it was published in 1853 by s.s. smith, 17 wes...",0
2,"veazie, maine samuel veazie 1787-1868 1 this b...",0
3,2 preface i have thoroughly enjoyed doing the ...,0
4,my will appreciate my effort... thanks to the ...,0
...,...,...
1823,it was published in 1853 by s.s.,0
1824,"smith, 17 west market square, bangor.",0
1825,"henry f. walling, 81 washington st.",0
1826,"boston, mass.",0


In [65]:
data

Unnamed: 0,relevant,type,sentence
0,0,0,monson.
1,0,0,"maine history 1822 - 1972 history of monson, m..."
2,0,0,this history could not have been compiled with...
3,0,0,the main sources of information for this book ...
4,0,0,may this history bring pleasure to the readers...
...,...,...,...
1345,0,0,roebuck and co.
1346,0,0,gene johnson 42 main street authorized catalog...
1347,0,0,"15, monson, maine telephone 997-3327 history o..."
1348,0,0,chase 8: klmball oil company large enough to p...


In [76]:
punctuations = string.punctuation# Create our list of punctuation marks
nlp = spacy.load('en') # Create list of stopwords
stop_words = spacy.lang.en.stop_words.STOP_WORDS
parser = English()# Load English tokenizer, tagger, parser, NER and word vectors

def spacy_tokenizer(sentence):# Creating tokenizer function
    
    mytokens = parser(sentence)# Creating token object, used to create documents with linguistic annotations.
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ] # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ] # Removing stop words
    
    return mytokens# return preprocessed list of tokens

# Custom transformer using spaCy
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        # Cleaning Text
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}

# Basic function to clean the text
def clean_text(text):
    # Removing spaces and converting text into lowercase
    return text.strip().lower()



In [79]:
# model generation
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()

# Create pipeline using Bag of Words
pipe = Pipeline([('cleaner', predictors()),
                 ('vectorizer', bow_vector),
                 ('classifier', classifier)])

# model generation
pipe.fit(data['sentence'],data['relevant'])

pipe.predict(test['Sentence'])

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [80]:
test.sort_values('Relevant')

Unnamed: 0,Sentence,Relevant
0,"history of veazie, maine by jean hamilton veaz...",0
1225,the grocer railroad station.,0
1224,he used to run a horse and fannie drove her ne...,0
1223,in the early 1900s rufus dwelley lived in the ...,0
1222,gentleman in town had occasion to tell me this...,0
...,...,...
603,late but pro- ceeded ahead instead of pulling ...,0
602,westbound mcrr train was 11/2 hrs.,0
601,39 train wreck (april 1895) train wreck (april...,0
599,"1955, maine central railroad bought the stock ...",0


In [81]:
from sklearn import metrics
from sklearn.metrics import accuracy_score
# Predicting with a test dataset

test['Relevant'] = pipe.predict_proba(test['Sentence'])[:,1]

# build a filter for only what I want to see

Resolve = test['Relevant'] 


# Model Accuracy
#print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
#print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
#print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))

In [82]:
test.sort_values('Relevant')

Unnamed: 0,Sentence,Relevant
1642,"*frank c. turner cushman 1928 - f. c. turner, ...",0.000003
24,and possess all the property which at the date...,0.000030
1562,"committee, april 6, no new school buy fire tru...",0.000041
1643,"1916 - r. p. hathorn, bert l. king, laurence s...",0.000156
1547,"sewers march 31, rescind town hall, buy grange...",0.000217
...,...,...
735,"fourth row l to r: unknown, pauline smith, alb...",0.606009
1326,"in 1910 brother, fred, was born in 1872. his m...",0.616878
1323,"the 88 william jordan, our eldest living male ...",0.783078
376,"ralph shorey, francina gamble, alice shorey, r...",0.798783


In [23]:
test.loc[1075,'Sentence']

'1891 - ingerson doane 1923 - charles e. goode, edward bulles, charles 1892 - ingerson doane inman 1893 - fred sproul 1924 - charles e. goode 1894 - roderick p. hathorn 1925 - william n. jordan 1895 - fred sproul 1926 - william jordan 1896 - lewis c. inman 1927 - william n. jordan 1897 - calvin inman 1928 - william n. jordan 1898 - lewis c. inman 1929 to 1942 - harry e. smith 1899 - thomas inman (chief), fred sproul, william 1943 - william n. jordan swears, r.p.'

In [21]:
test.loc[376,'Sentence']

'ralph shorey, francina gamble, alice shorey, ruth shorey, carolyn gamble 2. ivan son of harold wentworth 3. james gamble with a few trout 4. ralph shorey with a bear 5. norma and janet, daughters of charles hersey 6. eugene, son of james gamble 7. james, son of harold wentworth 8. bernice stevens, granddaughter of james parks in the late 1940s bangor hydro electric company edward graham dedicated the new plant, graham station.'

In [24]:
test.loc[1323,'Sentence']

'the 88 william jordan, our eldest living male resident, was veazie and married grace may turner, daughter of mary born in north brewer, at the bend, on april 22, 1886.'

In [25]:
test.loc[1326,'Sentence']

'in 1910 brother, fred, was born in 1872. his mother, ellen they again moved to peabody, massachusetts and stayed (johnston) jordan, born in north brewer, met and married five years.'

In [26]:
test.loc[735,'Sentence']

'fourth row l to r: unknown, pauline smith, albert obrien, richard jones, anna kennett, john ellis first row (near blackboard) front to back: leon vanaken, wilbur braley, lucy spurting, irving wentworth, francina gamble second row front to back, buddy rogers, raymond mckay, dorothy hollis, mary spencer, unknown, evelyn ames third row front to back: irene shorey, freeland jones, mary chapman, eleanor spencer, lois miles, lyndon sidelinker, vivian lambert fourth row front to back: eugene gamble, madeline prouty, frank jordan, elliot sidelinker, alice shorey 55 john r. graham school (1935 or 1936) melvin leighton, teacher first row (near blackboard) front to back: douglas doane, george bell, richard )ones, pauline smith, ella spencer, albert obrien second row front to back: eleanor spencer, norman bateman, wilbur braley jr., jean mack, anna kennett, thelma bateman, evelyn spencer third row front to back: audrey bell, unknown, helen chapman, betty shorey, unknown, irving wentworth, francin

In [118]:
Resolve

0       0.003524
1       0.047910
2       0.009701
3       0.010646
4       0.008948
          ...   
1823    0.011052
1824    0.015838
1825    0.036445
1826    0.010114
1827    0.011266
Name: Relevant, Length: 1828, dtype: float64

# ----- STOP LINE-------