## Objective
This notebook practices TF-IDF concepts and works toward a function that takes in a collection of documents and returns a TF-IDF DataFrame.

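## Term frequency

Term frequency, $\mathrm{tf}(t, d)$, is the number of times term $t$ occurs in document $d$. In this notebook each sentence of the sample text is treated as one document.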

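## Inverse Document Frequency

Inverse document frequency down-weights terms that occur in many documents. With $n$ documents and $\mathrm{df}(t)$ the number of documents containing term $t$, scikit-learn's default (smoothed) form is $\mathrm{idf}(t) = \ln\frac{1 + n}{1 + \mathrm{df}(t)} + 1$. The TF-IDF weight is $\mathrm{tf}(t, d) \cdot \mathrm{idf}(t)$, and `TfidfVectorizer` L2-normalises each document's vector by default.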

## References

In [137]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.feature_extraction import stop_words
import spacy
import pandas as pd
from string import punctuation

In [2]:
# Sample text used throughout the notebook: a short paragraph about accessory dwelling units (ADU / DADU).
s = "An accessory dwelling unit or detached accessory dwelling unit (sometimes called a mother-in-law apartment) is a separate living space within a house or on the same property as an existing house. These units aren’t legal unless they have been established through a permit process. A legally permitted unit in the home is called an accessory dwelling unit (ADU). A legally permitted unit on the property (but not within the home) is called a backyard cottage or detached accessory dwelling unit (DADU). The property owner must live in either the house or the attached or detached accessory dwelling unit. Tiny houses, with foundations, are considered DADUs."

In [3]:
s

'An accessory dwelling unit or detached accessory dwelling unit (sometimes called a mother-in-law apartment) is a separate living space within a house or on the same property as an existing house. These units aren’t legal unless they have been established through a permit process. A legally permitted unit in the home is called an accessory dwelling unit (ADU). A legally permitted unit on the property (but not within the home) is called a backyard cottage or detached accessory dwelling unit (DADU). The property owner must live in either the house or the attached or detached accessory dwelling unit. Tiny houses, with foundations, are considered DADUs.'

In [138]:
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

### Use spaCy to split the input into a list of sentences

In [4]:
# Load spaCy's English pipeline through the 'en' shortcut name.
# NOTE(review): shortcut names such as 'en' are not accepted by spaCy v3+, where the
# full package name (e.g. 'en_core_web_sm') is required — confirm the installed version.
model = spacy.load('en')

In [5]:
# Run the pipeline over the sample text; the resulting `doc` is later iterated sentence by sentence via `doc.sents`.
doc = model(s)

### Use scikit-learn to calculate the tfidf values

In [65]:
# Copy scikit-learn's built-in English stop-word collection into a plain set so extra entries can be added.
# NOTE(review): the `sklearn.feature_extraction.stop_words` module was deprecated and later removed;
# newer releases expose ENGLISH_STOP_WORDS from `sklearn.feature_extraction.text` — confirm the pinned version.
stopwords = set(stop_words.ENGLISH_STOP_WORDS)

In [139]:
# `punctuation` is a string, so update() adds every punctuation character as its own one-character stop word.
stopwords.update(punctuation)

In [140]:
stopwords

{'!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amoungst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'bill',
 'both',
 'bottom',
 'but',
 'by',
 'call',
 'can',
 'cannot',
 'cant',
 'co',
 'con',
 'could',
 'couldnt',
 'cry',
 'de',
 'describe',
 'detail',
 'do',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eg',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'e

### Use the spaCy tokenizer

In [43]:
# Reuse the 'en' pipeline already loaded into `model` earlier in the notebook
# instead of loading the same model from disk a second time.
nlp = model


In [118]:
def spacy_tok(text):
    """Tokenize ``text`` with the notebook-level spaCy pipeline ``nlp``.

    Returns a list containing the ``lemma_`` of every token, in document
    order. No tokens are filtered out here.
    """
    lemmas = []
    for token in nlp(text):
        lemmas.append(token.lemma_)
    return lemmas

In [121]:
# Sanity check: the tokenizer must hand plain strings to TfidfVectorizer.
# Uses `s` because `s_list` is only created in a later cell, so referencing it
# here fails with NameError on a fresh Restart & Run All.
type(spacy_tok(s)[0])

str

In [122]:
# `stop_words` is documented as 'english', a list, or None, so hand the set over
# as a (sorted) list; newer scikit-learn releases validate the type and reject a set.
# Tokenization is delegated to spacy_tok, so the vocabulary consists of spaCy lemmas.
vectorizor = TfidfVectorizer(stop_words=sorted(stopwords), tokenizer=spacy_tok)

In [123]:
# One string per sentence found by spaCy; each sentence serves as a separate "document" for TF-IDF.
s_list = [sent.text for sent in doc.sents]

In [124]:
s_list

['An accessory dwelling unit or detached accessory dwelling unit (sometimes called a mother-in-law apartment) is a separate living space within a house or on the same property as an existing house.',
 'These units aren’t legal unless they have been established through a permit process.',
 'A legally permitted unit in the home is called an accessory dwelling unit (ADU).',
 'A legally permitted unit on the property (but not within the home) is called a backyard cottage or detached accessory dwelling unit (DADU).',
 'The property owner must live in either the house or the attached or detached accessory dwelling unit.',
 'Tiny houses, with foundations, are considered DADUs.']

In [125]:
# Learn the vocabulary and idf weights from the sentences and compute their TF-IDF matrix
# (one row per sentence, one column per vocabulary term).
doc_matrix = vectorizor.fit_transform(s_list)

In [126]:
# Densify the result for inspection; todense() yields a numpy `matrix` object.
dense_doc = doc_matrix.todense()

In [127]:
dense_doc

matrix([[0.        , 0.3143086 , 0.        , 0.26489992, 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.1833935 , 0.36678701, 0.        , 0.        , 0.26489992,
         0.        , 0.        , 0.36678701, 0.26489992, 0.        ,
         0.        , 0.        , 0.26489992, 0.26489992, 0.        ,
         0.        , 0.        , 0.1833935 , 0.26489992, 0.26489992,
         0.        , 0.27143061, 0.        ],
        [0.41732741, 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.41732741, 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.41732741,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.28892095, 0.41732741, 0.        , 0.        , 0.        ,
         0.        , 0.21380799, 0.41732741],
        [0.        , 0.27349556, 0.46100521, 0.        , 0.        ,
         0.

In [128]:
# Vocabulary terms, in the same order as the columns of the TF-IDF matrix.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); keep this call only while an older release is pinned.
feat_names = vectorizor.get_feature_names()

In [129]:
# One column label per sentence: sent_0, sent_1, ...
col_names = ['sent_{}'.format(position) for position, _ in enumerate(s_list)]

In [130]:
# Transpose so that rows are terms and columns are sentences, then label both axes.
dd_df = pd.DataFrame(dense_doc.T, index=feat_names, columns=col_names)

In [131]:
dd_df

Unnamed: 0,sent_0,sent_1,sent_2,sent_3,sent_4,sent_5
-PRON-,0.0,0.417327,0.0,0.0,0.0,0.0
accessory,0.314309,0.0,0.273496,0.214299,0.241148,0.0
adu,0.0,0.0,0.461005,0.0,0.0,0.0
apartment,0.2649,0.0,0.0,0.0,0.0,0.0
attach,0.0,0.0,0.0,0.0,0.40648,0.0
backyard,0.0,0.0,0.0,0.361223,0.0,0.0
consider,0.0,0.0,0.0,0.0,0.0,0.472493
cottage,0.0,0.0,0.0,0.361223,0.0,0.0
dadu,0.0,0.0,0.0,0.361223,0.0,0.0
dadus,0.0,0.0,0.0,0.0,0.0,0.472493


In [132]:
# Print the highest-weighted term of each sentence.
# idxmax() returns the index label (the term itself); on current pandas,
# argmax() returns the integer position instead of the label.
for column in dd_df.columns:
    print(dd_df[column].idxmax())

dwell
-PRON-
unit
unit
attach
consider


In [136]:
# Terms that carry a non-zero TF-IDF weight in the first sentence.
dd_df.loc[dd_df['sent_0'].gt(0)].index

Index(['accessory', 'apartment', 'detached', 'dwell', 'exist', 'house', 'law',
       'living', 'mother', 'property', 'separate', 'space', 'unit'],
      dtype='object')

An accessory dwelling unit or detached accessory dwelling unit (sometimes called a mother-in-law apartment) is a separate living space within a house or on the same property as an existing house.

In [134]:
# Total TF-IDF weight of each term, summed across all sentence columns.
sum_words = dd_df.sum(axis=1)

In [135]:
# Rank the terms from highest to lowest total weight.
sum_words.sort_values(ascending=False)

unit          1.535988
accessory     1.043251
house         0.975311
dwell         0.936026
permit        0.858160
property      0.714883
detached      0.714883
home          0.674239
legally       0.674239
tiny          0.472493
consider      0.472493
dadus         0.472493
foundation    0.472493
adu           0.461005
establish     0.417327
unless        0.417327
-PRON-        0.417327
process       0.417327
legal         0.417327
attach        0.406480
live          0.406480
dwelling      0.406480
owner         0.406480
cottage       0.361223
backyard      0.361223
dadu          0.361223
mother        0.264900
living        0.264900
exist         0.264900
apartment     0.264900
separate      0.264900
space         0.264900
law           0.264900
dtype: float64