
## Assignment 7

1. Extract a sample document and apply the following document preprocessing methods: tokenization, POS tagging, stop-word removal, stemming and lemmatization.
2. Create a representation of the document by calculating Term Frequency and Inverse Document Frequency.

In [None]:
!pip install nltk

In [None]:
import nltk
# Data packages needed by the tokenizers, POS tagger, stop-word list and
# WordNet lemmatizer used in this notebook. 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' are the resource names looked up by newer
# NLTK releases (3.9+); the original names are kept for older installs.
for resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'wordnet',
    'omw-1.4',
):
    nltk.download(resource)

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import euclidean_distances
from scipy.spatial import distance
import pandas as pd
import numpy as np

### Read data

In [None]:
# Load the sample document and normalise it to lower case in one step.
with open('./paragraph.txt') as paragraph_file:
    paragraph = paragraph_file.read().lower()

In [None]:
# Display the lower-cased text that the following steps operate on.
paragraph

### Tokenization
Tokenization is the first step when working with language tasks; it simplifies the input data by splitting it into sentences or words, as required.

In [None]:
# Sentence tokenization: split the lower-cased paragraph into sentences.
sentence_tokens = sent_tokenize(paragraph)

In [None]:
# Report how many sentences were found, then show them.
print('Number of sentence tokens :', len(sentence_tokens))
print('Sentence tokens :', sentence_tokens)

In [None]:
# Word tokenization: split the lower-cased paragraph into word-level tokens.
word_tokens = word_tokenize(paragraph)

In [None]:
# Report how many word tokens were found, then show them.
print('Number of word tokens :', len(word_tokens))
print('Word tokens :', word_tokens)

### POS Tagging and Stop words removal

In [None]:
# Collect NLTK's English stop words into a set (used for membership tests
# when filtering the tokens) and show them.
stop_words = set(stopwords.words('english'))
print('Stop words :', stop_words)

In [None]:
# Keep only the tokens that are not stop words. The paragraph was lower-cased
# when it was read, so the comparison is effectively case-insensitive.
word_tokens = [token for token in word_tokens if token not in stop_words]

In [None]:
# Show the tokens that remain after stop-word removal.
print('Filtered word tokens :', word_tokens)

In [None]:
'''
CC coordinating conjunction 
CD cardinal digit 
DT determiner 
EX existential there (like: “there is” … think of it like “there exists”) 
FW foreign word 
IN preposition/subordinating conjunction 
JJ adjective – ‘big’ 
JJR adjective, comparative – ‘bigger’ 
JJS adjective, superlative – ‘biggest’ 
LS list marker 1) 
MD modal – could, will 
NN noun, singular ‘- desk’ 
NNS noun plural – ‘desks’ 
NNP proper noun, singular – ‘Harrison’ 
NNPS proper noun, plural – ‘Americans’ 
PDT predeterminer – ‘all the kids’ 
POS possessive ending parent’s 
PRP personal pronoun –  I, he, she 
PRP$ possessive pronoun – my, his, hers 
RB adverb – very, silently, 
RBR adverb, comparative – better 
RBS adverb, superlative – best 
RP particle – give up 
TO – to go ‘to’ the store. 
UH interjection – errrrrrrrm 
VB verb, base form – take 
VBD verb, past tense – took 
VBG verb, gerund/present participle – taking 
VBN verb, past participle – taken 
VBP verb, sing. present, non-3rd person – take 
VBZ verb, 3rd person sing. present – takes 
WDT wh-determiner – which 
WP wh-pronoun – who, what 
WP$ possessive wh-pronoun, eg- whose 
WRB wh-adverb, eg- where, when
'''
# Tag every stop-word-filtered token with its part of speech; the result is
# a sequence of (token, tag) pairs using the tag abbreviations listed above.
tagged = nltk.pos_tag(word_tokens)

In [None]:
print('POS Tagged form of filtered word tokens :')
# One (token, tag) pair per output line.
for token_tag_pair in tagged:
    print(token_tag_pair)

### Stemming

In [None]:
# Porter stemmer instance used by the stemming step below.
ps = PorterStemmer()

In [None]:
print('Results of Stemming')
# Map each filtered token to its Porter stem; repeated tokens share one key.
stemmed = {token: ps.stem(token) for token in word_tokens}
for token, stem in stemmed.items():
    print('{0} --> {1}'.format(token, stem))

### Lemmatization

In [None]:
# WordNet lemmatizer instance used by the lemmatization step below.
lemmatizer = WordNetLemmatizer()

In [None]:
print('Results of Lemmatization')
# WordNetLemmatizer.lemmatize() treats every word as a noun unless a POS is
# given, which leaves verbs, adjectives and adverbs unchanged. Reuse the POS
# tags computed earlier (`tagged`) and translate the first letter of each tag
# to the matching WordNet POS code; anything unmapped falls back to noun.
penn_to_wordnet = {'J': 'a', 'V': 'v', 'R': 'r'}
lemmatized = {
    word: lemmatizer.lemmatize(word, pos=penn_to_wordnet.get(tag[:1], 'n'))
    for word, tag in tagged
}
for word, lemma in lemmatized.items():
    print('{0} --> {1}'.format(word, lemma))

### Term-Frequency and Inverse Document Frequency

In [None]:
def arr_convert_1d(arr):
    """Flatten a (possibly nested) sequence of arrays into one 1-D NumPy array.

    The previous implementation called ``np.concatenate`` twice, which only
    worked for input that stacks to exactly three dimensions and raised on an
    empty list. Reshaping to ``-1`` yields the same row-major result for that
    case and also handles empty or differently nested input.

    Parameters
    ----------
    arr : sequence of array-like
        Equally shaped arrays, e.g. the per-call result matrices collected by
        the distance/similarity helpers.

    Returns
    -------
    numpy.ndarray
        A 1-D array with every element of ``arr`` in row-major order; an
        empty input produces an empty array.
    """
    return np.array(arr).reshape(-1)

In [None]:
# Cosine-similarity results accumulated across calls; read later by convert().
cos = []


def cosine(trans):
    """Append the cosine similarity of ``trans[0]`` and ``trans[1]`` to ``cos``."""
    first_doc, second_doc = trans[0], trans[1]
    similarity = cosine_similarity(first_doc, second_doc)
    cos.append(similarity)

In [None]:
# Manhattan-distance results accumulated across calls; read later by convert().
manhatten = []


def manhatten_distance(trans):
    """Append the Manhattan distance between ``trans[0]`` and ``trans[1]`` to ``manhatten``."""
    first_doc, second_doc = trans[0], trans[1]
    distance_matrix = pairwise_distances(first_doc, second_doc, metric='manhattan')
    manhatten.append(distance_matrix)

In [None]:
# Euclidean-distance results accumulated across calls; read later by convert().
euclidean = []


def euclidean_function(vectors):
    """Append the Euclidean distance between ``vectors[0]`` and ``vectors[1]`` to ``euclidean``."""
    euclidean.append(euclidean_distances(vectors[0], vectors[1]))

In [None]:
def tfidf(str1, str2):
    """Compare two strings in TF-IDF space and return the accumulated metrics.

    A ``TfidfVectorizer`` is fitted on the module-level ``word_tokens`` list,
    so the vocabulary and IDF weights come from those tokens rather than from
    ``str1``/``str2``. Both strings are then transformed with that vectorizer
    and the result is passed, in turn, to the Euclidean, cosine and Manhattan
    helpers, each of which appends to its own module-level list.

    Parameters
    ----------
    str1, str2 : str
        The two texts to compare.

    Returns
    -------
    The value returned by ``convert()``, which is built from the module-level
    result lists and therefore also contains results of earlier calls.
    """
    vectorizer = TfidfVectorizer().fit(word_tokens)
    doc_vectors = vectorizer.transform([str1, str2])
    for record_metric in (euclidean_function, cosine, manhatten_distance):
        record_metric(doc_vectors)
    return convert()

In [None]:
def convert():
    """Build a DataFrame from the accumulated distance/similarity results.

    Each of the module-level lists ``manhatten``, ``cos`` and ``euclidean`` is
    flattened with ``arr_convert_1d`` and becomes one column.

    Returns
    -------
    pandas.DataFrame
        Columns ``'manhattan'``, ``'cos_sim'`` and ``'euclidean'``, in that order.
    """
    return pd.DataFrame({
        'manhattan': arr_convert_1d(manhatten),
        'cos_sim': arr_convert_1d(cos),
        'euclidean': arr_convert_1d(euclidean),
    })

In [None]:
# Two sample strings to compare in TF-IDF space.
str1, str2 = 'rsfhcui', 'ukjiorgd'
newData = tfidf(str1, str2)
print(newData)