In [25]:
import spacy                                            
from spacy.lang.en import English
import nltk                                   
from nltk.stem.snowball import SnowballStemmer

In [27]:
nlp = spacy.load("en_core_web_sm")

# TASK 1

##### Printing no. of  stop-words consists in spaCy and from it displaying first 15 of them...

In [9]:
spaCy_stopwords = spacy.lang.en.stop_words.STOP_WORDS 

print('First fifteen stop words of spaCy :\n %s' % list(spaCy_stopwords)[:15]) 

print('Number of stop words in spaCy : %d' % len(spaCy_stopwords))

First fifteen stop words of spaCy :
 ['she', 'hereafter', 'front', 'and', 'everyone', 'with', 'three', 'yet', 'done', 'other', 'elsewhere', 'your', 'twenty', 'into', 'either']
Number of stop words in spaCy : 326


# TASK 2

##### Stemming:

In [16]:
stemmer=SnowballStemmer(language="english")
token=["cries", "lied", "this", "computing", "organizing", "matches"]

for tokens in token:
    print(tokens+" ----- "+stemmer.stem(tokens))

cries ----- cri
lied ----- lie
this ----- this
computing ----- comput
organizing ----- organ
matches ----- match


##### Lemmatization:

In [28]:
token=nlp("cries lied this computing organizing matches")

for word in token:                                                      
    print(word.text,"-----",word.lemma_)      

cries ----- cry
lied ----- lie
this ----- this
computing ----- compute
organizing ----- organizing
matches ----- match


##### Comparing Stemming vs Lemmatization:

In [34]:
print("BY STEMMING:")
print("\n")
stemmer=SnowballStemmer(language="english")
token=["cries", "lied", "this", "computing", "organizing", "matches"]

for tokens in token:
    print(tokens+" ----- "+stemmer.stem(tokens))

print("\n")

print("BY LEMMATIZATION:")
print("\n")
token=nlp("cries lied this computing organizing matches")

for word in token:                                                      
    print(word.text,"-----",word.lemma_)

BY STEMMING:


cries ----- cri
lied ----- lie
this ----- this
computing ----- comput
organizing ----- organ
matches ----- match


BY LEMMATIZATION:


cries ----- cry
lied ----- lie
this ----- this
computing ----- compute
organizing ----- organizing
matches ----- match


##### We can observe that the Lemmatization is more precise in giving the output for the words in their root form compared to Stemming...!

# TASK 3

In [52]:
file = open("scifiscripts_intro.txt")
contents=file.read()
text=str(contents)
doc=nlp(text)

In [53]:
from spacy.lang.en.stop_words import STOP_WORDS
stop_word_file=[]

for i in doc:
    if i.is_stop==False:
        stop_word_file.append(i)
print("\n \t Whole file after removing stop-words is : \n\n ",stop_word_file)


 	 Whole file after removing stop-words is : 

  [Note, poster, Kubrick, newsgroup, :, 

, found, bbs, ago, thought, pass, 
, Kubrick, freaks, ., 

, 02/23/89, 
, Transcriber, note, :, 

, Clarke, /, Kubrick/2001, fans, ,, 

, found, original, paper, copy, screenplay, felt, 
, compelled, transcribe, disk, upload, bulletin, 
, boards, enjoyment, ., 

, final, movie, deviates, screenplay, number, interesting, 
, ways, ., tried, maintain, format, original, document, 
, number, lines, page, original, ., order, reduce, 
, length, file, bar, ", ------, ", delimit, pages, 
, lot, whitespace, original, screenplay, page, .]


In [56]:
text=(" My mom taught me to finish everything on my plate at dinner.\
The only problem with a pencil, is that they do not stay sharp long enough.\
Our school building is made of bricks.\
Every night I get woken up by the sound of a barking dog across the street.\
Salad is for rabbits. ")

In [58]:
str_file=str(text)
document=nlp(str_file)

stop_word_file2=[]

for i in document:
    if i.is_stop==False:
        stop_word_file2.append(i)

print("\n \t Whole file after removing stop-words is : \n\n ",stop_word_file2)


 	 Whole file after removing stop-words is : 

  [ , mom, taught, finish, plate, dinner, ., problem, pencil, ,, stay, sharp, long, ., school, building, bricks, ., night, woken, sound, barking, dog, street, ., Salad, rabbits, .]


# TASK 4

In [36]:
sentence=nlp("Bryan visited his friend for a while and then went home at 10 pm.")

for word in sentence:
    print(word.text,'\t',word.pos_,'\t',word.tag_)

Bryan 	 PROPN 	 NNP
visited 	 VERB 	 VBD
his 	 PRON 	 PRP$
friend 	 NOUN 	 NN
for 	 ADP 	 IN
a 	 DET 	 DT
while 	 NOUN 	 NN
and 	 CCONJ 	 CC
then 	 ADV 	 RB
went 	 VERB 	 VBD
home 	 ADV 	 RB
at 	 ADP 	 IN
10 	 NUM 	 CD
pm 	 NOUN 	 NN
. 	 PUNCT 	 .


# TASK 5

In [38]:
file = open('Random.txt')
content = file.read()                                                  
text = str(content)                                               
doc = nlp(text) 

for word in doc:                                            
    print(word.text,'\t',word.pos_,'\t',word.tag_)   

PADUA 	 PROPN 	 NNP
HIGH 	 PROPN 	 NNP
SCHOOL 	 PROPN 	 NNP
- 	 PUNCT 	 HYPH
DAY 	 PROPN 	 NNP

 	 SPACE 	 _SP
Revision 	 PROPN 	 NNP
November 	 PROPN 	 NNP
12 	 NUM 	 CD
, 	 PUNCT 	 ,
1997 	 NUM 	 CD

 	 SPACE 	 _SP


* Proper Noun:  
>PADUA  
HIGH  
SCHOOL  
DAY  
REVISION  
November    

* Numericals:
>12  
1997

# TASK 6

##### Adding 5 new stop-words...

In [40]:
nlp.Defaults.stop_words |= {"STOPWORD1","STEPWORD2","STEPWORD3","STEPWORD4","STEPWORD5"}
print(nlp.Defaults.stop_words)                             

{'she', 'hereafter', 'front', 'and', 'everyone', 'with', 'three', 'yet', 'done', 'other', 'elsewhere', 'your', 'twenty', 'into', 'either', 'did', 'from', 'anything', 'its', 'became', 'now', 'whole', 'then', 'can', 'whereafter', 'put', 'some', 'regarding', 'him', "'s", 'latterly', 'though', 'almost', 'her', 'on', 'STOPWORD1', 'no', 'n’t', 'everything', 'whereby', 'ten', 'be', 'me', 'noone', 'i', 'of', 'whom', 'others', 'nevertheless', 'themselves', 'namely', 'really', 'many', 'afterwards', 'beforehand', 'how', 'keep', 'part', 'an', 'even', 'too', 'towards', 'therefore', 'could', 'you', 'although', 'whence', 'may', 'less', 'again', 'after', 'STEPWORD5', 'if', 're', 'someone', 'already', 'alone', 'the', 'make', 'below', 'but', 'ever', 'nor', "'re", 'yours', 'must', 'without', 'whether', 'wherever', 'becomes', 'bottom', 'upon', 'mostly', 'formerly', 'among', 'seeming', 'hundred', 'whereupon', 'my', 'there', 'go', 'used', 'serious', 'show', 'anyway', 'last', 'is', 'herein', 'amongst', 'wher

##### Removing 4 stop-words...

In [41]:
nlp.Defaults.stop_words -= {"always","never","between","becomes"}
print(nlp.Defaults.stop_words)

{'she', 'hereafter', 'front', 'and', 'everyone', 'with', 'three', 'yet', 'done', 'other', 'elsewhere', 'your', 'twenty', 'into', 'either', 'did', 'from', 'anything', 'its', 'became', 'now', 'whole', 'then', 'can', 'whereafter', 'put', 'some', 'regarding', 'him', "'s", 'latterly', 'though', 'almost', 'her', 'on', 'STOPWORD1', 'no', 'n’t', 'everything', 'whereby', 'ten', 'be', 'me', 'noone', 'i', 'of', 'whom', 'others', 'nevertheless', 'themselves', 'namely', 'really', 'many', 'afterwards', 'beforehand', 'how', 'keep', 'part', 'an', 'even', 'too', 'towards', 'therefore', 'could', 'you', 'although', 'whence', 'may', 'less', 'again', 'after', 'STEPWORD5', 'if', 're', 'someone', 'already', 'alone', 'the', 'make', 'below', 'but', 'ever', 'nor', "'re", 'yours', 'must', 'without', 'whether', 'wherever', 'bottom', 'upon', 'mostly', 'formerly', 'among', 'seeming', 'hundred', 'whereupon', 'my', 'there', 'go', 'used', 'serious', 'show', 'anyway', 'last', 'is', 'herein', 'amongst', 'wherein', 'same

# TASK 7

##### Tokenization:

In [47]:
file = open("Raw_data_for_analysis.txt")
contents = file.read()                                    
contents = contents[:]

print(contents)

text_combined = str(contents)
doc = nlp(text_combined) 

for token in doc:
    print(token)
len(token)

PADUA HIGH SCHOOL - DAY
Revision November 12, 1997
I hope dinner's ready because I only have ten minutes before Mrs. Johnson squirts out a screamer.
He grabs the mail and rifles through it, as he bends down to kiss Sharon on the cheek.
MICHAEL- C'mon. I'm supposed to give you the tour. They head out of the office
MICHAEL (continuing)- So -- which Dakota you from?
          
                                 

PADUA
HIGH
SCHOOL
-
DAY


Revision
November
12
,
1997


I
hope
dinner
's
ready
because
I
only
have
ten
minutes
before
Mrs.
Johnson
squirts
out
a
screamer
.


He
grabs
the
mail
and
rifles
through
it
,
as
he
bends
down
to
kiss
Sharon
on
the
cheek
.


MICHAEL-
C'm
on
.
I
'm
supposed
to
give
you
the
tour
.
They
head
out
of
the
office


MICHAEL
(
continuing)-
So
--
which
Dakota
you
from
?

          
                                 



46

##### Removing stop-words:

In [46]:
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
from spacy.lang.en.stop_words import STOP_WORDS

In [48]:
filtered_file=[]                                          
doc = nlp(text_combined)

for word in doc:                                  
    if word.is_stop==False:                
        filtered_file.append(word)                   
print("\n \nFiltered Sentence:",filtered_file)         


 
Filtered Sentence: [PADUA, HIGH, SCHOOL, -, DAY, 
, Revision, November, 12, ,, 1997, 
, hope, dinner, ready, minutes, Mrs., Johnson, squirts, screamer, ., 
, grabs, mail, rifles, ,, bends, kiss, Sharon, cheek, ., 
, MICHAEL-, C'm, ., supposed, tour, ., head, office, 
, MICHAEL, (, continuing)-, --, Dakota, ?, 
          
                                 
]


##### Lemmatization:

In [50]:
for word in filtered_file:                
    print(word.text,"-----",word.lemma_)

PADUA ----- PADUA
HIGH ----- HIGH
SCHOOL ----- SCHOOL
- ----- -
DAY ----- DAY

 ----- 

Revision ----- Revision
November ----- November
12 ----- 12
, ----- ,
1997 ----- 1997

 ----- 

hope ----- hope
dinner ----- dinner
ready ----- ready
minutes ----- minute
Mrs. ----- Mrs.
Johnson ----- Johnson
squirts ----- squirt
screamer ----- screamer
. ----- .

 ----- 

grabs ----- grab
mail ----- mail
rifles ----- rifle
, ----- ,
bends ----- bend
kiss ----- kiss
Sharon ----- Sharon
cheek ----- cheek
. ----- .

 ----- 

MICHAEL- ----- michael-
C'm ----- come
. ----- .
supposed ----- suppose
tour ----- tour
. ----- .
head ----- head
office ----- office

 ----- 

MICHAEL ----- MICHAEL
( ----- (
continuing)- ----- continuing)-
-- ----- --
Dakota ----- Dakota
? ----- ?

          
                                 
 ----- 
          
                                 



##### Parts OF Speech (POS) Tagging:

In [51]:
file = open("Raw_data_for_analysis.txt")

contents = file.read()                                 
text = str(contents)                                              
doc = nlp(text)

for word in doc:          
    print(word.text,'\t',word.pos_,'\t',word.tag_)

PADUA 	 PROPN 	 NNP
HIGH 	 PROPN 	 NNP
SCHOOL 	 PROPN 	 NNP
- 	 PUNCT 	 HYPH
DAY 	 PROPN 	 NNP

 	 SPACE 	 _SP
Revision 	 PROPN 	 NNP
November 	 PROPN 	 NNP
12 	 NUM 	 CD
, 	 PUNCT 	 ,
1997 	 NUM 	 CD

 	 SPACE 	 _SP
I 	 PRON 	 PRP
hope 	 VERB 	 VBP
dinner 	 NOUN 	 NN
's 	 PART 	 POS
ready 	 ADJ 	 JJ
because 	 SCONJ 	 IN
I 	 PRON 	 PRP
only 	 ADV 	 RB
have 	 VERB 	 VBP
ten 	 NUM 	 CD
minutes 	 NOUN 	 NNS
before 	 SCONJ 	 IN
Mrs. 	 PROPN 	 NNP
Johnson 	 PROPN 	 NNP
squirts 	 VERB 	 VBZ
out 	 ADP 	 RP
a 	 DET 	 DT
screamer 	 NOUN 	 NN
. 	 PUNCT 	 .

 	 SPACE 	 _SP
He 	 PRON 	 PRP
grabs 	 VERB 	 VBZ
the 	 DET 	 DT
mail 	 NOUN 	 NN
and 	 CCONJ 	 CC
rifles 	 NOUN 	 NNS
through 	 ADP 	 IN
it 	 PRON 	 PRP
, 	 PUNCT 	 ,
as 	 SCONJ 	 IN
he 	 PRON 	 PRP
bends 	 VERB 	 VBZ
down 	 ADP 	 RP
to 	 PART 	 TO
kiss 	 VERB 	 VB
Sharon 	 PROPN 	 NNP
on 	 ADP 	 IN
the 	 DET 	 DT
cheek 	 NOUN 	 NN
. 	 PUNCT 	 .

 	 SPACE 	 _SP
MICHAEL- 	 X 	 ADD
C'm 	 VERB 	 VBP
on 	 ADP 	 RP
. 	 PUNCT 	 .
I 	 PRON 	 PRP
'm