## Stemming 
Takes any word as input and gives root word as output. The real meaning might be lost.

In [88]:
paragraph = "This is first line number. This is second line number. I am very happy to be here finally."

In [2]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 900 kB/s eta 0:00:01
[?25hCollecting joblib
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
[K     |████████████████████████████████| 297 kB 1.1 MB/s eta 0:00:01
Collecting regex>=2021.8.3
  Downloading regex-2022.10.31-cp37-cp37m-macosx_10_9_x86_64.whl (294 kB)
[K     |████████████████████████████████| 294 kB 793 kB/s eta 0:00:01
Installing collected packages: regex, joblib, nltk
Successfully installed joblib-1.2.0 nltk-3.8.1 regex-2022.10.31
You should consider upgrading via the '/Users/hpillai/.pyenv/versions/3.7.7/bin/python -m pip install --upgrade pip' command.[0m


In [16]:
import nltk

In [47]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/hpillai/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/hpillai/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hpillai/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [90]:
# sent_tokenize : convert paragraph(corpus) to sentence
sentences = nltk.sent_tokenize(paragraph)
sentences

['This is first line number.',
 'This is second line number.',
 'I am very happy to be here finally.']

In [91]:
# Tokenization : convert sentence to words
nltk.word_tokenize(paragraph)

['This',
 'is',
 'first',
 'line',
 'number',
 '.',
 'This',
 'is',
 'second',
 'line',
 'number',
 '.',
 'I',
 'am',
 'very',
 'happy',
 'to',
 'be',
 'here',
 'finally',
 '.']

In [68]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
stemmer.stem("finally") # gives root word

'final'

In [69]:
stemmer.stem("history") 

'histori'

In [70]:
stemmer.stem("Going") 

'go'

### Disadvantage of stemming
The meaning of the word changes. 
Example: 
1) "going" changed to "go" \
2) "finally" changed to "final"

## Lemmatization
It looksup into dictionary and returns meaningful word if present

#### WordNet Lemmatizer

Lemmatize using WordNet’s built-in morphy function. Returns the input word unchanged if it cannot be found in WordNet.

In [71]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize("dogs")

'dog'

In [72]:
lemmatizer.lemmatize("dog") # returns word unchanged as it cannot be found in the WordNet

'dog'

In [73]:
corpus = []
def stem_tokens():
    for i in range(len(sentences)):
        word_token = nltk.word_tokenize(sentences[i])
        words =[stemmer.stem(word) for word in word_token]
        print(words)
        corpus.append(" ".join(words))
stem_tokens()

['thi', 'is', 'first', 'line', 'number', '.']
['thi', 'is', 'second', 'line', 'number', '.']
['i', 'am', 'veri', 'happi', 'to', 'be', 'here', 'final', '.']


In [74]:
print(corpus)

['thi is first line number .', 'thi is second line number .', 'i am veri happi to be here final .']


#### applying lemmatizer in above sentence


In [75]:
corpus = []
def lemm_tokens():
    for i in range(len(sentences)):
        word_token = nltk.word_tokenize(sentences[i])
        words =[lemmatizer.lemmatize(word) for word in word_token]
        print(words)
        corpus.append(" ".join(words))
lemm_tokens()        

['This', 'is', 'first', 'line', 'number', '.']
['This', 'is', 'second', 'line', 'number', '.']
['I', 'am', 'very', 'happy', 'to', 'be', 'here', 'finally', '.']


In [76]:
print(corpus)

['This is first line number .', 'This is second line number .', 'I am very happy to be here finally .']


## StopWords 

In [123]:
from nltk.corpus import stopwords
stop_words = stopwords.words("English")
stop_words.remove(negative) for negative in ['not','no','don',"don't",'']
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'only', 'own', 'same', 'so', 'than', 'too', '

#### Example 1 : Stemming on paragraph


In [97]:
second_paragraph = """
Mohandas Karamchand Gandhi[41] was born on 2 October 1869[42] into a Gujarati Hindu Modh Bania family[43][44] in Porbandar (also known as Sudamapuri), a coastal town on the Kathiawar Peninsula and then part of the small princely state of Porbandar in the Kathiawar Agency of the British Raj. His father, Karamchand Uttamchand Gandhi (1822–1885), served as the dewan (chief minister) of Porbandar state.[45][5] His family originated from the then village of Kutiana in what was then Junagadh State.[46]
"""

In [98]:
second_paragraph

'\nMohandas Karamchand Gandhi[41] was born on 2 October 1869[42] into a Gujarati Hindu Modh Bania family[43][44] in Porbandar (also known as Sudamapuri), a coastal town on the Kathiawar Peninsula and then part of the small princely state of Porbandar in the Kathiawar Agency of the British Raj. His father, Karamchand Uttamchand Gandhi (1822–1885), served as the dewan (chief minister) of Porbandar state.[45][5] His family originated from the then village of Kutiana in what was then Junagadh State.[46]\n'

In [105]:
second_sentence = nltk.sent_tokenize(second_paragraph)
second_sentence # A list of sentences

['\nMohandas Karamchand Gandhi[41] was born on 2 October 1869[42] into a Gujarati Hindu Modh Bania family[43][44] in Porbandar (also known as Sudamapuri), a coastal town on the Kathiawar Peninsula and then part of the small princely state of Porbandar in the Kathiawar Agency of the British Raj.',
 'His father, Karamchand Uttamchand Gandhi (1822–1885), served as the dewan (chief minister) of Porbandar state.',
 '[45][5] His family originated from the then village of Kutiana in what was then Junagadh State.',
 '[46]']

In [115]:
second_corpus = []
for i in range(len(second_sentence)):
    words = nltk.word_tokenize(second_sentence[i])
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    second_corpus.append(" ".join(words))

In [116]:
second_corpus

['mohanda karamchand gandhi [ 41 ] born 2 octob 1869 [ 42 ] gujarati hindu modh bania famili [ 43 ] [ 44 ] porbandar ( also known sudamapuri ) , coastal town kathiawar peninsula part small princ state porbandar kathiawar agenc british raj .',
 'hi father , karamchand uttamchand gandhi ( 1822–1885 ) , serv dewan ( chief minist ) porbandar state .',
 '[ 45 ] [ 5 ] hi famili origin villag kutiana junagadh state .',
 '[ 46 ]']

In [None]:
for i in range(len(second_sentence)):
    words = nltk.word_tokenize(second_sentence[i])
    #### Example 1 : Stemming on paragraph
words = [word for word in words if word not in stop_words]
    second_corpus.append(" ".join(words))

#### Example 2 : Lemmatize on paragraph


In [120]:
third_corpus = []
for i in range(len(second_sentence)):
    words = nltk.word_tokenize(second_sentence[i].lower())
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    third_corpus.append(" ".join(words))

In [121]:
third_corpus

['mohandas karamchand gandhi [ 41 ] born 2 october 1869 [ 42 ] gujarati hindu modh bania family [ 43 ] [ 44 ] porbandar ( also known sudamapuri ) , coastal town kathiawar peninsula part small princely state porbandar kathiawar agency british raj .',
 'father , karamchand uttamchand gandhi ( 1822–1885 ) , served dewan ( chief minister ) porbandar state .',
 '[ 45 ] [ 5 ] family originated village kutiana junagadh state .',
 '[ 46 ]']

In [134]:
!pip uninstall pytube3
Y

Found existing installation: pytube3 9.6.4
Uninstalling pytube3-9.6.4:
  Would remove:
    /Users/hpillai/.pyenv/versions/3.7.7/bin/pytube3
    /Users/hpillai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytube/*
    /Users/hpillai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytube3-9.6.4.dist-info/*
  Would not remove (might be manually added):
    /Users/hpillai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytube/contrib/channel.py
    /Users/hpillai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytube/contrib/search.py
    /Users/hpillai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytube/innertube.py
    /Users/hpillai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytube/metadata.py
    /Users/hpillai/.pyenv/versions/3.7.7/lib/python3.7/site-packages/pytube/parser.py
Proceed (Y/n)? ^C
[31mERROR: Operation cancelled by user[0m


NameError: name 'Y' is not defined

In [None]:
Y

In [129]:
from pytube import Vimeo

# Create a Vimeo object with the video URL
url = 'https://i.vimeocdn.com/video/1474656009-dd9c2bc88fb69742434c68956c3aa2f4aedbf902f6b36981ba5c97271aad63ce-d?mw=1600&mh=1000&q=70'
vimeo = Vimeo(url)

# Get the video stream
stream = vimeo.streams.get_by_itag(18)

# Download the video file
filename = 'video.mp4'
stream.download(filename)

ModuleNotFoundError: No module named 'pytube3'