### TF-IDF model

In [13]:
!pip install nltk



In [1]:
# load all necessary libraries
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

pd.set_option('max_colwidth', 100)

In [7]:
import nltk 
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/ganesh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
stopwords.words("english")

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - '/Users/ganesh/nltk_data'
    - '/Users/ganesh/opt/anaconda3/nltk_data'
    - '/Users/ganesh/opt/anaconda3/share/nltk_data'
    - '/Users/ganesh/opt/anaconda3/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


#### Let's build a basic bag of words model on three sample documents

In [8]:
documents = ["Gangs of Wasseypur is a great movie. Wasseypur is a town in Bihar.", "The success of a song depends on the music.", "There is a new movie releasing this week. The movie is fun to watch."]
print(documents)

['Gangs of Wasseypur is a great movie. Wasseypur is a town in Bihar.', 'The success of a song depends on the music.', 'There is a new movie releasing this week. The movie is fun to watch.']


In [9]:
documents = ["Vapour, Bangalore has a really great terrace seating and an awesome view of the Bangalore skyline",
             "The beer at Vapour, Bangalore was amazing. My favorites are the wheat beer and the ale beer.",
             "Vapour, Bangalore has the best view in Bangalore."]
print(documents)

['Vapour, Bangalore has a really great terrace seating and an awesome view of the Bangalore skyline', 'The beer at Vapour, Bangalore was amazing. My favorites are the wheat beer and the ale beer.', 'Vapour, Bangalore has the best view in Bangalore.']


In [10]:
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

# add stemming and lemmatisation in the preprocess function
def preprocess(document):
    'changes document to lower case and removes stopwords'

    # change sentence to lower case
    document = document.lower()

    # tokenize into words
    words = word_tokenize(document)

    # remove stop words
    words = [word for word in words if word not in stopwords.words("english")]
    
    # stem
    #words = [stemmer.stem(word) for word in words]
    
    # join words to make sentence
    document = " ".join(words)
    
    return document

In [5]:
documents = [preprocess(document) for document in documents]
print(documents)

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - '/Users/ganesh/nltk_data'
    - '/Users/ganesh/opt/anaconda3/nltk_data'
    - '/Users/ganesh/opt/anaconda3/share/nltk_data'
    - '/Users/ganesh/opt/anaconda3/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
# import nltk 
# nltk.download('punkt')

#### Creating bag of words model using count vectorizer function

In [None]:
vectorizer = TfidfVectorizer()
tfidf_model = vectorizer.fit_transform(documents)
print(tfidf_model)  # returns the row number and column number of cells which have 1 as value

In [None]:
# print the full sparse matrix
print(tfidf_model.toarray())

In [None]:
pd.DataFrame(tfidf_model.toarray(), columns = vectorizer.get_feature_names())

### Let's create a tf-idf model on the spam dataset.

In [None]:
# load data
spam = pd.read_csv("SMSSpamCollection.txt", sep = "\t", names=["label", "message"])
spam.head()

##### Let's take a subset of data (first 50 rows only) and create bag of word model on that.

In [None]:
spam = spam.iloc[0:50,:]
print(spam)

In [None]:
# extract the messages from the dataframe
messages = [message for message in spam.message]
print(messages)

In [None]:
# preprocess messages using the preprocess function
messages = [preprocess(message) for message in messages]
print(messages)

In [None]:
# bag of words model
vectorizer = TfidfVectorizer()
tfidf_model = vectorizer.fit_transform(messages)

In [None]:
# Let's look at the dataframe
tfidf = pd.DataFrame(tfidf_model.toarray(), columns = vectorizer.get_feature_names())
tfidf

In [None]:
# token names
print(vectorizer.get_feature_names())