
### Learn how to use NLTK

In [1]:
# load required libraries
import pandas as pd
import numpy as np
import nltk

In [2]:
# Lift pandas' column-width limit (None = no truncation) so long text
# values are displayed in full in the outputs below.
pd.set_option('display.max_colwidth', None)

In [3]:
# Load the labelled review data from the CSV file into a DataFrame
# and preview its first five rows.
data = pd.read_csv('data/imdb_labelled.csv')
data.head()

Unnamed: 0,text,label
0,"A very, very, very slow-moving, aimless movie about a distressed, drifting young man.",0
1,"Not sure who was more lost - the flat characters or the audience, nearly half of whom walked out.",0
2,"Attempting artiness with black & white and clever camera angles, the movie disappointed - became even more ridiculous - as the acting was poor and the plot and lines almost non-existent.",0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo is trying to find a song that keeps running through his head.,1


In [4]:
# Count how many rows carry each value of the `label` column (class balance).
data['label'].value_counts()

1    386
0    362
Name: label, dtype: int64

In [5]:
# Take the text stored under index label 0 as a running example;
# later cells tokenize and count this single string.
sample = data.text[0]
sample

'A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  '

### Tokens and Bigrams

In [6]:
# import package
from nltk import word_tokenize

# Split the sample text into tokens with NLTK's word_tokenize
# (punctuation marks come back as separate tokens) and show the first ten.
sample_tokens = word_tokenize(sample)
sample_tokens[:10]

['A', 'very', ',', 'very', ',', 'very', 'slow-moving', ',', 'aimless', 'movie']

### Creating a list of Bigrams

In [7]:
from nltk import bigrams

# Build the bigrams of the sample: every pair of adjacent tokens, in order.
# `bigrams` is wrapped in list() so the result can be sliced.
# NOTE(review): the variable name is misspelled ("bigragrams"); it is left
# unchanged here in case other cells refer to it.
sample_bigragrams = list(bigrams(sample_tokens))
sample_bigragrams[:10]

[('A', 'very'),
 ('very', ','),
 (',', 'very'),
 ('very', ','),
 (',', 'very'),
 ('very', 'slow-moving'),
 ('slow-moving', ','),
 (',', 'aimless'),
 ('aimless', 'movie'),
 ('movie', 'about')]

### Frequency Distribution

In [8]:
from nltk import FreqDist

# Count how often each token occurs in the sample and list the ten
# most frequent (token, count) pairs.
sample_fd = FreqDist(sample_tokens)
sample_fd.most_common(10)

[(',', 4),
 ('very', 3),
 ('A', 1),
 ('slow-moving', 1),
 ('aimless', 1),
 ('movie', 1),
 ('about', 1),
 ('a', 1),
 ('distressed', 1),
 ('drifting', 1)]

In [9]:
# create a function that accepts a text and n and returns the top n most common tokens
def text_n(text, n):
    """Return the ``n`` most frequent tokens of ``text`` with their counts.

    The text is split with ``word_tokenize`` and the resulting tokens are
    counted with ``FreqDist``; the value returned is whatever
    ``most_common(n)`` yields, i.e. a list of ``(token, count)`` pairs.
    """
    # count every token (unigram) the tokenizer produces
    token_counts = FreqDist(word_tokenize(text))
    # hand back the n highest-count entries
    return token_counts.most_common(n)

# Try it on the text stored under index label 1, asking for the
# ten most common tokens.
text_n(data.text[1], 10)

[('the', 2),
 ('Not', 1),
 ('sure', 1),
 ('who', 1),
 ('was', 1),
 ('more', 1),
 ('lost', 1),
 ('-', 1),
 ('flat', 1),
 ('characters', 1)]

### Document Term Matrix (DTM)

In [10]:
# a DTM represents the frequency of the terms that occur in a collection of documents
# create a dtm function
# import package
from sklearn.feature_extraction.text import CountVectorizer

def create_dtm(series):
    """Build a document-term matrix (DTM) for a collection of texts.

    Parameters
    ----------
    series : iterable of str
        The documents to vectorize, e.g. a pandas Series of raw text.

    Returns
    -------
    pandas.DataFrame
        One row per document and one column per vocabulary term learned by
        ``CountVectorizer``; each cell holds the count of that term in that
        document. The frame gets a fresh default index.
    """
    # learn the vocabulary and count the terms in a single pass
    cv = CountVectorizer()
    counts = cv.fit_transform(series)
    # toarray() yields a plain ndarray; todense() would return the legacy
    # numpy.matrix type, which NumPy discourages for new code
    return pd.DataFrame(counts.toarray(), columns=cv.get_feature_names_out())
# Try it on the first five texts only, which keeps the resulting matrix small.
create_dtm(data.text.head())

Unnamed: 0,about,acting,aimless,almost,and,angles,anything,artiness,as,attempting,...,trying,very,walked,was,when,white,who,whom,with,young
0,1,0,1,0,0,0,0,0,0,0,...,0,3,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,1,1,0,0
2,0,1,0,1,3,1,0,1,1,1,...,0,0,0,1,0,1,0,0,1,0
3,0,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,1,0,0,0,0,0


### Feature Importance

In [11]:
# define a function named top_n_tokens
# import logistic regression
from sklearn.linear_model import LogisticRegression

def top_n_tokens(text, sentiment, n):
    """Return the ``n`` tokens with the largest logistic-regression coefficients.

    A bag-of-words matrix is built from ``text`` with ``CountVectorizer`` and
    a ``LogisticRegression`` model is fitted against ``sentiment``. The result
    is a DataFrame with the columns ``Tokens`` and ``Coefficients`` holding
    the ``n`` rows with the highest coefficient.
    """
    vectorizer = CountVectorizer()
    # classifier configuration: lbfgs solver, up to 2500 iterations, fixed random_state
    model = LogisticRegression(solver='lbfgs', max_iter=2500, random_state=1234)
    # fit on the document-term counts
    model.fit(vectorizer.fit_transform(text), sentiment)

    # pair every vocabulary term with its coefficient (first row of coef_)
    token_weights = pd.DataFrame({
        "Tokens": vectorizer.get_feature_names_out(),
        "Coefficients": model.coef_[0],
    })
    return token_weights.nlargest(n, 'Coefficients')


# Run on the full text and label columns and show the ten tokens
# with the largest coefficients.
top_n_tokens(data.text, data.label, 10)

Unnamed: 0,Tokens,Coefficients
1567,liked,1.286747
2997,wonderful,1.242158
1104,funny,1.112821
1182,great,1.068772
2949,well,1.043139
246,beautiful,1.042833
0,10,1.035405
344,brilliant,1.01408
908,excellent,1.009914
2203,right,0.985806


In [12]:
# smallest coefficients
def top_n_tokens(text, sentiment, n, largest=False):
    """Return the ``n`` tokens with the most extreme logistic-regression coefficients.

    NOTE: this definition reuses the name of the earlier ``top_n_tokens``
    cell, so once this cell has run, the earlier "largest coefficients"
    version is replaced. The ``largest`` flag keeps both views available
    from this one function.

    Parameters
    ----------
    text : iterable of str
        Documents used to build the bag-of-words matrix.
    sentiment : array-like
        Target label for each document.
    n : int
        Number of tokens to return.
    largest : bool, default False
        ``False`` (the default, matching the previous behaviour of this cell)
        returns the ``n`` smallest coefficients; ``True`` returns the ``n``
        largest.

    Returns
    -------
    pandas.DataFrame
        Columns ``Tokens`` and ``Coefficients``, restricted to the selected
        ``n`` rows.
    """
    # create instances of the classifier and the vectorizer
    lrg = LogisticRegression(solver='lbfgs', max_iter=2500, random_state=1234)
    cv = CountVectorizer()
    # create the document-term matrix and fit the model on it
    dtm = cv.fit_transform(text)
    lrg.fit(dtm, sentiment)

    # pair every vocabulary term with its coefficient (first row of coef_)
    df = pd.DataFrame({
        "Tokens": cv.get_feature_names_out(),
        "Coefficients": lrg.coef_[0],
    })

    # return the n smallest coefficients unless the caller asks for the largest
    select = df.nlargest if largest else df.nsmallest
    return select(n, 'Coefficients')


# Run on the full text and label columns; with the definition in this cell
# the call shows the ten tokens with the smallest coefficients.
top_n_tokens(data.text, data.label, 10)

Unnamed: 0,Tokens,Coefficients
222,bad,-1.872751
211,awful,-1.334554
2530,stupid,-1.175416
441,cheap,-1.139512
1802,no,-1.137234
893,even,-1.091436
3017,would,-1.047931
3012,worst,-1.039231
2923,waste,-1.038206
1819,nothing,-0.973472


### Pre-trained ---- TextBlob

In [13]:
# define function called polarity_subjectivity
# import Textblob
from textblob import TextBlob

def polarity_subjectivity(text=sample, print_results=False):
    """Score ``text`` with TextBlob's pre-trained sentiment analyzer.

    Parameters
    ----------
    text : str
        Text to analyse. The default is the notebook-level ``sample`` string
        as it was when this cell was executed.
    print_results : bool, default False
        When True, print a one-line summary (values rounded to two decimals)
        and return ``None``. When False, return the scores instead.

    Returns
    -------
    tuple or None
        ``(polarity, subjectivity)`` taken from ``TextBlob(text).sentiment``
        when ``print_results`` is False, otherwise ``None``.
    """
    # create an instance of TextBlob; the same name must be used below
    # (it was previously assigned as `ttb` but read as `tb`, a NameError)
    tb = TextBlob(text)
    polarity, subjectivity = tb.sentiment[0], tb.sentiment[1]
    # if the condition is met print the results, otherwise return the tuple
    if print_results:
        print(f"Polarity is {round(polarity, 2)} and Subjectivity is {round(subjectivity, 2)}.")
        return None
    return polarity, subjectivity
    
    

SyntaxError: invalid syntax (2003511723.py, line 2)