# Initiation

In [1]:
import pandas as pd
import nltk
import re
import string

In [2]:
%%time
"""Loading Data"""
pd.set_option('display.max_colwidth', 100)
data = pd.read_csv('data/SST/SST-2/train.tsv', sep='\t')
test = pd.read_csv('data/SST/SST-2/test.tsv', sep='\t')
dev = pd.read_csv('data/SST/SST-2/dev.tsv', sep='\t')

Wall time: 57 ms


In [3]:
""" Exploring the dataset"""
data

Unnamed: 0,sentence,label
0,hide new secretions from the parental units,0
1,"contains no wit , only labored gags",0
2,that loves its characters and communicates something rather beautiful about human nature,1
3,remains utterly satisfied to remain the same throughout,0
4,on the worst revenge-of-the-nerds clichés the filmmakers could dredge up,0
...,...,...
67344,a delightful comedy,1
67345,"anguish , anger and frustration",0
67346,"at achieving the modest , crowd-pleasing goals it sets for itself",1
67347,a patient viewer,1


In [4]:
"""Exploring NLTK"""

'Exploring NLTK'

https://realpython.com/nltk-nlp-python/

# Text Preprocessing

In [5]:
"""Remove punctuations"""
import string
print(string.punctuation)
def remove_punc(text):
    text_nopunct = "".join([char for char in text if char not in string.punctuation])
    return text_nopunct

data['sentence_clean'] = data['sentence'].apply(lambda x: remove_punc(x))
data

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


Unnamed: 0,sentence,label,sentence_clean
0,hide new secretions from the parental units,0,hide new secretions from the parental units
1,"contains no wit , only labored gags",0,contains no wit only labored gags
2,that loves its characters and communicates something rather beautiful about human nature,1,that loves its characters and communicates something rather beautiful about human nature
3,remains utterly satisfied to remain the same throughout,0,remains utterly satisfied to remain the same throughout
4,on the worst revenge-of-the-nerds clichés the filmmakers could dredge up,0,on the worst revengeofthenerds clichés the filmmakers could dredge up
...,...,...,...
67344,a delightful comedy,1,a delightful comedy
67345,"anguish , anger and frustration",0,anguish anger and frustration
67346,"at achieving the modest , crowd-pleasing goals it sets for itself",1,at achieving the modest crowdpleasing goals it sets for itself
67347,a patient viewer,1,a patient viewer


In [6]:
"""Tokenization"""
# tokenized = []
# text = nltk.word_tokenize(text)
# text

def tokenize(text):
    return re.split('\W+', text)

data['sentence_tokenized'] = data['sentence_clean'].apply(lambda x: tokenize(x.lower()))
data

Unnamed: 0,sentence,label,sentence_clean,sentence_tokenized
0,hide new secretions from the parental units,0,hide new secretions from the parental units,"[hide, new, secretions, from, the, parental, units, ]"
1,"contains no wit , only labored gags",0,contains no wit only labored gags,"[contains, no, wit, only, labored, gags, ]"
2,that loves its characters and communicates something rather beautiful about human nature,1,that loves its characters and communicates something rather beautiful about human nature,"[that, loves, its, characters, and, communicates, something, rather, beautiful, about, human, na..."
3,remains utterly satisfied to remain the same throughout,0,remains utterly satisfied to remain the same throughout,"[remains, utterly, satisfied, to, remain, the, same, throughout, ]"
4,on the worst revenge-of-the-nerds clichés the filmmakers could dredge up,0,on the worst revengeofthenerds clichés the filmmakers could dredge up,"[on, the, worst, revengeofthenerds, clichés, the, filmmakers, could, dredge, up, ]"
...,...,...,...,...
67344,a delightful comedy,1,a delightful comedy,"[a, delightful, comedy, ]"
67345,"anguish , anger and frustration",0,anguish anger and frustration,"[anguish, anger, and, frustration, ]"
67346,"at achieving the modest , crowd-pleasing goals it sets for itself",1,at achieving the modest crowdpleasing goals it sets for itself,"[at, achieving, the, modest, crowdpleasing, goals, it, sets, for, itself, ]"
67347,a patient viewer,1,a patient viewer,"[a, patient, viewer, ]"


In [7]:
"""Remove stopwords"""
# import nltk
# nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
def remove_stopwords(tokenized_list):
    return [word for word in tokenized_list if word not in stopwords]

data['sentence_nonstop'] = data['sentence_tokenized'].apply(lambda x: remove_stopwords(x))
data

Unnamed: 0,sentence,label,sentence_clean,sentence_tokenized,sentence_nonstop
0,hide new secretions from the parental units,0,hide new secretions from the parental units,"[hide, new, secretions, from, the, parental, units, ]","[hide, new, secretions, parental, units, ]"
1,"contains no wit , only labored gags",0,contains no wit only labored gags,"[contains, no, wit, only, labored, gags, ]","[contains, wit, labored, gags, ]"
2,that loves its characters and communicates something rather beautiful about human nature,1,that loves its characters and communicates something rather beautiful about human nature,"[that, loves, its, characters, and, communicates, something, rather, beautiful, about, human, na...","[loves, characters, communicates, something, rather, beautiful, human, nature, ]"
3,remains utterly satisfied to remain the same throughout,0,remains utterly satisfied to remain the same throughout,"[remains, utterly, satisfied, to, remain, the, same, throughout, ]","[remains, utterly, satisfied, remain, throughout, ]"
4,on the worst revenge-of-the-nerds clichés the filmmakers could dredge up,0,on the worst revengeofthenerds clichés the filmmakers could dredge up,"[on, the, worst, revengeofthenerds, clichés, the, filmmakers, could, dredge, up, ]","[worst, revengeofthenerds, clichés, filmmakers, could, dredge, ]"
...,...,...,...,...,...
67344,a delightful comedy,1,a delightful comedy,"[a, delightful, comedy, ]","[delightful, comedy, ]"
67345,"anguish , anger and frustration",0,anguish anger and frustration,"[anguish, anger, and, frustration, ]","[anguish, anger, frustration, ]"
67346,"at achieving the modest , crowd-pleasing goals it sets for itself",1,at achieving the modest crowdpleasing goals it sets for itself,"[at, achieving, the, modest, crowdpleasing, goals, it, sets, for, itself, ]","[achieving, modest, crowdpleasing, goals, sets, ]"
67347,a patient viewer,1,a patient viewer,"[a, patient, viewer, ]","[patient, viewer, ]"


In [8]:
"""Stemming"""
# Process of reducing inflected (or sometimes derived) words to their word stem or root
# Crudely chopping off the end of the word to leave only the base
# e.g. Electricity/electrical --> Electr
#      Berries/Berry --> Berri
#      Connection/Connected/Connetive --> Connect
#      Meanness/Meaning --> Mean  (Not good!)
# computationally fast / not accurate - might not return words in dictionary

# nltk.download('wordnet')
sm = nltk.PorterStemmer()

def stem(tokenized_text):
    return [sm.stem(word) for word in tokenized_text]

data['sentence_stemmed'] = data['sentence_nonstop'].apply(lambda x: stem(x))
data

Unnamed: 0,sentence,label,sentence_clean,sentence_tokenized,sentence_nonstop,sentence_stemmed
0,hide new secretions from the parental units,0,hide new secretions from the parental units,"[hide, new, secretions, from, the, parental, units, ]","[hide, new, secretions, parental, units, ]","[hide, new, secret, parent, unit, ]"
1,"contains no wit , only labored gags",0,contains no wit only labored gags,"[contains, no, wit, only, labored, gags, ]","[contains, wit, labored, gags, ]","[contain, wit, labor, gag, ]"
2,that loves its characters and communicates something rather beautiful about human nature,1,that loves its characters and communicates something rather beautiful about human nature,"[that, loves, its, characters, and, communicates, something, rather, beautiful, about, human, na...","[loves, characters, communicates, something, rather, beautiful, human, nature, ]","[love, charact, commun, someth, rather, beauti, human, natur, ]"
3,remains utterly satisfied to remain the same throughout,0,remains utterly satisfied to remain the same throughout,"[remains, utterly, satisfied, to, remain, the, same, throughout, ]","[remains, utterly, satisfied, remain, throughout, ]","[remain, utterli, satisfi, remain, throughout, ]"
4,on the worst revenge-of-the-nerds clichés the filmmakers could dredge up,0,on the worst revengeofthenerds clichés the filmmakers could dredge up,"[on, the, worst, revengeofthenerds, clichés, the, filmmakers, could, dredge, up, ]","[worst, revengeofthenerds, clichés, filmmakers, could, dredge, ]","[worst, revengeofthenerd, cliché, filmmak, could, dredg, ]"
...,...,...,...,...,...,...
67344,a delightful comedy,1,a delightful comedy,"[a, delightful, comedy, ]","[delightful, comedy, ]","[delight, comedi, ]"
67345,"anguish , anger and frustration",0,anguish anger and frustration,"[anguish, anger, and, frustration, ]","[anguish, anger, frustration, ]","[anguish, anger, frustrat, ]"
67346,"at achieving the modest , crowd-pleasing goals it sets for itself",1,at achieving the modest crowdpleasing goals it sets for itself,"[at, achieving, the, modest, crowdpleasing, goals, it, sets, for, itself, ]","[achieving, modest, crowdpleasing, goals, sets, ]","[achiev, modest, crowdpleas, goal, set, ]"
67347,a patient viewer,1,a patient viewer,"[a, patient, viewer, ]","[patient, viewer, ]","[patient, viewer, ]"


In [9]:
"""Lemmatizing"""
# Process of grouping together the inflected forms of a word so they can be analyzed as a single term, -
# - identified by the word's lemma
# Using vocabulary analysis of words aiming to remove inflectional endings to return the dictionary form of a word
# computationally expensive / accurate - returns a word in dictionary

# nltk.download('wordnet')
lm = nltk.WordNetLemmatizer()

def lammitize(tokenized_text):
    return [lm.lemmatize(word) for word in tokenized_text]

data['sentence_lemmatized'] = data['sentence_nonstop'].apply(lambda x: lammitize(x))
data

Unnamed: 0,sentence,label,sentence_clean,sentence_tokenized,sentence_nonstop,sentence_stemmed,sentence_lemmatized
0,hide new secretions from the parental units,0,hide new secretions from the parental units,"[hide, new, secretions, from, the, parental, units, ]","[hide, new, secretions, parental, units, ]","[hide, new, secret, parent, unit, ]","[hide, new, secretion, parental, unit, ]"
1,"contains no wit , only labored gags",0,contains no wit only labored gags,"[contains, no, wit, only, labored, gags, ]","[contains, wit, labored, gags, ]","[contain, wit, labor, gag, ]","[contains, wit, labored, gag, ]"
2,that loves its characters and communicates something rather beautiful about human nature,1,that loves its characters and communicates something rather beautiful about human nature,"[that, loves, its, characters, and, communicates, something, rather, beautiful, about, human, na...","[loves, characters, communicates, something, rather, beautiful, human, nature, ]","[love, charact, commun, someth, rather, beauti, human, natur, ]","[love, character, communicates, something, rather, beautiful, human, nature, ]"
3,remains utterly satisfied to remain the same throughout,0,remains utterly satisfied to remain the same throughout,"[remains, utterly, satisfied, to, remain, the, same, throughout, ]","[remains, utterly, satisfied, remain, throughout, ]","[remain, utterli, satisfi, remain, throughout, ]","[remains, utterly, satisfied, remain, throughout, ]"
4,on the worst revenge-of-the-nerds clichés the filmmakers could dredge up,0,on the worst revengeofthenerds clichés the filmmakers could dredge up,"[on, the, worst, revengeofthenerds, clichés, the, filmmakers, could, dredge, up, ]","[worst, revengeofthenerds, clichés, filmmakers, could, dredge, ]","[worst, revengeofthenerd, cliché, filmmak, could, dredg, ]","[worst, revengeofthenerds, clichés, filmmaker, could, dredge, ]"
...,...,...,...,...,...,...,...
67344,a delightful comedy,1,a delightful comedy,"[a, delightful, comedy, ]","[delightful, comedy, ]","[delight, comedi, ]","[delightful, comedy, ]"
67345,"anguish , anger and frustration",0,anguish anger and frustration,"[anguish, anger, and, frustration, ]","[anguish, anger, frustration, ]","[anguish, anger, frustrat, ]","[anguish, anger, frustration, ]"
67346,"at achieving the modest , crowd-pleasing goals it sets for itself",1,at achieving the modest crowdpleasing goals it sets for itself,"[at, achieving, the, modest, crowdpleasing, goals, it, sets, for, itself, ]","[achieving, modest, crowdpleasing, goals, sets, ]","[achiev, modest, crowdpleas, goal, set, ]","[achieving, modest, crowdpleasing, goal, set, ]"
67347,a patient viewer,1,a patient viewer,"[a, patient, viewer, ]","[patient, viewer, ]","[patient, viewer, ]","[patient, viewer, ]"


In [10]:
"""Lemmatizing vs Stemming"""

'Lemmatizing vs Stemming'

# Text representation - TF-IDF Vectorization

In [11]:
def clean_text(text):
    text_nopunct = "".join([char for char in text if char not in string.punctuation])
    tokens = re.split('\W+', text_nopunct)
    text = [sm.stem(word) for word in tokens if word not in stopwords]
    return text

In [12]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['sentence'])
print(X_tfidf.shape)


(67349, 10782)
Wall time: 5.51 s


In [16]:
%%time
X_tfidf_df = pd.DataFrame(X_tfidf.toarray())
X_tfidf_df.columns = tfidf_vect.get_feature_names()
X_tfidf_df

Wall time: 1.37 s


Unnamed: 0,Unnamed: 1,1,10,100,10000,100minut,101,103minut,105,10cours,...,zish,ziyi,zoe,zombi,zombieland,zone,zoom,zwick,zzzzzzzzz,élan
0,0.056857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.064563,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.058931,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.053163,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.054485,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67344,0.122613,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
67345,0.069053,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
67346,0.056691,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
67347,0.090751,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


A good visual explanation of tf-idf:

https://towardsdatascience.com/tf-idf-a-visual-explainer-and-python-implementation-on-presidential-inauguration-speeches-2a7671168550


# Modeling and Pattern Mining

In [21]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf_df, data['label'], test_size=0.2)

In [27]:
%%time
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
rf_model = rf.fit(X_train, y_train)

Wall time: 3min 10s


In [24]:
sorted(zip(rf_model.feature_importances_, X_train.columns), reverse=True)[0:10]

[(0.029792421730815463, 0),
 (0.010963951634204903, 6476),
 (0.009846435547805012, 786),
 (0.0049169889744338675, 3984),
 (0.004824584195921789, 3462),
 (0.004484174970105125, 894),
 (0.004391072662983779, 972),
 (0.004375640402347129, 5288),
 (0.004237105529714623, 5623),
 (0.004172767553374376, 6936)]

In [25]:
y_pred = rf_model.predict(X_test)
precision, recall, fscore, support = score(y_test, y_pred, pos_label=1, average='binary')

In [26]:
print('Precision: {} / Recall: {} / Accuracy: {}'.format(round(precision, 3),
                                                        round(recall, 3),
                                                        round((y_pred==y_test).sum() / len(y_pred),3)))

Precision: 0.901 / Recall: 0.89 / Accuracy: 0.884


In [53]:
rf_model.score

<bound method ClassifierMixin.score of RandomForestClassifier(n_estimators=50, n_jobs=-1)>