In [8]:
from sklearn.datasets import load_files
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.pipeline import make_pipeline


reviews_train = load_files("data/aclImdb/train/")
reviews_test = load_files("data/aclImdb/test/")
text_train, y_train = reviews_train.data, reviews_train.target

bards_words = ["The fool doth think he is wise,", "but the wise man knows himself to be a fool."]
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}

In [None]:
# 7.2. Example Application: Sentiment Analysis of Movie Reviews

print(f"text_train type: {type(text_train)}")
print(f"text_train len: {len(text_train)}")
print(f"text_train[1]:\n{text_train[1]}")

text_train type: <class 'list'>
text_train len: 75000
text_train[1]:
b"Amount of disappointment I am getting these days seeing movies like Partner, Jhoom Barabar and now, Heyy Babyy is gonna end my habit of seeing first day shows.<br /><br />The movie is an utter disappointment because it had the potential to become a laugh riot only if the d\xc3\xa9butant director, Sajid Khan hadn't tried too many things. Only saving grace in the movie were the last thirty minutes, which were seriously funny elsewhere the movie fails miserably. First half was desperately been tried to look funny but wasn't. Next 45 minutes were emotional and looked totally artificial and illogical.<br /><br />OK, when you are out for a movie like this you don't expect much logic but all the flaws tend to appear when you don't enjoy the movie and thats the case with Heyy Babyy. Acting is good but thats not enough to keep one interested.<br /><br />For the positives, you can take hot actresses, last 30 minutes, some com

In [14]:
text_train = [doc.replace(b"<br />", b" ") for doc in text_train]
print(f"Samples per class(train): {np.bincount(y_train)}")

Samples per class(train): [12500 12500 50000]


In [20]:
text_test, y_test = reviews_test.data, reviews_test.target
print(f"Number of documents in the test data: {len(text_test)}")
print(f"Samples per class(test): {np.bincount(y_test)}")
text_test = [doc.replace(b"<br />", b" ") for doc in text_test]

Number of documents in the test data: 25000
Samples per class(test): [12500 12500]


In [None]:
# 7.3. Representing Text Data as a Bag of Words

# 7.3.1. Applying Bag-of-Words to a Toy Dataset

vect = CountVectorizer()
vect.fit(bards_words)

print(f"Vocabulary size: {len(vect.vocabulary_)}")
print(f"Vocabulary content: {vect.vocabulary_}")

Vocabulary size: 13
Vocabulary content: {'the': 9, 'fool': 3, 'doth': 2, 'think': 10, 'he': 4, 'is': 6, 'wise': 12, 'but': 1, 'man': 8, 'knows': 7, 'himself': 5, 'to': 11, 'be': 0}


In [6]:
bag_of_words = vect.transform(bards_words)
print(f"bag_of_words: {repr(bag_of_words)}")

bag_of_words: <Compressed Sparse Row sparse matrix of dtype 'int64'
	with 16 stored elements and shape (2, 13)>


In [7]:
print(f"Dense representation of bag_of_words: {bag_of_words.toarray()}")

Dense representation of bag_of_words: [[0 0 1 1 1 0 1 0 0 1 1 0 1]
 [1 1 0 1 0 1 0 1 1 1 0 1 1]]


In [10]:
# 7.3.2. Bag-of-Words for Movie Reviews

vect = CountVectorizer().fit(text_train)
X_train = vect.transform(text_train)
print(f"X_train:\n{repr(X_train)}")

X_train:
<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 10359806 stored elements and shape (75000, 124255)>


In [11]:
feature_names = vect.get_feature_names_out()
print(f"Feature length: {len(feature_names)}")
print(f"First 20 features:\n{feature_names[:20]}")
print(f"Features 20010 to 20030:\n{feature_names[20010:20030]}")
print(f"Every 2000th feature:\n{feature_names[::2000]}")

Feature length: 124255
First 20 features:
['00' '000' '0000' '0000000000000000000000000000000001' '0000000000001'
 '000000001' '000000003' '00000001' '000001745' '00001' '0001' '00015'
 '0002' '0007' '00083' '000ft' '000s' '000th' '001' '002']
Features 20010 to 20030:
['cheapen' 'cheapened' 'cheapening' 'cheapens' 'cheaper' 'cheapest'
 'cheapie' 'cheapies' 'cheapjack' 'cheaply' 'cheapness' 'cheapo'
 'cheapozoid' 'cheapquels' 'cheapskate' 'cheapskates' 'cheapy' 'chearator'
 'cheat' 'cheata']
Every 2000th feature:
['00' '_require_' 'aideed' 'announcement' 'asteroid' 'banquière'
 'besieged' 'bollwood' 'btvs' 'carboni' 'chcialbym' 'clotheth'
 'consecration' 'cringeful' 'deadness' 'devagan' 'doberman' 'duvall'
 'endocrine' 'existent' 'fetiches' 'formatted' 'garard' 'godlie' 'gumshoe'
 'heathen' 'honoré' 'immatured' 'interested' 'jewelry' 'kerchner' 'köln'
 'leydon' 'lulu' 'mardjono' 'meistersinger' 'misspells' 'mumblecore'
 'ngah' 'oedpius' 'overwhelmingly' 'penned' 'pleading' 'previlage'
 

In [15]:
scores = cross_val_score(LogisticRegression(solver='liblinear', max_iter=1000), X_train, y_train, cv=5)
print(f"Mean cross-validation accuracy: {np.mean(scores)}")



Mean cross-validation accuracy: 0.7085466666666667


In [None]:
grid_search = GridSearchCV(LogisticRegression(solver='liblinear'), param_grid, cv=5)
grid_search.fit(X_train, y_train)
print(f"Cross-validation best score: {grid_search.best_score_}")
print(f"Best parameter: {grid_search.best_params_}")



Cross-validation best score: 0.7168
Best parameter: {'C': 0.1}


In [27]:
X_test = vect.transform(text_test)
print(f"{grid_search.score(X_test, y_test):.2f}")

0.13


In [22]:
vect = CountVectorizer(min_df=5).fit(text_test)
X_train = vect.transform(text_train)
print(f"X_train with min_df=5: {repr(X_train)}")

X_train with min_df=5: <Compressed Sparse Row sparse matrix of dtype 'int64'
	with 9994708 stored elements and shape (75000, 26749)>


In [24]:
feature_names = vect.get_feature_names_out()
print(f"First 50 features:\n{feature_names[:50]}")
print(f"Features 20010 to 20030:\n{feature_names[20010:20030]}")
print(f"Every 700th feature:\n{feature_names[::700]}")

First 50 features:
['00' '000' '007' '01' '02' '03' '04' '05' '06' '07' '08' '09' '10' '100'
 '1000' '100th' '101' '102' '105' '107' '108' '109' '10th' '11' '110'
 '111' '116' '117' '11th' '12' '120' '125' '12th' '13' '130' '13th' '14'
 '140' '14th' '15' '150' '1500' '15th' '16' '16mm' '16s' '16th' '17' '170'
 '1700']
Features 20010 to 20030:
['riddler' 'riddles' 'ride' 'rider' 'riders' 'rides' 'ridge' 'ridgemont'
 'ridges' 'ridicule' 'ridiculed' 'ridicules' 'ridiculous' 'ridiculously'
 'ridiculousness' 'riding' 'ridley' 'riefenstahl' 'rife' 'riff']
Every 700th feature:
['00' 'affiliates' 'arbitrary' 'baritone' 'boats' 'caddy' 'childless'
 'completist' 'crazily' 'delinquents' 'distinguishable' 'egged'
 'exceptionally' 'fink' 'gain' 'grimace' 'hesitate' 'immune' 'invasions'
 'king' 'lips' 'marthy' 'mistaking' 'netherworld' 'othello' 'permeate'
 'pratfall' 'quip' 'renant' 'rotund' 'selfless' 'skilled' 'spoon'
 'succinctly' 'terminator' 'treasures' 'uns' 'warhol' 'yearning']


In [31]:
grid = GridSearchCV(LogisticRegression(solver='saga', max_iter=1000), param_grid, cv=5)
grid.fit(X_train, y_train)
print(f"Cross-validation best score: {grid.best_score_}")



Cross-validation best score: 0.6885866666666666




In [2]:
# 7.4. Stopwords

print(f"Number of stopwords: {len(ENGLISH_STOP_WORDS)}")
print(f"Every 10th stopword:\n{list(ENGLISH_STOP_WORDS)[::10]}")

Number of stopwords: 318
Every 10th stopword:
['at', 'un', 'a', 'therein', 'we', 'whose', 'yours', 'via', 'perhaps', 'could', 'cant', 'give', 'else', 'beforehand', 'of', 'nine', 'first', 'your', 'you', 'meanwhile', 'latterly', 'done', 'after', 'eg', 'cannot', 'always', 'above', 'then', 'must', 'something', 'ours', 'not']


In [3]:
vect = CountVectorizer(min_df=5, stop_words='english').fit(text_train)
X_train = vect.transform(text_train)
print(f"X_train with stopwords:\n{repr(X_train)}")

X_train with stopwords:
<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 6621682 stored elements and shape (75000, 44223)>


In [6]:
grid = GridSearchCV(LogisticRegression(solver='saga', max_iter=1000), param_grid, cv=5)
grid.fit(X_train, y_train)
print(f"Cross-validation best score: {grid.best_score_:.2f}")




Cross-validation best score: 0.71




In [9]:
# 7.5. Rescaling the Data with tf–idf

pipe = make_pipeline(TfidfVectorizer(min_df=5, norm=None), LogisticRegression(solver='saga', max_iter=1000))
param_grid = {'logisticregression__C': [0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(text_train, y_train)
print(f"Cross-validation best score: {grid.best_score_:.2f}")



Cross-validation best score: 0.71


In [10]:
vectorizer = grid.best_estimator_.named_steps['tfidfvectorizer']
X_train = vectorizer.transform(text_train)
max_value = X_train.max(axis=0).toarray().ravel()
sorted_by_tfidf = max_value.argsort()
feature_names = np.array(vectorizer.get_feature_names_out())
print(f"Features with lowest tf-idf:\n{feature_names[sorted_by_tfidf[:20]]}")
print(f"Features with highest tf-idf:\n{feature_names[sorted_by_tfidf[-20:]]}")


Features with lowest tf-idf:
['remained' 'acclaimed' 'combines' 'rapidly' 'uniformly' 'diverse'
 'avoiding' 'fills' 'feeble' 'admired' 'wherever' 'admission' 'abound'
 'starters' 'assure' 'pivotal' 'comprehend' 'deliciously' 'strung'
 'inadvertently']
Features with highest tf-idf:
['nukie' 'reno' 'dominick' 'taz' 'ling' 'rob' 'victoria' 'turtles'
 'khouri' 'lorenzo' 'id' 'zizek' 'elwood' 'nikita' 'rishi' 'timon'
 'titanic' 'zohan' 'pammy' 'godzilla']


In [11]:
sorted_by_tfidf = np.argsort(vectorizer.idf_)
print(f"Features with lowest tf-idf:\n{feature_names[sorted_by_tfidf[:100]]}")

Features with lowest tf-idf:
['the' 'and' 'of' 'to' 'this' 'is' 'it' 'in' 'that' 'but' 'for' 'with'
 'was' 'as' 'on' 'movie' 'not' 'br' 'one' 'be' 'have' 'are' 'film' 'you'
 'all' 'at' 'an' 'by' 'from' 'so' 'like' 'who' 'there' 'they' 'his' 'if'
 'out' 'just' 'about' 'he' 'or' 'has' 'what' 'some' 'can' 'good' 'when'
 'more' 'up' 'time' 'very' 'even' 'only' 'no' 'see' 'would' 'my' 'story'
 'really' 'which' 'well' 'had' 'me' 'than' 'their' 'much' 'were' 'get'
 'other' 'do' 'been' 'most' 'also' 'into' 'don' 'her' 'first' 'great'
 'how' 'made' 'people' 'will' 'make' 'because' 'way' 'could' 'bad' 'we'
 'after' 'them' 'too' 'any' 'then' 'movies' 'watch' 'she' 'think' 'seen'
 'acting' 'its']
