In [1]:
import os
import sys
import numpy as np
from scipy import sparse
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import mglearn
# Use the 'Malgun Gothic' font for all figure text
# (presumably so that Korean labels render correctly -- TODO confirm).
plt.rcParams['font.family'] = 'Malgun Gothic'
# Draw minus signs as plain hyphens instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False
%config InlineBackend.figure_format = 'svg'
from mpl_toolkits.mplot3d import Axes3D, axes3d
from scipy.cluster import hierarchy
import seaborn as sns
import spacy
import nltk
from konlpy.tag import Okt
import graphviz
from sklearn.utils.fixes import loguniform

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [2]:
from sklearn.datasets import load_files

In [4]:
# Load the labelled IMDb training reviews.
# Restricting the load to the two sentiment folders keeps any other
# sub-directory under train/ (e.g. the `unsup` folder of unlabelled reviews in
# the stock archive) from being picked up as a third class. With only neg/pos
# present the result is unchanged: neg -> 0, pos -> 1.
reviews_train = load_files('aclImdb/train/', categories=['neg', 'pos'])
text_train, y_train = reviews_train['data'], reviews_train['target']
# Documents are raw bytes that still contain HTML line breaks; blank them out.
text_train = [doc.replace(b'<br />', b' ') for doc in text_train]

In [6]:
# Load the held-out IMDb reviews; documents are raw bytes, so the HTML line
# breaks are replaced with a space using a bytes pattern.
reviews_test = load_files('aclImdb/test/')
text_test = [raw.replace(b'<br />', b' ') for raw in reviews_test['data']]
y_test = reviews_test['target']

In [11]:
# Toy two-document corpus used to demonstrate the bag-of-words encoding.
bards_words = [
    'The fool doth think he is wise,',
    'but the wise man knows himself to be a fool',
]

In [14]:
# Learn the vocabulary of the toy corpus, then display the token -> column
# index mapping together with the number of distinct tokens.
vect = CountVectorizer()
vect.fit(bards_words)
(vect.vocabulary_, len(vect.vocabulary_))

({'the': 9,
  'fool': 3,
  'doth': 2,
  'think': 10,
  'he': 4,
  'is': 6,
  'wise': 12,
  'but': 1,
  'man': 8,
  'knows': 7,
  'himself': 5,
  'to': 11,
  'be': 0},
 13)

In [17]:
# Encode the toy corpus with the fitted vocabulary and convert the sparse
# result to a dense array for display (one row per document, one column per
# vocabulary entry).
vect.transform(bards_words).toarray()

array([[0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1],
       [1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1]], dtype=int64)

In [18]:
# Build the bag-of-words matrix for the training reviews.
# fit_transform learns the vocabulary and encodes the documents in one pass;
# calling fit() and then transform() on the same corpus would tokenise every
# review twice for an identical result.
vect = CountVectorizer()
X_train = vect.fit_transform(text_train)
X_train

<25000x74849 sparse matrix of type '<class 'numpy.int64'>'
	with 3431196 stored elements in Compressed Sparse Row format>

In [19]:
# Inspect the learned vocabulary (feature names in column order).
vect.get_feature_names_out()

array(['00', '000', '0000000000001', ..., 'østbye', 'über', 'üvegtigris'],
      dtype=object)

In [20]:
# Show every training review in which feature column 9 has a count of exactly 1.
# The comparison yields a sparse boolean column; .nonzero()[0] gives the row
# indices of its True entries.
[text_train[row] for row in (X_train[:, 9] == 1).nonzero()[0]]

[b'"Dutch Schultz", AKA Arthur Fleggenheimer, was a real person and his rather nasty life is fairly well documented. This movie which purports to depict his life should have used a fictional character, because the overdramatized events are too strong a departure from the facts and the chronology. Not only that, it ignores some interesting details which other versions have included such as the public relations fiasco in upstate N.Y. and his religious conversion. It is true that he was executed by Luciano, Lansky, et. al. but that\'s as far as it goes. The exploding plate scene which represents Luciano carrying out the execution of Bo Weinberg in his own home, assisted by his own mother is rediculous. Also, there is the scene in which Dutch approaches his own mother to pay protection to Legs Diamond. It just doesn\'t work. The character of Mrs. Fleggenheimer doesn\'t work either. This movie does not need a doting Jewish mother for comic relief. The lame representation of Legs Diamond was

In [24]:
# Row indices of the training reviews whose count for feature column 9 is exactly 1.
(X_train[:, 9] == 1).nonzero()[0]

array([  554,  6583,  7758,  9637, 10764, 15068, 16627, 17873, 22458,
       23321, 23473])

In [26]:
# Cross-validated accuracy of a logistic regression on the raw count features.
# NOTE: the vocabulary was fitted on all of text_train before the folds are
# split, so each fold's held-out part also contributed to the vocabulary.
scores = cross_val_score(
    LogisticRegression(max_iter=1000),
    X_train,
    y_train,
    n_jobs=-1,
)
scores.mean()

0.88132

In [30]:
# Tune the logistic-regression parameter C with a 5-fold grid search on the
# full-vocabulary count features, then report the best score and setting.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(
    LogisticRegression(max_iter=1000),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)
grid.fit(X_train, y_train)
(grid.best_score_, grid.best_params_)

(0.8881599999999998, {'C': 0.1})

In [31]:
# Encode the test reviews with the vocabulary learned on the training set
# (transform only, no refit) and score the tuned model on them.
X_test = vect.transform(text_test)
grid.score(X_test, y_test)

0.87896

In [32]:
# Rebuild the features keeping only tokens that occur in at least 5 training
# documents. fit_transform learns the pruned vocabulary and encodes the
# training reviews in a single pass rather than tokenising them twice.
vect = CountVectorizer(min_df=5)
X_train = vect.fit_transform(text_train)
# The test reviews are only transformed, using the training vocabulary.
X_test = vect.transform(text_test)
X_train

<25000x27271 sparse matrix of type '<class 'numpy.int64'>'
	with 3354014 stored elements in Compressed Sparse Row format>

In [33]:
# Inspect the vocabulary that remains after the min_df=5 pruning.
vect.get_feature_names_out()

array(['00', '000', '007', ..., 'zuniga', 'zwick', 'émigré'], dtype=object)

In [34]:
# Repeat the 5-fold grid search over C on the min_df=5 features and report
# the best cross-validated score and setting.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(
    LogisticRegression(max_iter=1000),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)
grid.fit(X_train, y_train)
(grid.best_score_, grid.best_params_)

(0.88812, {'C': 0.1})

In [36]:
# Number of entries in scikit-learn's built-in English stop-word list.
len(ENGLISH_STOP_WORDS)

318

In [37]:
# Rebuild the features with the built-in English stop words removed, still
# keeping only tokens found in at least 5 training documents. fit_transform
# learns the vocabulary and encodes the training reviews in a single pass
# rather than tokenising them twice.
vect = CountVectorizer(min_df=5, stop_words='english')
X_train = vect.fit_transform(text_train)
# The test reviews are only transformed, using the training vocabulary.
X_test = vect.transform(text_test)
X_train

<25000x26966 sparse matrix of type '<class 'numpy.int64'>'
	with 2149958 stored elements in Compressed Sparse Row format>

In [38]:
# Repeat the 5-fold grid search over C on the stop-word-filtered features and
# report the best cross-validated score and setting.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(
    LogisticRegression(max_iter=1000),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)
grid.fit(X_train, y_train)
(grid.best_score_, grid.best_params_)

(0.8828400000000001, {'C': 0.1})