In [20]:
import os
import numpy as np
from scipy import sparse
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import mglearn
# Plot styling: use the 'Malgun Gothic' font family and draw minus signs with
# the plain ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams.update({
    'font.family': 'Malgun Gothic',
    'axes.unicode_minus': False,
})
%config InlineBackend.figure_format = 'svg'
from mpl_toolkits.mplot3d import Axes3D, axes3d
from scipy.cluster import hierarchy
import seaborn as sns

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

In [22]:
from sklearn.datasets import load_files

In [23]:
# Show the directory layout of the dataset folder via the shell's `tree` command.
!tree aclImdb

폴더 PATH의 목록입니다.
볼륨 일련 번호가 000000B0 0481:35AC입니다.
C:\USERS\ADMIN\EXERCISE_BOOK\ML_WITH_PYTHON\ACLIMDB
├─test
│  ├─neg
│  └─pos
├─train
│  ├─neg
│  └─pos
└─unsup


In [24]:
# Read the training split from disk; the result holds the raw documents
# under 'data' and their labels under 'target'.
reviews_train = load_files(container_path='aclImdb/train/')

In [25]:
# Pull the raw documents and their class labels out of the loaded bunch.
text_train = reviews_train['data']
y_train = reviews_train['target']

In [26]:
# The documents are byte strings; swap the HTML line-break markup for a space.
text_train = [
    review.replace(b'<br />', b' ')
    for review in text_train
]

In [27]:
# Count how many training labels fall in each class (array index = class id).
np.bincount(y_train)

array([12500, 12500], dtype=int64)

In [28]:
# Number of training documents.
len(text_train)

25000

In [29]:
# Read the test split from disk; the result holds the raw documents
# under 'data' and their labels under 'target'.
reviews_test = load_files(container_path='aclImdb/test/')

In [30]:
# Pull the raw documents and their class labels out of the loaded bunch.
text_test = reviews_test['data']
y_test = reviews_test['target']

In [31]:
# The documents are byte strings; swap the HTML line-break markup for a space.
text_test = [
    review.replace(b'<br />', b' ')
    for review in text_test
]

In [32]:
# Two-sentence toy corpus used to illustrate the bag-of-words encoding.
bard_words = [
    'The fool doth think he is wise,',
    'but the wise man knows himself to be a fool',
]

In [33]:
# Learn the vocabulary of the toy corpus.
vect = CountVectorizer()
vect.fit(bard_words)

# Display the raw token -> column-index mapping next to the same mapping
# sorted alphabetically by token.
(
    vect.vocabulary_,
    sorted(vect.vocabulary_.items()),
)

({'the': 9,
  'fool': 3,
  'doth': 2,
  'think': 10,
  'he': 4,
  'is': 6,
  'wise': 12,
  'but': 1,
  'man': 8,
  'knows': 7,
  'himself': 5,
  'to': 11,
  'be': 0},
 [('be', 0),
  ('but', 1),
  ('doth', 2),
  ('fool', 3),
  ('he', 4),
  ('himself', 5),
  ('is', 6),
  ('knows', 7),
  ('man', 8),
  ('the', 9),
  ('think', 10),
  ('to', 11),
  ('wise', 12)])

In [34]:
# Encode both sentences with the fitted vocabulary and convert the result to a
# dense array so the per-token counts can be read directly.
vect.transform(bard_words).toarray()

array([[0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1],
       [1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1]], dtype=int64)

In [35]:
# Learn the vocabulary and build the training document-term matrix in one
# pass. fit_transform yields the same result as fit(...) followed by
# transform(...) on the same corpus, but each review is tokenized only once.
vect = CountVectorizer()
X_train = vect.fit_transform(text_train)

# Show the sparse matrix summary (shape, dtype, number of stored entries).
X_train

<25000x74849 sparse matrix of type '<class 'numpy.int64'>'
	with 3431196 stored elements in Compressed Sparse Row format>

In [36]:
# Peek at 100 entries of the learned vocabulary (indices 20000-20099).
vect.get_feature_names_out()[20000:20100]

array(['draper', 'draperies', 'drapery', 'drapes', 'draskovic', 'drastic',
       'drastically', 'drat', 'dratch', 'dratic', 'dratted', 'draub',
       'draught', 'draughts', 'draughtswoman', 'draw', 'drawback',
       'drawbacks', 'drawer', 'drawers', 'drawing', 'drawings', 'drawl',
       'drawled', 'drawling', 'drawn', 'draws', 'draza', 'dre', 'drea',
       'dread', 'dreadcentral', 'dreaded', 'dreadful', 'dreadfull',
       'dreadfully', 'dreading', 'dreadlocks', 'dreads', 'dreaful',
       'dream', 'dreama', 'dreamboat', 'dreamcast', 'dreamcatcher',
       'dreamcatchers', 'dreamed', 'dreamer', 'dreamers', 'dreamgirl',
       'dreamgirls', 'dreamily', 'dreamin', 'dreaming', 'dreamland',
       'dreamless', 'dreamlike', 'dreamquest', 'dreams', 'dreamscape',
       'dreamscapes', 'dreamstate', 'dreamt', 'dreamtime', 'dreamworks',
       'dreamworld', 'dreamy', 'drearily', 'dreariness', 'dreary',
       'dreck', 'drecky', 'dredd', 'dredge', 'dredged', 'dregs',
       'dreichness', 'd

In [37]:
# Candidate values for LogisticRegression's C parameter.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}

# Cross-validated search over C, run in parallel on all available cores.
grid = GridSearchCV(
    estimator=LogisticRegression(max_iter=5000),
    param_grid=param_grid,
    n_jobs=-1,
)
grid.fit(X_train, y_train);  # trailing ';' keeps the cell output empty

In [38]:
# Parameter setting selected by the grid search.
grid.best_params_

{'C': 0.1}

In [40]:
# Encode the test reviews with the vocabulary fitted on the training reviews.
X_test = vect.transform(text_test)

# Score the fitted grid search on the test split.
grid.score(X=X_test, y=y_test)

0.87896

In [42]:
# Rebuild the vocabulary with min_df=5, which drops tokens whose document
# frequency is below 5. fit_transform learns the vocabulary and encodes the
# training reviews in one pass instead of tokenizing the corpus twice with
# fit(...) followed by transform(...); the resulting matrix is the same.
vect = CountVectorizer(min_df=5)
X_train = vect.fit_transform(text_train)

# The test reviews are only transformed, so they share the training vocabulary.
X_test = vect.transform(text_test)

In [43]:
# Feature names of the current vectorizer; display the first 100 entries.
feature_names = vect.get_feature_names_out()
feature_names[:100]

array(['00', '000', '007', '00s', '01', '02', '03', '04', '05', '06',
       '07', '08', '09', '10', '100', '1000', '100th', '101', '102',
       '103', '104', '105', '107', '108', '10s', '10th', '11', '110',
       '112', '116', '117', '11th', '12', '120', '12th', '13', '135',
       '13th', '14', '140', '14th', '15', '150', '15th', '16', '160',
       '1600', '16mm', '16s', '16th', '17', '17th', '18', '180', '1800',
       '1800s', '1830', '1840', '1860', '1876', '1880', '1890', '1890s',
       '1895', '1898', '18th', '19', '1900', '1900s', '1902', '1909',
       '1910', '1912', '1913', '1914', '1915', '1916', '1917', '1918',
       '1919', '1920', '1920s', '1921', '1922', '1924', '1925', '1926',
       '1927', '1928', '1929', '1930', '1930s', '1931', '1932', '1933',
       '1934', '1935', '1936', '1937', '1938'], dtype=object)

In [44]:
# Candidate values for LogisticRegression's C parameter.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}

# Repeat the cross-validated search over C on the current X_train.
grid = GridSearchCV(
    estimator=LogisticRegression(max_iter=5000),
    param_grid=param_grid,
    n_jobs=-1,
)
grid.fit(X_train, y_train);  # trailing ';' keeps the cell output empty

In [None]:
# Summarize the search: best mean cross-validation score, the parameters that
# achieved it, and the score on the test split. The original line was
# missing a comma after best_score_ (a SyntaxError) and called score() with an
# undefined `X` and no labels.
grid.best_score_, grid.best_params_, grid.score(X_test, y_test)