# 第6章

## 50.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

SEED = 0

news_corpora = pd.read_csv('data/NewsAggregatorDataset/newsCorpora.csv', sep='\t', header=None, names=['id', 'title', 'url', 'publisher', 'category', 'story', 'hostname', 'timestamp'])

news_corpora = news_corpora.loc[news_corpora['publisher'].isin(['Reuters', 'Huffington Post', 'Businessweek', 'Contactmusic.com', 'Daily Mail']), ['category', 'title']]

train, valid_test = train_test_split(news_corpora, test_size=0.2, random_state=SEED, shuffle=True, stratify=news_corpora['category'])
valid, test = train_test_split(valid_test, test_size=0.5, random_state=SEED, shuffle=True, stratify=valid_test['category'])

train.to_csv('out/train.txt', sep='\t', index=False)
valid.to_csv('out/valid.txt', sep='\t', index=False)
test.to_csv('out/test.txt', sep='\t', index=False)

print('学習データ')
print(train['category'].value_counts())
print()
print('検証データ')
print(valid['category'].value_counts())
print()
print('評価データ')
print(test['category'].value_counts())

学習データ
b    4502
e    4223
t    1219
m     728
Name: category, dtype: int64

検証データ
b    562
e    528
t    153
m     91
Name: category, dtype: int64

評価データ
b    563
e    528
t    152
m     91
Name: category, dtype: int64


## 51.

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

train_valid = pd.concat([train, valid])

vectorizer = TfidfVectorizer()

train_valid_feature = vectorizer.fit_transform(train_valid['title'])
test_feature = vectorizer.transform(test['title'])

train_valid_feature = pd.DataFrame(train_valid_feature.toarray(), columns=vectorizer.get_feature_names())
test_feature = pd.DataFrame(test_feature.toarray(), columns=vectorizer.get_feature_names())

train_feature = train_valid_feature[:len(train)]
valid_feature = train_valid_feature[len(train):]

train_feature.to_csv('out/train.feature.txt', sep='\t', index=False)
valid_feature.to_csv('out/valid.feature.txt', sep='\t', index=False)
test_feature.to_csv('out/test.feature.txt', sep='\t', index=False)

train_feature.head()

Unnamed: 0,00,07,08,09,0ff,0ut,10,100,1000,10000,...,zombies,zone,zooey,zoosk,zuckerberg,zynga,zâ,œlousyâ,œpiece,œwaist
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 52.

In [3]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(train_feature, train['category'])

LogisticRegression()

## 53.

In [5]:
import numpy as np

train_pred = lr.predict(train_feature)
train_prob = np.max(lr.predict_proba(train_feature), axis=1)

test_pred = lr.predict(test_feature)
test_prob = np.max(lr.predict_proba(test_feature), axis=1)

(train_pred, train_prob)

(array(['b', 'b', 'e', ..., 'b', 'b', 'e'], dtype=object),
 array([0.61587764, 0.92895613, 0.7513382 , ..., 0.81752671, 0.98021737,
        0.73399155]))

## 54.

In [6]:
from sklearn.metrics import accuracy_score

train_accuracy = accuracy_score(train['category'], train_pred)
test_accuracy = accuracy_score(test['category'], test_pred)
print(f'正解率（学習データ）：{train_accuracy:.3f}')
print(f'正解率（評価データ）：{test_accuracy:.3f}')

正解率（学習データ）：0.947
正解率（評価データ）：0.885
