## 第6章: 機械学習

In [133]:
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [8]:
df = pd.read_csv('./NewsAggregatorDataset/newsCorpora.csv', delimiter='\t', names=['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP'])
df.head()

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
1,2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
2,3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550
3,4,"Fed risks falling 'behind the curve', Charles ...",http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793
4,5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027


In [10]:
target_df = df.query('PUBLISHER == "Reuters" or PUBLISHER == "Huffington Post" or PUBLISHER == "Businessweek" or PUBLISHER == "Contactmusic.com" or PUBLISHER == "Daily Mail"')
target_df['PUBLISHER'].value_counts()

PUBLISHER
Reuters             3902
Huffington Post     2455
Businessweek        2395
Contactmusic.com    2334
Daily Mail          2254
Name: count, dtype: int64

In [38]:
train_df, valid_df = train_test_split(target_df, test_size=0.2, random_state=0)
valid_df, test_df = train_test_split(valid_df, test_size=0.5, random_state=0)
print(len(train_df), len(valid_df), len(test_df))

train_df[['CATEGORY', 'TITLE']].to_csv('sample.txt', sep='\t', index=False, header=False)
valid_df[['CATEGORY', 'TITLE']].to_csv('valid.txt', sep='\t', index=False, header=False)
test_df[['CATEGORY', 'TITLE']].to_csv('test.txt', sep='\t', index=False, header=False)

10672 1334 1334


In [99]:
print(train_df['CATEGORY'].value_counts())
print(valid_df['CATEGORY'].value_counts())
print(test_df['CATEGORY'].value_counts())

CATEGORY
b    4481
e    4240
t    1214
m     737
Name: count, dtype: int64
CATEGORY
b    575
e    528
t    137
m     94
Name: count, dtype: int64
CATEGORY
b    571
e    511
t    173
m     79
Name: count, dtype: int64


In [141]:
from sklearn.feature_extraction.text import TfidfVectorizer

def create_corpus(df):
    corpus = df.TITLE.values.tolist()
    for i in range(len(corpus)):
        text = corpus[i].replace("...", "")
        text = re.sub(r"[!\'\"()*+,-./:;<=>?@]", "", text)
        text = re.sub("\d+", "0", text)
        corpus[i] = text.strip()
    return corpus

train_corpus = create_corpus(train_df)
vectorizer = TfidfVectorizer(analyzer='word', use_idf=True, norm='l2', smooth_idf=True)
fit_vec = vectorizer.fit(train_corpus)

In [148]:
X_train = fit_vec.transform(train_corpus).toarray()
X_train_df = pd.DataFrame(X_train)
X_train_df.to_pickle('train.feature.pickle')

valid_corpus = create_corpus(valid_df)
X_valid = fit_vec.transform(valid_corpus).toarray()
X_valid_df = pd.DataFrame(X_valid)
X_valid_df.to_pickle('valid.feature.pickle')

test_corpus = create_corpus(test_df)
X_test = fit_vec.transform(test_corpus).toarray()
X_test_df = pd.DataFrame(X_test)
X_test_df.to_pickle('test.feature.pickle')

y_train = train_df.CATEGORY
y_valid = valid_df.CATEGORY
y_test = test_df.CATEGORY

In [68]:
# spacyによる文章ベクトルを特徴量として使用
import spacy
from tqdm import tqdm

nlp = spacy.load('en_core_web_sm')

def output_feature(df):
    columns = [f'feature_{i}' for i in range(96)]
    columns.insert(0, 'category')
    new_df = pd.DataFrame(columns=columns)
    for idx in tqdm(range(df.shape[0])):
        text = df.TITLE.iloc[idx].replace('...', '')
        vec = list(nlp(text).vector)
        vec.insert(0, df.CATEGORY.iloc[idx])
        new_df = pd.concat([new_df, pd.DataFrame(np.array(vec).reshape(1,-1), columns=columns)], axis=0)
    return new_df

train_feature = output_feature(train_df)
valid_feature = output_feature(valid_df)
test_feature = output_feature(test_df)

train_feature.to_csv('train.feature.txt', sep='\t', index=False, header=False)
valid_feature.to_csv('valid.feature.txt', sep='\t', index=False, header=False)
test_feature.to_csv('test.feature.txt', sep='\t', index=False, header=False)

100%|██████████| 1334/1334 [00:09<00:00, 140.17it/s]
100%|██████████| 1334/1334 [00:09<00:00, 144.06it/s]


In [149]:
# 学習
lr = LogisticRegression(random_state=0, max_iter=10000)
lr.fit(X_train, y_train)

In [150]:
def predict(X):
    pred = lr.predict(X)
    proba = np.max(lr.predict_proba(X), axis=1)
    return pred, proba

train_pred, train_proba = predict(X_train)
test_pred, test_proba = predict(X_test)

for i in range(10):
    print(test_pred[i], test_proba[i])

b 0.9126391579786374
e 0.8572112601541371
e 0.5614529778737961
e 0.6752785290102353
b 0.5618398158946027
e 0.6450913101106117
b 0.942423972545519
b 0.8456812246921981
b 0.5996355749233481
b 0.8294478750675441


In [151]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

print(f'train acc: {accuracy_score(y_train, train_pred)}')
print(f'test acc: {accuracy_score(y_test, test_pred)}')

train acc: 0.9404047976011994
test acc: 0.8778110944527736


In [152]:
print(f'train confusion matrix:\n {confusion_matrix(y_train, train_pred)}')
print(f'test confusion matrix:\n {confusion_matrix(y_test, test_pred)}')

train confusion matrix:
 [[4409   45    4   23]
 [  23 4214    1    2]
 [ 100  141  494    2]
 [ 183  111    1  919]]
test confusion matrix:
 [[539  24   1   7]
 [  9 499   0   3]
 [ 19  19  40   1]
 [ 46  33   1  93]]
