## 第6章: 機械学習

In [11]:
import csv

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [8]:
# Load the tab-separated News Aggregator corpus. The file has no header row,
# so the column names are supplied explicitly.
# quoting=csv.QUOTE_NONE: headlines may contain literal double-quote
# characters. With the default quoting rules the parser treats them as field
# quoting and can merge several records into one, silently losing rows.
df = pd.read_csv(
    './NewsAggregatorDataset/newsCorpora.csv',
    sep='\t',
    names=['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP'],
    quoting=csv.QUOTE_NONE,
)
df.head()

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
1,2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
2,3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550
3,4,"Fed risks falling 'behind the curve', Charles ...",http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793
4,5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027


In [10]:
# Restrict the corpus to the five publishers used in this chapter, then show
# how many articles each of them contributes.
target_df = df[
    df['PUBLISHER'].isin([
        'Reuters',
        'Huffington Post',
        'Businessweek',
        'Contactmusic.com',
        'Daily Mail',
    ])
]
target_df['PUBLISHER'].value_counts()

PUBLISHER
Reuters             3902
Huffington Post     2455
Businessweek        2395
Contactmusic.com    2334
Daily Mail          2254
Name: count, dtype: int64

In [38]:
# 80% train; the remaining 20% is halved into validation and test (10% each).
# A fixed random_state keeps both shuffles reproducible.
train_df, valid_df = train_test_split(target_df, test_size=0.2, random_state=0)
valid_df, test_df = train_test_split(valid_df, test_size=0.5, random_state=0)
print(len(train_df), len(valid_df), len(test_df))

# Persist each split as "<CATEGORY>\t<TITLE>" lines without header or index.
# The training split goes to train.txt (it was previously written to
# sample.txt) so the name matches valid.txt / test.txt and train.feature.txt.
train_df[['CATEGORY', 'TITLE']].to_csv('train.txt', sep='\t', index=False, header=False)
valid_df[['CATEGORY', 'TITLE']].to_csv('valid.txt', sep='\t', index=False, header=False)
test_df[['CATEGORY', 'TITLE']].to_csv('test.txt', sep='\t', index=False, header=False)

10672 1334 1334


In [39]:
# Print the label distribution of every split (train, validation, test) to
# check that the category proportions are comparable.
for split_df in (train_df, valid_df, test_df):
    print(split_df['CATEGORY'].value_counts())

CATEGORY
b    4481
e    4240
t    1214
m     737
Name: count, dtype: int64
CATEGORY
b    575
e    528
t    137
m     94
Name: count, dtype: int64
CATEGORY
b    571
e    511
t    173
m     79
Name: count, dtype: int64


In [68]:
# !pip install spacy
import spacy
from tqdm import tqdm

# spaCy pipeline shared by the feature-extraction code below; each headline
# is passed through it to obtain a document vector.
nlp = spacy.load(name='en_core_web_sm')

def output_feature(df, nlp_model=None, show_progress=True):
    """Turn each headline in ``df`` into one row of document-vector features.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``TITLE`` (text) and ``CATEGORY`` (label) columns.
    nlp_model : callable, optional
        Called as ``nlp_model(text)``; the result must expose a ``.vector``
        array. Defaults to the notebook-level ``nlp`` pipeline.
    show_progress : bool, default True
        Wrap the iteration in ``tqdm`` to display a progress bar.

    Returns
    -------
    pandas.DataFrame
        One row per input row: ``category`` first, then ``feature_0`` ...
        ``feature_{d-1}`` where ``d`` is the vector length. Feature columns
        keep the vectors' numeric dtype and the index is a fresh RangeIndex.
    """
    pipeline = nlp if nlp_model is None else nlp_model

    rows = zip(df['CATEGORY'], df['TITLE'])
    if show_progress:
        rows = tqdm(rows, total=len(df))

    # Collect everything in plain lists and build the frame once. Calling
    # pd.concat inside the loop re-copies all previous rows on every step.
    categories, vectors = [], []
    for category, title in rows:
        # Drop literal "..." sequences from the title before vectorizing.
        cleaned = title.replace('...', '')
        categories.append(category)
        vectors.append(pipeline(cleaned).vector)

    # Take the width from the data instead of hard-coding it; fall back to
    # the previously hard-coded 96 so an empty input keeps the same columns.
    width = len(vectors[0]) if vectors else 96
    feature_columns = [f'feature_{i}' for i in range(width)]

    # Keep labels and vectors apart until the end: putting them into a single
    # np.array coerces every float to a string.
    matrix = np.asarray(vectors).reshape(len(vectors), width)
    features = pd.DataFrame(matrix, columns=feature_columns)
    features.insert(0, 'category', categories)
    return features

# Vectorize the three splits in order (train, validation, test).
train_feature, valid_feature, test_feature = (
    output_feature(split_df) for split_df in (train_df, valid_df, test_df)
)

# Save each feature table as tab-separated text without header or index.
for _features, _path in (
    (train_feature, 'train.feature.txt'),
    (valid_feature, 'valid.feature.txt'),
    (test_feature, 'test.feature.txt'),
):
    _features.to_csv(_path, sep='\t', index=False, header=False)

100%|██████████| 1334/1334 [00:09<00:00, 140.17it/s]
100%|██████████| 1334/1334 [00:09<00:00, 144.06it/s]


In [91]:
from sklearn.linear_model import LogisticRegression

# Separate every feature table into its design matrix (all feature_* columns)
# and its label vector (the 'category' column).
X_train = train_feature.drop(columns=['category'])
y_train = train_feature['category']

X_valid = valid_feature.drop(columns=['category'])
y_valid = valid_feature['category']

X_test = test_feature.drop(columns=['category'])
y_test = test_feature['category']

In [92]:
# Fit a logistic-regression classifier on the training features. The seed is
# fixed for reproducibility and the iteration cap is raised to 10000.
lr = LogisticRegression(max_iter=10000, random_state=0)
lr.fit(X=X_train, y=y_train)

In [95]:
def predict(X, model=None):
    """Return predicted labels and the probability of each predicted label.

    Parameters
    ----------
    X : array-like
        Feature matrix accepted by the model's ``predict`` / ``predict_proba``.
    model : estimator, optional
        Object exposing ``predict`` and ``predict_proba``. Defaults to the
        notebook-level ``lr`` classifier, as in the original one-argument form.

    Returns
    -------
    tuple
        ``(pred, proba)``: ``pred`` is the result of ``model.predict(X)`` and
        ``proba`` holds, per sample, the largest value along axis 1 of
        ``model.predict_proba(X)``.
    """
    classifier = lr if model is None else model
    pred = classifier.predict(X)
    # The row-wise maximum is the probability assigned to the winning class.
    proba = np.max(classifier.predict_proba(X), axis=1)
    return pred, proba

# Predict on the training and test splits.
train_pred, train_proba = predict(X_train)
test_pred, test_proba = predict(X_test)

# One line per test sample: predicted category and its probability.
for label, confidence in zip(test_pred, test_proba):
    print(label, confidence)

e 0.5769137234500876
e 0.7358313734293878
e 0.8434907475800886
b 0.4677121803559529
b 0.6392582521912762
e 0.9055836495620512
b 0.9245269070303335
b 0.8404045863977334
e 0.3643321838765244
b 0.8606217910786843
b 0.502061898759152
e 0.8759688826778592
e 0.6159599236637032
b 0.8472495431436099
b 0.9176054113396452
e 0.7279974164389038
e 0.8254797856332563
b 0.9326510270752131
b 0.9267909729922403
b 0.5686710371352786
t 0.3631173444946992
b 0.817287047894054
b 0.7833970681655679
e 0.4705103166813039
e 0.4128901795256462
b 0.9042828784945757
e 0.45368527127325287
b 0.6640390491386693
e 0.9427178175803469
b 0.6811294692488435
b 0.5258018978755654
b 0.8305620470550056
e 0.5522624805878837
e 0.811308454660211
b 0.8082013294961735
t 0.44429021753500786
e 0.807860432650162
e 0.7803833412369298
e 0.5848163666170738
e 0.5275707191157908
e 0.8690683891044753
e 0.4692809057022485
b 0.9070630264923467
b 0.7398927277999876
e 0.938042141401128
e 0.7047530763021616
e 0.6406631531326065
b 0.958116504957

In [97]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Accuracy on the training split, then on the held-out test split.
train_acc = accuracy_score(y_train, train_pred)
print(f'train acc: {train_acc}')
test_acc = accuracy_score(y_test, test_pred)
print(f'test acc: {test_acc}')

train acc: 0.7069902548725637
test acc: 0.7038980509745127


In [98]:
# Confusion matrices for the training and test splits. Rows are the true
# categories and columns the predicted ones, in sorted label order.
train_cm = confusion_matrix(y_train, train_pred)
print(f'train confusion matrix:\n {train_cm}')
test_cm = confusion_matrix(y_test, test_pred)
print(f'test confusion matrix:\n {test_cm}')

train confusion matrix:
 [[3790  585   46   60]
 [ 570 3580   29   61]
 [ 386  241   72   38]
 [ 626  448   37  103]]
test confusion matrix:
 [[474  81   3  13]
 [ 55 445   4   7]
 [ 45  24   6   4]
 [ 99  56   4  14]]
