In [3]:
from sklearn.datasets import fetch_20newsgroups, fetch_mldata
from sklearn.feature_extraction.text import  TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from numpy import linalg as LA
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
import numpy as np
import scipy

In [2]:
newsgroups_train = fetch_20newsgroups(subset='train', data_home='../data/')
newsgroups_test = fetch_20newsgroups(subset='test', data_home='../data/')

vectortype_train = TfidfVectorizer(stop_words='english')
vectortype_test = TfidfVectorizer(stop_words='english')

news_vectored_result_train = vectortype_train.fit_transform(newsgroups_train.data)
news_vectored_result_test = vectortype_test.fit_transform(newsgroups_test.data)

news_train_target = newsgroups_train.target
news_train_target_names = newsgroups_train.target_names
news_test_target = newsgroups_test.target


train_features = vectortype_train.get_feature_names()
test_features = vectortype_test.get_feature_names()
common_features = np.intersect1d(train_features, test_features)

train_feature_final = np.searchsorted(train_features, common_features)
news_vt_train = news_vectored_result_train[:,train_feature_final]

test_feature_final = np.searchsorted(test_features, common_features)
news_vt_test = news_vectored_result_test[:,test_feature_final]

In [11]:
def calc_lrdt(train, test, train_target, test_target):
    lr = LogisticRegression()
    lr.fit(train, train_target)
    np.fliplr(np.argsort(np.absolute(lr.coef_)))
    predicted_lr = lr.predict(test)
    print('Accuaracy Score LR : ', accuracy_score(test_target, predicted_lr))
    
    dt = DecisionTreeClassifier()
    dt.fit(train, train_target)
    np.flip(np.argsort(np.absolute(dt.feature_importances_)), axis=0)
    predicted_dt = dt.predict(test)
    print('Accuaracy Score DT : ', accuracy_score(test_target, predicted_dt))
    del lr, dt

In [12]:
select_chi2 = SelectKBest(chi2, 200)
train_chi2 = select_chi2.fit_transform(news_vt_train, news_train_target)
test_chi2 = select_chi2.transform(news_vt_test)
calc_lrdt(train_chi2, test_chi2, news_train_target, news_test_target)

Accuaracy Score LR :  0.611391396707
Accuaracy Score DT :  0.542219861922


In [13]:
select_mutual = SelectKBest(mutual_info_classif, 200)
train_mutual = select_mutual.fit_transform(news_vt_train, news_train_target)
test_mutual = select_mutual.transform(news_vt_test)
calc_lrdt(train_mutual, test_mutual, news_train_target, news_test_target)

Accuaracy Score LR :  0.336298459904
Accuaracy Score DT :  0.232076473712
