# CSCI-UA 0473 - Introduction to Machine Learning
## Wednesday, March 1, 2017

In [1]:
import numpy
from pprint import pprint as pp
from sklearn import datasets
import pandas as pd

## 1. Data

In [2]:
newsgroups_train = datasets.fetch_20newsgroups(subset='train', 
                                               categories=['comp.sys.mac.hardware', 'rec.motorcycles', 'sci.med', 'soc.religion.christian'])

In [3]:
newsgroups_test = datasets.fetch_20newsgroups(subset='test', 
                                               categories=['comp.sys.mac.hardware', 'rec.motorcycles', 'sci.med', 'soc.religion.christian'])

In [4]:
classes = ['comp.sys.mac.hardware', 'rec.motorcycles', 'sci.med', 'soc.religion.christian']

## 2. Preprocessing

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [6]:
count_vectorizer = CountVectorizer(min_df=30)
count_vectors_train = count_vectorizer.fit_transform(newsgroups_train.data)
count_vectors_test = count_vectorizer.transform(newsgroups_test.data)

In [7]:
tfidf_vectorizer = TfidfVectorizer(min_df=30)
tfidf_vectors_train = tfidf_vectorizer.fit_transform(newsgroups_train.data)
tfidf_vectors_test = tfidf_vectorizer.transform(newsgroups_test.data)

In [5]:
# How does the vocabulary look?

# pp(vectorizer.vocabulary_)

## 3. Modeling

In [8]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

In [9]:
lr.fit(count_vectors_train, newsgroups_train.target)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [10]:
y_train = lr.predict(count_vectors_train)
y_test = lr.predict(count_vectors_test)

In [11]:
print ("Training Error: ", float(numpy.sum(abs(y_train - newsgroups_train.target)))/len(newsgroups_train.target))

('Training Error: ', 0.0)


In [12]:
print ("Test Error: ", float(numpy.sum(abs(y_test - newsgroups_test.target)))/len(newsgroups_test.target))

('Test Error: ', 0.1293595434369055)


## 4. Analysis

In [13]:
invocab = dict([(vv,kk) for kk, vv in count_vectorizer.vocabulary_.items()])

In [14]:
insens_idx = numpy.argsort(abs(lr.coef_[0:].squeeze()))
ordered_idx = numpy.argsort(lr.coef_[0:].squeeze())

In [17]:
def class_analysis(ordering_object, sensitivity, class_name = 0):

    if sensitivity:
        print 'Sensitive to:'
    else:
        print ('Favours ' + classes[class_name])
    print pd.DataFrame({'feature': pd.Series([invocab[f] for f in ordering_object[class_name][-10:]]), 
                        'coeff': lr.coef_[class_name, ordering_object[class_name][-10:]]}, columns = ['feature', 'coeff'])

    if not sensitivity:
        return
    print

    print 'Insensitive to:'
    print pd.DataFrame({'feature': pd.Series([invocab[f] for f in ordering_object[class_name][:10]]), 
                        'coeff': lr.coef_[class_name, ordering_object[class_name][:10]]}, columns = ['feature', 'coeff'])
    print

In [18]:
class_analysis(ordered_idx, sensitivity = False, class_name= 2)

Favours sci.med
       feature     coeff
0    sometimes  0.687912
1       cancer  0.802050
2       health  0.808605
3    treatment  0.851945
4         pitt  0.887686
5          msg  0.925084
6      medical  0.934476
7      disease  1.061232
8       doctor  1.324708
9  information  1.330946
