# NY Times Topic Classification Project

Import libraries

In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import nltk.data
import numpy as np

Fetch data

In [2]:
categories = ['comp.graphics', 'rec.sport.baseball', 'sci.med', 'talk.politics.misc']
data = fetch_20newsgroups(subset='train', categories=categories).data
newsgroups = fetch_20newsgroups(subset='train', categories=categories)

Explore data

In [3]:
type(data), len(data)

(list, 2240)

In [4]:
data[0]

'From: geb@cs.pitt.edu (Gordon Banks)\nSubject: Re: Name of MD\'s eyepiece?\nReply-To: geb@cs.pitt.edu (Gordon Banks)\nOrganization: Univ. of Pittsburgh Computer Science\nLines: 13\n\nIn article <C4IHM2.Gs9@watson.ibm.com> clarke@watson.ibm.com (Ed Clarke) writes:\n>|> |It\'s not an eyepiece.  It is called a head mirror.  All doctors never\n>\n>A speculum?\n\nThe speculum is the little cone that fits on the end of the otoscope.\nThere are also vaginal specula that females and gynecologists are\nall too familiar with.\n-- \n----------------------------------------------------------------------------\nGordon Banks  N3JXP      | "Skepticism is the chastity of the intellect, and\ngeb@cadre.dsl.pitt.edu   |  it is shameful to surrender it too soon." \n----------------------------------------------------------------------------\n'

Import stopwords

In [5]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
from string import punctuation

Import Stemmer and Lemmatizer

In [6]:
from  nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet  import WordNetLemmatizer

Vectorize text

In [7]:
vectorizer  = TfidfVectorizer(stop_words='english', max_features=2000)
vectors = vectorizer.fit_transform(data).toarray()
words = vectorizer.get_feature_names()

In [8]:
vectors.shape

(2240, 2000)

Implement function to get top features

In [9]:
def get_top_values(lst, n, labels):
    return [labels[i] for i in np.argsort(lst)[::-1][:n]]

Define another vectorizer without using idf

In [10]:
vectorizer2 = TfidfVectorizer(use_idf=False, max_features=2000)
vectors2 = vectorizer2.fit_transform(["\n".join(data)]).toarray()
print("top 10 by tf across all corpus")
print(get_top_values(vectors2[0], 10, words))

top 10 by tf across all corpus
['talking', 'thf2', 'ny', 'american', 'ii', 'int', 'talk', 'interesting', 'feel', 'duke']


Load data

In [11]:
data = fetch_20newsgroups(subset='train', categories=categories).data
target = fetch_20newsgroups(subset='train', categories=categories).target

In [13]:
len(data), type(data), target.shape

(2240, list, (2240,))

Identify features and targets

In [14]:
X = vectors
y = target

Transform data

In [15]:
test_data = fetch_20newsgroups(subset='test', categories=categories).data
test_target = fetch_20newsgroups(subset='test', categories=categories).target

In [16]:
y_test = test_target
X_test = vectorizer.transform(test_data)

Build Logistic Regression classifier

In [17]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

model.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [18]:
model.score(X, y)

0.9861607142857143

In [19]:
model.coef_.shape

(4, 2000)

Get the top features

In [20]:
num_category = 0

print(categories[num_category])

get_top_values(model.coef_[num_category], 10, words)

comp.graphics


['graphics',
 'image',
 '3d',
 'files',
 'images',
 'file',
 '3do',
 'windows',
 'points',
 'software']

In [21]:
model.score(X, y)

0.9861607142857143

In [22]:
model.score(X_test, y_test)

0.925603217158177

Naive Bayes Classifier

In [23]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()

model.fit(X, y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [24]:
model.score(X, y)

0.9714285714285714

In [25]:
model.score(X_test, y_test)

0.9175603217158177

Randorm Forest Classifier

In [26]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier

estimator = RandomForestClassifier(n_estimators=200,max_depth=50,min_samples_leaf=3, random_state=1)
model = OneVsRestClassifier(estimator,n_jobs=-1)

model.fit(X, y)

OneVsRestClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=50, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False),
          n_jobs=-1)

In [27]:
model.score(X, y)

0.9924107142857143

In [28]:
model.score(X_test, y_test)

0.925603217158177

Gradient Boosted Trees

In [29]:
from sklearn.ensemble import GradientBoostingClassifier

estimator = GradientBoostingClassifier(n_estimators=100,learning_rate=0.1,max_depth=5,min_samples_leaf=2, random_state=1)
model = OneVsRestClassifier(estimator,n_jobs=-1)

model.fit(X, y)

OneVsRestClassifier(estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=5,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=2, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=1, subsample=1.0, verbose=0,
              warm_start=False),
          n_jobs=-1)

In [30]:
model.score(X, y)

0.9986607142857142

In [31]:
model.score(X_test, y_test)

0.8806970509383378