# NY Times Topic Classification Project

Import libraries

In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import nltk.data
import numpy as np

Fetch data

In [2]:
categories = ['comp.graphics', 'rec.sport.baseball', 'sci.med', 'talk.politics.misc']
data = fetch_20newsgroups(subset='train', categories=categories).data
newsgroups = fetch_20newsgroups(subset='train', categories=categories)

Explore data

In [3]:
type(data), len(data)

(list, 2240)

In [4]:
data[0]

'From: geb@cs.pitt.edu (Gordon Banks)\nSubject: Re: Name of MD\'s eyepiece?\nReply-To: geb@cs.pitt.edu (Gordon Banks)\nOrganization: Univ. of Pittsburgh Computer Science\nLines: 13\n\nIn article <C4IHM2.Gs9@watson.ibm.com> clarke@watson.ibm.com (Ed Clarke) writes:\n>|> |It\'s not an eyepiece.  It is called a head mirror.  All doctors never\n>\n>A speculum?\n\nThe speculum is the little cone that fits on the end of the otoscope.\nThere are also vaginal specula that females and gynecologists are\nall too familiar with.\n-- \n----------------------------------------------------------------------------\nGordon Banks  N3JXP      | "Skepticism is the chastity of the intellect, and\ngeb@cadre.dsl.pitt.edu   |  it is shameful to surrender it too soon." \n----------------------------------------------------------------------------\n'

Import stopwords

In [5]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
from string import punctuation

Import Stemmer and Lemmatizer

In [6]:
from  nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet  import WordNetLemmatizer

Vectorize text

In [7]:
vectorizer  = TfidfVectorizer(stop_words='english', max_features=2000)
vectors = vectorizer.fit_transform(data).toarray()
words = vectorizer.get_feature_names()

In [8]:
vectors.shape

(2240, 2000)

Implement function to get top features

In [9]:
def get_top_values(lst, n, labels):
    return [labels[i] for i in np.argsort(lst)[::-1][:n]]

Define another vectorizer without using idf

In [10]:
vectorizer2 = TfidfVectorizer(use_idf=False, max_features=2000)
vectors2 = vectorizer2.fit_transform(["\n".join(data)]).toarray()
print("top 10 by tf across all corpus")
print(get_top_values(vectors2[0], 10, words))

top 10 by tf across all corpus
['talking', 'thf2', 'ny', 'american', 'ii', 'int', 'talk', 'interesting', 'feel', 'duke']


Load data

In [11]:
data = fetch_20newsgroups(subset='train', categories=categories).data
target = fetch_20newsgroups(subset='train', categories=categories).target

In [12]:
len(data), type(data), target.shape

(2240, list, (2240,))

Identify features and targets

In [13]:
X = vectors
y = target

Transform data

In [14]:
test_data = fetch_20newsgroups(subset='test', categories=categories).data
test_target = fetch_20newsgroups(subset='test', categories=categories).target

In [15]:
y_test = test_target
X_test = vectorizer.transform(test_data)

Build Logistic Regression classifier

In [16]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

model.fit(X, y)

print('Training Accuracy: ', model.score(X, y))
print('Testing Accuracy: ', model.score(X_test, y_test))

Training Accuracy:  0.9861607142857143
Testing Accuracy:  0.925603217158177


Get the top features

In [17]:
num_category = 0

print(categories[num_category])

get_top_values(model.coef_[num_category], 10, words)

comp.graphics


['graphics',
 'image',
 '3d',
 'files',
 'images',
 'file',
 '3do',
 'windows',
 'points',
 'software']

Naive Bayes Classifier

In [20]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()

model.fit(X, y)

print('Training Accuracy: ', model.score(X, y))
print('Testing Accuracy: ', model.score(X_test, y_test))

Training Accuracy:  0.9714285714285714
Testing Accuracy:  0.9175603217158177


Randorm Forest Classifier

In [21]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier

estimator = RandomForestClassifier(n_estimators=200,max_depth=50,min_samples_leaf=3, random_state=1)
model = OneVsRestClassifier(estimator,n_jobs=-1)

model.fit(X, y)

print('Training Accuracy: ', model.score(X, y))
print('Testing Accuracy: ', model.score(X_test, y_test))

Training Accuracy:  0.9924107142857143
Testing Accuracy:  0.925603217158177


Gradient Boosted Trees

In [22]:
from sklearn.ensemble import GradientBoostingClassifier

estimator = GradientBoostingClassifier(n_estimators=100,learning_rate=0.1,max_depth=5,min_samples_leaf=2, random_state=1)
model = OneVsRestClassifier(estimator,n_jobs=-1)

model.fit(X, y)

print('Training Accuracy: ', model.score(X, y))
print('Testing Accuracy: ', model.score(X_test, y_test))

Training Accuracy:  0.9986607142857142
Testing Accuracy:  0.8806970509383378
