# News classification

## Imports

In [1]:
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

import joblib

## Data loading & processing

In [2]:
train_data = datasets.fetch_20newsgroups(subset='train')

In [3]:
test_data = datasets.fetch_20newsgroups(subset='test')

In [4]:
test_data.data[0]

'From: v064mb9k@ubvmsd.cc.buffalo.edu (NEIL B. GANDLER)\nSubject: Need info on 88-89 Bonneville\nOrganization: University at Buffalo\nLines: 10\nNews-Software: VAX/VMS VNEWS 1.41\nNntp-Posting-Host: ubvmsd.cc.buffalo.edu\n\n\n I am a little confused on all of the models of the 88-89 bonnevilles.\nI have heard of the LE SE LSE SSE SSEI. Could someone tell me the\ndifferences are far as features or performance. I am also curious to\nknow what the book value is for prefereably the 89 model. And how much\nless than book value can you usually get them for. In other words how\nmuch are they in demand this time of year. I have heard that the mid-spring\nearly summer is the best time to buy.\n\n\t\t\tNeil Gandler\n'

In [5]:
vectorizer = CountVectorizer(min_df=100)

vectorized_train_data = vectorizer.fit_transform(train_data.data)
vectorized_test_data = vectorizer.transform(test_data.data)

In [6]:
test_data.target

array([ 7,  5,  0, ...,  9,  6, 15])

In [7]:
test_data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

## Model training

In [8]:
model = RandomForestClassifier(n_estimators=100)
model.fit(vectorized_train_data, train_data.target)

RandomForestClassifier()

In [9]:
train_preds = model.predict(vectorized_train_data)
test_preds = model.predict(vectorized_test_data)

In [10]:
accuracy_score(train_data.target, train_preds)

0.9999116139296447

In [11]:
accuracy_score(test_data.target, test_preds)

0.7067180031864047

In [12]:
test_obj = 'I heard that python programming language now is very popular technology for scientific projects'
vectorized_test_obj = vectorizer.transform([test_obj])

In [13]:
 model.predict(vectorized_test_obj)[0]

1

In [14]:
label =  model.predict(vectorized_test_obj)[0]
test_data.target_names[label]

'comp.graphics'

## Dump vectorizer & model

In [15]:
with open('news_vectorizer_dump.pkl', 'wb') as output_file:
    joblib.dump(vectorizer, output_file)

with open('news_model_dump.pkl', 'wb') as output_file:
    joblib.dump(model, output_file)