In [1]:
import os
import random
import pandas as pd
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
import numpy as np

In [2]:
# Entries of the training directory; read_data() uses these sub-folder names
# as class labels. os.listdir() returns entries in arbitrary order, so sort
# them to get the same ordering on every platform and run.
target_names = sorted(os.listdir('train'))

In [3]:
def read_data(folder_name):
    """Load a folder-per-class text corpus into a DataFrame.

    ``folder_name`` is scanned for sub-directories; every regular file found
    directly inside a sub-directory is read in full as one document and
    labelled with that sub-directory's name.

    Parameters
    ----------
    folder_name : str
        Path of the corpus root directory.

    Returns
    -------
    pandas.DataFrame
        One row per file read, with columns ``text_data`` (the file's
        contents) and ``target_label`` (name of the containing
        sub-directory). Rows are ordered by sub-directory name, then by
        file name.
    """
    row_data = []
    target_label = []
    # os.listdir() yields entries in arbitrary order; sorting makes the row
    # order (and hence each document's integer index) reproducible.
    for f_outter in sorted(os.listdir(folder_name)):
        outter_name = os.path.join(folder_name, f_outter)
        if not os.path.isdir(outter_name):
            # A stray file at the root is not a class folder; skip it
            # instead of crashing in the inner listdir().
            continue
        for f_inner in sorted(os.listdir(outter_name)):
            inner_name = os.path.join(outter_name, f_inner)
            if not os.path.isfile(inner_name):
                continue
            # NOTE(review): files are decoded with the platform default
            # encoding -- confirm that matches the corpus.
            # The context manager closes each handle promptly; the corpus
            # can contain thousands of files.
            with open(inner_name) as file_to_read:
                row_data.append(file_to_read.read())
            target_label.append(f_outter)
    data = pd.DataFrame(
        {'text_data': row_data,
         'target_label': target_label})
    return data

In [4]:
# Load the training corpus and shuffle its rows. The shuffle is seeded so the
# row order is identical after every kernel restart; shuffle() keeps each
# row's original index label.
train_data = shuffle(read_data('train'), random_state=42)
train_data.head()

Unnamed: 0,text_data,target_label
11158,From: pharvey@quack.kfu.com (Paul Harvey)\nSub...,talk.religion.misc
4514,From: firman@envmsa.eas.asu.edu (B B S)\nSubje...,rec.autos
10955,From: bobsarv@microsoft.com (Bob Sarver)\nSubj...,talk.religion.misc
4702,From: howp@skyfox\nSubject: Thoughts on a 1982...,rec.motorcycles
3706,From: HO@kcgl1.eng.ohio-state.edu (Francis Ho)...,misc.forsale


In [5]:
# Split the training frame into documents (model input) and labels (target).
x_train, y_train = train_data['text_data'], train_data['target_label']

In [6]:
# Show the first five lines of the training document whose index label is 0.
# This is a label lookup: after shuffling it is not necessarily the first row.
train_preview = train_data.loc[0, 'text_data'].split("\n")[:5]
print("\n".join(train_preview))

From: mathew <mathew@mantis.co.uk>
Subject: Alt.Atheism FAQ: Atheist Resources
Summary: Books, addresses, music -- anything related to atheism
Keywords: FAQ, atheism, books, music, fiction, addresses, contacts
Expires: Thu, 29 Apr 1993 11:57:19 GMT


In [7]:
# Load the test corpus and shuffle its rows. The shuffle is seeded so the
# row order is identical after every kernel restart; shuffle() keeps each
# row's original index label.
test_data = shuffle(read_data('test'), random_state=42)
test_data.head()

Unnamed: 0,text_data,target_label
7165,From: koresh@dead.davidian\nSubject: Gun Contr...,talk.politics.misc
1055,From: kepley@photon.phys.unca.edu (Brad Kepley...,comp.os.ms-windows.misc
3072,From: randy@megatek.com (Randy Davis)\nSubject...,rec.motorcycles
2555,From: goldsman@cc.gatech.edu (Michael G. Golds...,misc.forsale
5468,From: PHARABOD@FRCPN11.IN2P3.FR\nSubject: Fran...,sci.space


In [8]:
# Split the test frame into documents (model input) and labels (target).
x_test, y_test = test_data['text_data'], test_data['target_label']

In [9]:
# Show the first five lines of the test document whose index label is 0.
# This is a label lookup: after shuffling it is not necessarily the first row.
test_preview = test_data.loc[0, 'text_data'].split("\n")[:5]
print("\n".join(test_preview))

From: decay@cbnewsj.cb.att.com (dean.kaflowitz)
Subject: Re: about the bible quiz answers
Organization: AT&T
Distribution: na
Lines: 18


In [10]:
# Learn the vocabulary from the training documents and encode each document
# as a row of token counts (one column per vocabulary term).
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(x_train)
# Display (number of documents, vocabulary size).
X_train_counts.shape

(11314, 130107)

In [11]:
# Re-weight the raw token counts into tf-idf features; the matrix keeps the
# same shape as the count matrix it is computed from.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(11314, 130107)

In [12]:
# Train a multinomial naive Bayes classifier on the tf-idf features.
clf = MultinomialNB()
clf = clf.fit(X_train_tfidf, y_train)

In [13]:
from sklearn.pipeline import Pipeline

# Chain count vectorizer -> tf-idf -> naive Bayes into a single estimator so
# raw text can be passed straight to fit() and predict().
nb_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(steps=nb_steps)

text_clf = text_clf.fit(train_data['text_data'], train_data['target_label'])

In [14]:
# Accuracy of the naive Bayes pipeline on the test documents: the fraction
# of predictions that equal the true label.
predicted = text_clf.predict(x_test)
nb_hits = predicted == y_test
np.mean(nb_hits)

0.7738980350504514

In [15]:
from sklearn.linear_model import SGDClassifier

# Same feature pipeline, but with a linear SVM (hinge loss, L2 penalty)
# trained by stochastic gradient descent.
# `n_iter` was deprecated in scikit-learn 0.19 and removed in 0.21, so passing
# it raises TypeError on current releases. `max_iter=5` with `tol=None`
# (no early stopping) runs the same five passes over the data.
text_clf_svm = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                              max_iter=5, tol=None, random_state=42)),
])

text_clf_svm = text_clf_svm.fit(x_train, y_train)
predicted_svm = text_clf_svm.predict(x_test)
# Test accuracy: fraction of predictions that equal the true label.
np.mean(predicted_svm == y_test)



0.82434944237918217

In [16]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the naive Bayes pipeline. Each key has the form
# '<pipeline step name>__<parameter name>'.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}

In [18]:
# Cross-validated search over every combination in `parameters`;
# n_jobs=-1 asks for all available CPU cores.
gs_clf = GridSearchCV(estimator=text_clf, param_grid=parameters, n_jobs=-1)
gs_clf = gs_clf.fit(x_train, y_train)

In [19]:
# Best cross-validated score found by the grid search, followed by the
# parameter combination that produced it.
best_cv_score = gs_clf.best_score_
print(best_cv_score)
gs_clf.best_params_

0.909669436097


{'clf__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}