In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
    
# import dataset
df = pd.read_csv("./wikipedia_300/wikipedia_300.csv")

# test_size default is 25% (0.25)
X_train, X_test, y_train, y_test = train_test_split(df['Text'], df['Category'], random_state = 0)

#1. Naive Bayes with no term frequency or inverse document frequency adjustments

# Extract word counts as features and vectorize

from sklearn.feature_extraction.text import CountVectorizer
# min_df=1 means ignore words with word count less than one
count_vect = CountVectorizer(min_df=1)
X_train_counts = count_vect.fit_transform(X_train)

clf = MultinomialNB().fit(X_train_counts, y_train)
clf.score(X_train_counts, y_train)
#CountVectorizer?
#print(vectorizer.get_feature_names())

#print(X_train_counts.toarray())  # doctest: +NORMALIZE_WHITESPACE



0.9866666666666667

In [30]:
#2. Naive Bayes with term frequency and inverse document frequency adjustments

X_train, X_test, y_train, y_test = train_test_split(df['Text'], df['Category'], random_state = 0)

X_train_counts = count_vect.fit_transform(X_train)
print(X_train_counts.shape)

from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_tfidf.shape)

clf_tfidf = MultinomialNB().fit(X_train_tfidf, y_train)
clf_tfidf.score(X_train_tfidf, y_train)


(225, 43017)
(225, 43017)


0.9911111111111112

In [38]:
#3. Making a pipeline to make things easier to work with
from sklearn.pipeline import Pipeline
text_clf = Pipeline([
     ('vect', CountVectorizer()),
     ('tfidf', TfidfTransformer()),
     ('clf', MultinomialNB()),
])

text_clf.fit(X_train, y_train) 

import numpy as np
#twenty_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)
docs_test = X_test
predicted = text_clf.predict(docs_test)
np.mean(predicted == y_test)   
#docs_test = X_train
#predicted = text_clf.predict(docs_test)
#np.mean(predicted == y_train)   


0.9466666666666667

In [40]:

print(clf.predict(count_vect.transform([df['Text'][150]])))
#print([df['Text'][1]])
print(df['Category'][150])
#print(y_test)
# X_test: 208, 188, 12 (Programming)
print(clf.predict(count_vect.transform([df['Text'][188]])))
print(df['Category'][188])

['Games']
Games
['Games']
Programming


In [53]:
#4. LinearSVC with no tf.idf


X_train, X_test, y_train, y_test = train_test_split(df['Text'], df['Category'], random_state = 0)

X_train_counts = count_vect.fit_transform(X_train)
X_test_counts = count_vect.transform(X_test)
from sklearn.svm import LinearSVC

LinearClf = LinearSVC(random_state=0, tol=1e-5)
LinearClf.fit(X_train_counts, y_train)

#print(LinearClf.coef_)
#print(LinearClf.intercept_)
print(LinearClf.predict(count_vect.transform([df['Text'][188]])))
#print(LinearClf.predict(vectorizer.transform([df['Text'][0]])))
#print(count_vect.transform([df['Text'][188]]))
LinearClf.score(X_train_counts, y_train)
LinearClf.score(X_test_counts, y_test)



['Programming']


0.9066666666666666

In [74]:
# also evaluate accuracy using 10-fold cross validation
# Here I should send in ALL values, not just train or test??
#KFold?
from sklearn.model_selection import KFold, cross_val_score
k_fold = KFold(n_splits=10, shuffle=False, random_state=None)
#clf = <any classifier>
X_all_counts = count_vect.transform(df['Text'])

print(cross_val_score(LinearClf, X_all_counts, df['Category'], cv=k_fold, n_jobs=1))
#print(LinearClf.score(X_all_counts, df['Category']))



[0.93333333 0.93333333 0.93333333 0.86666667 0.86666667 0.93333333
 0.86666667 0.96666667 0.9        0.93333333]


In [72]:
#4. LinearSVC with tf.idf

X_train, X_test, y_train, y_test = train_test_split(df['Text'], df['Category'], random_state = 0)

X_train_counts = count_vect.fit_transform(X_train)

from sklearn.svm import LinearSVC

from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

LinearClf2 = LinearSVC(random_state=0, tol=1e-5)
LinearClf2 = LinearSVC().fit(X_train_tfidf, y_train)
LinearClf2.score(X_train_tfidf, y_train)

k_fold = KFold(n_splits=10, shuffle=False, random_state=None)
#clf = <any classifier>

X_all_counts = count_vect.transform(df['Text'])

print(cross_val_score(LinearClf2, X_all_counts, df['Category'], cv=k_fold, n_jobs=1))
# I should get different scores here, because I'm using TF.IDF
# The model LinearClf2 should be different than the earlier

print(LinearClf2.score(X_all_counts, df['Category']))


[0.93333333 0.93333333 0.93333333 0.86666667 0.86666667 0.93333333
 0.86666667 0.96666667 0.9        0.93333333]
0.9766666666666667
