In [None]:
# import graphs libraries
from ggplot import *
import matplotlib.pyplot as plt
%matplotlib inline
# import data and ROC function libraries
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import roc_curve
# import matrix transform library
from sklearn.feature_extraction.text import HashingVectorizer
# import all classifier algorithm libraries
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import pandas as pd

In [None]:
# Load the full 20 newsgroups corpus using the dataset's predefined
# 'train' and 'test' subsets (about 60% training / 40% validation).
data_train = fetch_20newsgroups(subset='train',  # categories=categories,
                                shuffle=True, random_state=42)
data_val = fetch_20newsgroups(subset='test',  # categories=categories,
                              shuffle=True, random_state=42)
dim_train = len(data_train.target)  # sample training size
dim_val = len(data_val.target)  # sample validation size

# Targets are integer category ids starting at 0, so use one unit-wide bin per
# category: edges 0, 1, ..., n_categories. The previous edge list started at 1
# (dropping category 0) and repeated the edge 6 (creating a zero-width bin).
n_categories = len(data_train.target_names)
category_bins = list(range(n_categories + 1))

plt.figure(figsize=[5, 5])
plt.hist(data_train.target, bins=category_bins,
         facecolor='grey', alpha=0.5)  # plot the training set by category
plt.title("Traing Set")
plt.xlabel("Categories")
plt.ylabel("Frequency")
plt.show()

In [None]:
# Turn the raw documents into a hashed bag-of-words matrix.
vectorizer = HashingVectorizer(stop_words='english', non_negative=True, n_features=10000)
X_train = vectorizer.fit_transform(data_train.data)
X_val = vectorizer.transform(data_val.data)
# Binary problem: "is this document in category 0?" versus every other category.
y_train = data_train.target == 0
y_val = data_val.target == 0

# Fit the Naive Bayes model (sparse matrices are densified before fitting).
clf = MultinomialNB()
clf.fit(X_train.todense(), y_train)

# ROC curve from the predicted probability of the positive class (column 1).
probs = clf.predict_proba(X_val.todense())[:, 1]
FalsePositive, TruePositive, thresh = roc_curve(y_val, probs)
name = "TodaBase"
# The column must be called "name": the plot below maps color='name', and the
# later cells stack their own frames (also keyed "name") onto this one.
results = pd.DataFrame({
        "name": name,
        "FalsePositive": FalsePositive,
        "TruePositive": TruePositive
    })

ggplot(aes(x='FalsePositive', y='TruePositive', color='name'), data=results) + \
    geom_step(size=3) + \
    geom_abline(color="black") + \
    ggtitle("Text Classification: ROC 1")
    

In [None]:
# Select similar categories
categories = [
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x'
]
# Load only the selected categories from the predefined 'train' and 'test' subsets.
data_train2 = fetch_20newsgroups(subset='train', categories=categories,
                                 shuffle=True, random_state=42)
data_val2 = fetch_20newsgroups(subset='test', categories=categories,
                               shuffle=True, random_state=42)
# Sizes of THIS experiment's samples (previously measured on the full dataset).
dim_train2 = len(data_train2.target)  # sample training size
dim_val2 = len(data_val2.target)  # sample validation size

# Turn the raw documents into a hashed bag-of-words matrix.
vectorizer = HashingVectorizer(stop_words='english', non_negative=True, n_features=10000)
X_train2 = vectorizer.fit_transform(data_train2.data)
X_val2 = vectorizer.transform(data_val2.data)
# Binary problem: category 0 of the selected subset versus the rest.
y_train2 = data_train2.target == 0
y_val2 = data_val2.target == 0

# Fit the Naive Bayes model (sparse matrices are densified before fitting).
clf = MultinomialNB()
clf.fit(X_train2.todense(), y_train2)

# ROC curve from the predicted probability of the positive class (column 1).
probs2 = clf.predict_proba(X_val2.todense())[:, 1]
FalsePositive, TruePositive, thresh = roc_curve(y_val2, probs2)
name = "SimilarCat"
results2 = pd.DataFrame({
        "name": name,
        "FalsePositive": FalsePositive,
        "TruePositive": TruePositive
    })
# Stack the first experiment's curve with this one. pd.concat builds a new
# frame from its inputs, so re-running this cell gives the same result.
new_results = pd.concat([results, results2])
ggplot(aes(x='FalsePositive', y='TruePositive', color='name'), data=new_results) + \
    geom_step(size=3) + \
    geom_abline(color="black") + \
    ggtitle("Text Classification: ROC 2")


In [None]:
# Select different categories
categories = [
    'rec.motorcycles',
    'sci.space'
]
# Load only the selected categories from the predefined 'train' and 'test' subsets.
data_train3 = fetch_20newsgroups(subset='train', categories=categories,
                                 shuffle=True, random_state=42)
data_val3 = fetch_20newsgroups(subset='test', categories=categories,
                               shuffle=True, random_state=42)
# Sizes of THIS experiment's samples (previously measured on the full dataset).
dim_train3 = len(data_train3.target)  # sample training size
dim_val3 = len(data_val3.target)  # sample validation size

# Turn the raw documents into a hashed bag-of-words matrix.
vectorizer = HashingVectorizer(stop_words='english', non_negative=True, n_features=10000)
X_train3 = vectorizer.fit_transform(data_train3.data)
X_val3 = vectorizer.transform(data_val3.data)
# Binary problem: category 0 of the selected subset versus the rest.
y_train3 = data_train3.target == 0
y_val3 = data_val3.target == 0

# Fit the Naive Bayes model (sparse matrices are densified before fitting).
clf = MultinomialNB()
clf.fit(X_train3.todense(), y_train3)

# ROC curve from the predicted probability of the positive class (column 1).
probs3 = clf.predict_proba(X_val3.todense())[:, 1]
FalsePositive, TruePositive, thresh = roc_curve(y_val3, probs3)
name = "DiffCat"
results3 = pd.DataFrame({
        "name": name,
        "FalsePositive": FalsePositive,
        "TruePositive": TruePositive
    })
# Rebuild the combined frame from the three per-experiment frames rather than
# appending to new_results in place: appending to itself duplicated the
# "DiffCat" rows every time this cell was re-run.
new_results = pd.concat([results, results2, results3])
ggplot(aes(x='FalsePositive', y='TruePositive', color='name'), data=new_results) + \
    geom_step(size=3) + \
    geom_abline(color="black") + \
    ggtitle("Text Classification: ROC 3")


In [None]:
# Categories used for the model comparison
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space'
]
# Load only the selected categories from the predefined 'train' and 'test' subsets.
data_train4 = fetch_20newsgroups(subset='train', categories=categories,
                                 shuffle=True, random_state=42)
data_val4 = fetch_20newsgroups(subset='test', categories=categories,
                               shuffle=True, random_state=42)
# Sizes of THIS experiment's samples (previously measured on the full dataset).
dim_train4 = len(data_train4.target)  # sample training size
dim_val4 = len(data_val4.target)  # sample validation size

# Turn the raw documents into a hashed bag-of-words matrix.
vectorizer = HashingVectorizer(stop_words='english', non_negative=True, n_features=10000)
X_train4 = vectorizer.fit_transform(data_train4.data)
X_val4 = vectorizer.transform(data_val4.data)
# Binary problem: category 0 of the selected subset versus the rest.
y_train4 = data_train4.target == 0
y_val4 = data_val4.target == 0

# Models to compare. The random forest and the probability-calibrated SVM are
# stochastic, so they are seeded to make the ROC curves reproducible.
clfs = [
    ("MultinomialNB", MultinomialNB()),
    ("KNeighborsClassifier", KNeighborsClassifier()),
    ("RandomForestClassifier", RandomForestClassifier(random_state=42)),
    ("SVM", SVC(probability=True, random_state=42))
]

# Fit each model, compute its ROC curve, and collect one frame per model.
# The frames are concatenated once at the end instead of growing a DataFrame
# inside the loop.
roc_frames = []
for name, clf in clfs:
    clf.fit(X_train4.todense(), y_train4)
    # Probability of the positive class (column 1).
    probs = clf.predict_proba(X_val4.todense())[:, 1]
    FalsePositive, TruePositive, thresh = roc_curve(y_val4, probs)
    results4 = pd.DataFrame({
        "name": name,
        "FalsePositive": FalsePositive,
        "TruePositive": TruePositive
    })
    roc_frames.append(results4)
all_results = pd.concat(roc_frames)

In [None]:
# Overlay the ROC curves of all compared models, one colour per model name.
plot = (
    ggplot(aes(x='FalsePositive', y='TruePositive', color='name'), data=all_results)
    + geom_step(size=3)
    + geom_abline(color="black")
    + ggtitle("Text Classification: ROC 4 \\ green=NB|purple=RF|red=KN|blue=SVM")
)
# Call form of print works on both Python 2 and Python 3; the bare
# `print plot` statement is a SyntaxError on Python 3.
print(plot)