<a href="https://colab.research.google.com/github/mdkamrulhasan/machine_learning_concepts/blob/master/notebooks/supervised/Classification_documents_nn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

What will we cover today?


1.  Document Classification

 *   Principal Component Analysis
 *   Neural Network Introduction


In [1]:
import numpy as np
import pandas as pd
# Models (Sklearn)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
# Data and Evaluation packages
from sklearn import datasets
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
# visualization
import plotly.express as px
from sklearn.model_selection import train_test_split

# Our Wrapper Class (Can take any model as an input)

In [2]:
from sklearn.model_selection import cross_val_score

class myClassificationModel:
  """Uniform train / evaluate / cross-validate interface around a classifier.

  The wrapped ``model`` only needs the scikit-learn estimator methods used
  below: ``fit(X, y)`` and ``predict(X)``.

  Args:
    model: the estimator to wrap.
    nb_cv_splits: value passed as ``cv`` to ``cross_val_score``.
      Defaults to 3.
    evaluation_metrics: value passed as ``scoring`` to ``cross_val_score``
      (e.g. ``'accuracy'`` or ``'f1'``). Defaults to ``'accuracy'``.
  """

  def __init__(self, model, nb_cv_splits=3, evaluation_metrics='accuracy'):
    self.model = model
    self.nb_cv_splits = nb_cv_splits
    self.evaluation_metrics = evaluation_metrics

  def train(self, X, y):
    """Fit the wrapped model on features ``X`` and labels ``y``."""
    self.model.fit(X, y)

  def evaluate(self, X, y):
    """Return the accuracy of the fitted model's predictions on ``X`` against ``y``.

    Note: this always reports accuracy; ``evaluation_metrics`` only affects
    ``cv_error``.
    """
    y_predict = self.model.predict(X)
    return accuracy_score(y, y_predict)

  def cv_error(self, X, y):
    """Cross-validate the wrapped model and return the ``cross_val_score`` result.

    Despite the method name, the values are scores under
    ``evaluation_metrics`` (higher is better for ``'accuracy'``), not errors.
    """
    return cross_val_score(
        self.model,
        X,
        y,
        scoring=self.evaluation_metrics,
        cv=self.nb_cv_splits,
    )



[Data description](https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html)

In [3]:
# Newsgroups to load: a four-class subset of the 20 Newsgroups corpus.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

In [4]:
from sklearn.datasets import fetch_20newsgroups
# Load the training subset restricted to `categories`, shuffled with a fixed seed.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [5]:
# Sanity check: number of loaded documents and number of source filenames.
len(twenty_train.data), len(twenty_train.filenames)

(2257, 2257)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features: count single tokens (unigrams only) in each document.
count_vect = CountVectorizer(ngram_range=(1, 1))
# Learn the vocabulary and build the document-term count matrix in one pass.
X_train_counts = count_vect.fit_transform(twenty_train.data)
# Show the shape of the resulting matrix.
X_train_counts.shape

(2257, 35788)

In [7]:
# Inspect the first row of the count matrix (the repr shows how sparse it is).
X_train_counts[0]

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 73 stored elements and shape (1, 35788)>

In [8]:
# Feature matrix (token counts) and target labels used by the rest of the notebook.
X, y = X_train_counts, twenty_train.target

In [9]:
# Sum of all counts in row 0 of X (X is still the sparse count matrix here).
np.sum(X[0])

np.int64(98)

Highest-frequency token in a single document

In [10]:
# Find the most frequent token in one document and print it with its count.
query_record = 0  # row index of the document to inspect (e.g. try 100)

# Index the original sparse count matrix rather than `X`: `X` is rebound to a
# dense array further down, which has no `.todense()`, so reading
# `X_train_counts` keeps this cell re-runnable at any point.
x_0_dense = X_train_counts[query_record].todense()

# Column index of the largest count in this row, and the count itself.
max_freq_index = x_0_dense.argmax()
max_freq = x_0_dense[0, max_freq_index]

# get_feature_names_out() is ordered by column index, so it maps the column
# straight back to its token without scanning the vocabulary dict.
print(count_vect.get_feature_names_out()[max_freq_index], max_freq)

the 5


In [11]:
# Densify the count matrix. Deriving from `X_train_counts` (instead of
# `X = X.todense()`) keeps this cell idempotent: it can be re-run after `X`
# has already been rebound to a dense array, which has no `.todense()`.
X = X_train_counts.todense()

In [12]:
# Shape of X after conversion to a dense matrix.
X.shape

(2257, 35788)

## Unique y values (i.e., the labels)

In [13]:
# Distinct values present in the target vector y.
set(y)

{np.int64(0), np.int64(1), np.int64(2), np.int64(3)}

## Are the feature values scaled/normalized?

In [14]:
# Smallest and largest raw feature value in X.
X.min(), X.max()

(np.int64(0), np.int64(589))

## Let's normalize the features.

In [15]:
from sklearn.preprocessing import MinMaxScaler
# Min-max scaling of the features; np.asarray hands the scaler a plain ndarray
# view of X instead of the matrix object produced by `.todense()`.
# NOTE(review): the scaler is fitted on all rows, before any train/test split.
feature_scaler = MinMaxScaler()
X_scaled = feature_scaler.fit_transform(
    np.asarray(X)
)

In [16]:
# Smallest and largest value after scaling.
X_scaled.min(), X_scaled.max()

(np.float64(0.0), np.float64(1.0))

In [17]:
# Rebind X to the scaled features so every later cell that reads X uses them.
# NOTE(review): from here on X no longer holds the raw counts; earlier cells
# that read X will behave differently if re-run after this one.
X = X_scaled

# Importing and fitting the PCA model

In [19]:
# importing PCA module
from sklearn.decomposition import PCA

In [20]:
# Fit PCA, keeping the first 500 principal components.
# random_state is pinned because, for a matrix of this size, PCA's default
# solver selection most likely uses a randomized SVD (worth confirming against
# the PCA documentation); without a seed the components and the explained
# variance printed below can differ between runs.
# NOTE(review): PCA is fitted on all rows of X, i.e. before the train/test
# split performed later, so held-out rows influence the projection.
pca = PCA(n_components=500, random_state=42)
pca.fit(X)

In [21]:
# Perform compression: project X onto the fitted principal components.
X_compressed = pca.transform(X)
# Compare the shapes before and after compression.
X.shape, X_compressed.shape

((2257, 35788), (2257, 500))

In [22]:
# Bar chart of the explained-variance ratio of each retained component.
component_index = np.arange(X_compressed.shape[1])
fig = px.bar(x=component_index, y=pca.explained_variance_ratio_)
fig.show()
# Total fraction of the variance captured by the retained components.
print(pca.explained_variance_ratio_.sum())

0.6670908109317666


In [23]:
# Shapes of the original and the PCA-compressed feature matrices.
X.shape, X_compressed.shape

((2257, 35788), (2257, 500))

In [24]:
# Multi-Layer Perceptron
from sklearn.neural_network import MLPClassifier

In [25]:
# Model repository: short name -> unfitted estimator.
# Estimators with a stochastic fit get a fixed seed so results are
# reproducible across kernel restarts.
model_repo = {
    'lr': LogisticRegression(),
    'rf': RandomForestClassifier(random_state=1),
    'gb': GradientBoostingClassifier(random_state=1),
    'svm': SVC(),
    # `early_stopping=True` was dropped: scikit-learn documents it as only
    # effective for solver='sgd' or 'adam', so with 'lbfgs' it had no effect
    # and wrongly suggested a validation split was being held out.
    'nn': MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(35, 20, 10), random_state=1,
                    max_iter=500)
}

# Model comparison with compressed data

In [26]:
# Hold out 20% of the PCA-compressed data for testing; the seed fixes the split.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_compressed,
    y,
    test_size = 0.20,
    random_state=42,
)

## Support Vector Machines (SVMs)

In [34]:
# Wrap the SVM from the model repository and fit it on the training split.
my_model = myClassificationModel(model_repo['svm'])
my_model.train(X_train, y_train)
# Accuracy on the data the model was fitted on, then on the held-out split.
training_accuracy = my_model.evaluate(X_train, y_train)
test_accuracy = my_model.evaluate(X_test, y_test)
training_accuracy, test_accuracy

(0.9174515235457064, 0.827433628318584)

In [30]:
# Cross-validate the current wrapper (`my_model`, the SVM in a top-to-bottom
# run) on the full compressed dataset and summarise the per-fold scores.
cv_scores = my_model.cv_error(X_compressed, y)
for label, value in (
    ('cross validation scores:', cv_scores),
    ('cross validation score (mean):', np.mean(cv_scores)),
    ('cross validation score (std):', np.std(cv_scores)),
):
  print(label, value)

cross validation scores: [0.85922975 0.83244681 0.83909574]
cross validation score (mean): 0.8435907669558174
cross validation score (std): 0.01138669915655106


# Feed Forward Neural Networks (NNs)

In [32]:
# Wrap the multi-layer perceptron from the model repository and fit it on the training split.
my_model = myClassificationModel(model_repo['nn'])
my_model.train(X_train, y_train)
# Accuracy on the data the model was fitted on, then on the held-out split.
training_accuracy = my_model.evaluate(X_train, y_train)
test_accuracy = my_model.evaluate(X_test, y_test)
training_accuracy, test_accuracy

(1.0, 0.9269911504424779)

In [33]:
# Cross-validate the current wrapper (`my_model`, the neural network in a
# top-to-bottom run) on the full compressed dataset and report the fold scores.
cv_scores = my_model.cv_error(X_compressed, y)
cv_mean = np.mean(cv_scores)
cv_std = np.std(cv_scores)
print('cross validation scores:', cv_scores)
print('cross validation score (mean):', cv_mean)
print('cross validation score (std):', cv_std)

cross validation scores: [0.93359894 0.91489362 0.92952128]
cross validation score (mean): 0.9260046104000076
cross validation score (std): 0.008031083786488619
