<a href="https://colab.research.google.com/github/mdkamrulhasan/gvsu_machine_learning/blob/main/notebooks/Classification_documents_nn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

What will we cover today?


1.  Document Classification

 *   Principal Component Analysis
 *   Neural Network Introduction


In [2]:
import numpy as np
import pandas as pd
# Models (Sklearn)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
# Data and Evaluation packages
from sklearn import datasets
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
# visualization
import plotly.express as px
from sklearn.model_selection import train_test_split

# Our Wrapper Class (Can take any model as an input)

In [3]:
from sklearn.model_selection import cross_val_score

class myClassificationModel:
  """Wrapper that gives a classifier a uniform train / evaluate / cross-validate API.

  Parameters
  ----------
  model : object
      Estimator exposing ``fit(X, y)`` and ``predict(X)``. ``cv_error`` passes it
      to ``cross_val_score``, so it must also be acceptable to that function.
  nb_cv_splits : int, default 3
      Forwarded as ``cv`` to ``cross_val_score``.
  evaluation_metrics : str, default 'accuracy'
      Scoring name used by both ``evaluate`` and ``cv_error``
      (for example 'accuracy' or 'f1_macro').
  """

  def __init__(self, model, nb_cv_splits=3, evaluation_metrics='accuracy'):
    self.model = model
    self.nb_cv_splits = nb_cv_splits
    self.evaluation_metrics = evaluation_metrics

  def train(self, X, y):
    """Fit the wrapped model on features ``X`` and labels ``y``."""
    self.model.fit(X, y)

  def evaluate(self, X, y):
    """Score the wrapped (already fitted) model on ``X`` / ``y``.

    The score follows ``self.evaluation_metrics`` so that it is comparable with
    the values returned by ``cv_error``.
    """
    if self.evaluation_metrics == 'accuracy':
      # Default metric: computed straight from the predictions.
      y_predict = self.model.predict(X)
      return accuracy_score(y, y_predict)
    # Any other metric name is resolved through scikit-learn's scorer registry,
    # the same names that `cross_val_score(scoring=...)` understands.
    from sklearn.metrics import get_scorer
    return get_scorer(self.evaluation_metrics)(self.model, X, y)

  def cv_error(self, X, y):
    """Return the array of per-fold cross-validation scores for the wrapped model.

    Despite the method name, the values are scores for
    ``self.evaluation_metrics`` (higher is better for 'accuracy'), not errors.
    """
    return cross_val_score(self.model,
                           X,
                           y, scoring=self.evaluation_metrics,
                           cv=self.nb_cv_splits)



[Data description](https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html)

In [4]:
# Newsgroup names used to restrict the dataset download; one class per entry.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

In [5]:
from sklearn.datasets import fetch_20newsgroups

# Training subset only, limited to `categories`; shuffled with a fixed seed so
# the document order is the same on every run.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [6]:
# Number of documents and number of file names in the fetched training subset.
len(twenty_train.data), len(twenty_train.filenames)

(2257, 2257)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words on single tokens (unigrams): learn the vocabulary and build the
# document-by-token count matrix in one call.
count_vect = CountVectorizer(
    ngram_range=(1, 1),
)
X_train_counts = count_vect.fit_transform(twenty_train.data)

# (number of documents, vocabulary size)
X_train_counts.shape

(2257, 35788)

In [8]:
# Token-count row of the first document (displayed as a sparse 1 x vocabulary matrix).
X_train_counts[0]

<1x35788 sparse matrix of type '<class 'numpy.int64'>'
	with 73 stored elements in Compressed Sparse Row format>

In [18]:
# Short aliases used by the rest of the notebook:
# X is the sparse token-count matrix, y the per-document label array.
X = X_train_counts
y = twenty_train.target

In [19]:
# Sum of all token counts in the first document.
X[0].sum()

98

Highest frequency tokens

In [20]:
# Print the most frequent token of one document together with its count.
query_record = 0  # row of the document to inspect (100 is another one to try)

# Read from `X_train_counts` rather than `X`: later cells rebind `X` to dense
# arrays that have no `.todense()`, which made this cell fail when re-run.
x_0_dense = X_train_counts[query_record].todense()

# Column of the largest count, computed once and reused for the lookup below.
max_freq_index = x_0_dense.argmax()
max_freq = x_0_dense[0, max_freq_index]

# `vocabulary_` maps token -> column index; stop at the token owning that column.
for token, column in count_vect.vocabulary_.items():
  if column == max_freq_index:
    print(token, max_freq)
    break

the 5


In [21]:
# Densify from the original sparse counts instead of from `X` itself, so this
# cell can be re-run safely, and use `toarray()` to get a plain ndarray rather
# than the `np.matrix` returned by `todense()`.
X = X_train_counts.toarray()

In [22]:
# Shape check after densifying: still (documents, vocabulary size).
X.shape

(2257, 35788)

## Unique y values (i.e., the labels)

In [23]:
# Distinct label values present in y.
set(y)

{0, 1, 2, 3}

## Are the feature values scaled/normalized?

In [24]:
# Smallest and largest entry of the feature matrix, to see whether scaling is needed.
X.min(), X.max()

(0, 589)

## Let's normalize the features.

In [25]:
from sklearn.preprocessing import MinMaxScaler
# Rescale the features with a min-max scaler fitted on the whole matrix.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# made further down, so held-out rows influence the scaling - consider fitting
# on the training rows only.
feature_scaler = MinMaxScaler()
# np.asarray guarantees the scaler receives a plain ndarray.
X_scaled = feature_scaler.fit_transform(np.asarray(X))

In [26]:
# Range check after scaling.
X_scaled.min(), X_scaled.max()

(0.0, 1.0)

In [27]:
# Make sure X is scaled: from here on the name `X` refers to the scaled features.
X = X_scaled

# Importing and fitting the PCA model

In [28]:
# importing PCA module
from sklearn.decomposition import PCA

In [29]:
# fitting the PCA model (keep the first 500 principal components)
# With the default `svd_solver='auto'`, scikit-learn may choose a randomized
# solver for a matrix of this size; fixing `random_state` keeps the components,
# and everything computed from them, identical from run to run.
pca = PCA(n_components=500, random_state=42)
pca.fit(X)

In [30]:
# perform compression: project the features onto the fitted principal components
X_compressed = pca.transform(X)
# shapes before and after the projection
X.shape, X_compressed.shape

((2257, 35788), (2257, 500))

In [31]:
# plotting the per component variances
# Title and axis labels are set so the figure can be read on its own.
fig = px.bar(
    x=np.arange(X_compressed.shape[1]),
    y=pca.explained_variance_ratio_,
    labels={'x': 'principal component index', 'y': 'explained variance ratio'},
    title='Explained variance ratio per principal component',
)
fig.show()
# total fraction of the variance captured by the retained components
print(np.sum(pca.explained_variance_ratio_))

0.6671029509183745


In [32]:
# Shape recap: full feature matrix vs. PCA-compressed matrix.
X.shape, X_compressed.shape

((2257, 35788), (2257, 500))

# Model testing with compressed (PCA) data

In [69]:
# Multi-Layer Perceptron
from sklearn.neural_network import MLPClassifier

In [47]:
# models dictionary: short key -> unfitted estimator
model_repo = {
    'lr': LogisticRegression(),
    # Fixed seeds so the tree ensembles give the same result on every run
    # (same seed value as the 'nn' entry below).
    'rf': RandomForestClassifier(random_state=1),
    'gb': GradientBoostingClassifier(random_state=1),
    'svm': SVC(),
    # `early_stopping` is intentionally not set: per the scikit-learn docs it
    # only applies to the 'sgd' and 'adam' solvers, not to 'lbfgs'.
    'nn': MLPClassifier(solver='lbfgs', alpha=1e-5,
                        hidden_layer_sizes=(35, 20, 10), random_state=1,
                        max_iter=500)
}

In [54]:
# Data splits: hold out 20% of the PCA-compressed rows for testing (fixed seed)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_compressed,
    y,
    test_size=0.20,
    random_state=42,
)

In [73]:
# testing a model type: wrap the estimator stored under the 'svm' key
# (swap the key to try another entry of model_repo)
my_model = myClassificationModel(model_repo['svm'])

In [74]:
# fitting the model on the training split only
my_model.train(X_train, y_train)

In [75]:
# Train, test accuracy: score the fitted model on each split in turn
training_accuracy = my_model.evaluate(X_train, y_train)
test_accuracy = my_model.evaluate(X_test, y_test)
training_accuracy, test_accuracy

(0.9213296398891967, 0.831858407079646)

# CV with compressed data

In [52]:
# Cross-validate on the PCA-compressed features, then print the per-fold
# scores followed by their mean and standard deviation.
cv_scores = my_model.cv_error(X_compressed, y)
for label, value in (
    ('cross validation scores:', cv_scores),
    ('cross validation score (mean):', np.mean(cv_scores)),
    ('cross validation score (std):', np.std(cv_scores)),
):
  print(label, value)

cross validation scores: [0.92563081 0.93351064 0.91489362]
cross validation score (mean): 0.9246783551373702
cross validation score (std): 0.0076301484535635416


# CV with raw data (all features)

In [50]:
# Cross-validate on all features.
# The recorded run of this cell on the dense array was interrupted before it
# finished. The scaled count matrix is mostly zeros, so the same values are
# passed as a CSR matrix; scikit-learn estimators such as SVC accept sparse
# input, and this should avoid touching every column of every row.
from scipy import sparse

X_sparse = sparse.csr_matrix(X)
cv_scores = my_model.cv_error(X_sparse, y)
print('cross validation scores:', cv_scores)
print('cross validation score (mean):', np.mean(cv_scores))
print('cross validation score (std):', np.std(cv_scores))

KeyboardInterrupt: 