In [None]:
%config IPCompleter.greedy=True

# Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
import pickle
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
import keras
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras import layers
from keras.optimizers import SGD
from keras_preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow.keras import regularizers

# Understanding and exploring data

## Data understanding and cleaning

<span style="color:red">Step 2</span>

In [None]:
# Step 2
# Load the raw dataset from a CSV file located in the notebook's working directory.

data = pd.read_csv('21201394.csv')

In [None]:
data.head()

In [None]:
# Class-balance check: tabulate the number of records per category, then plot it.

category_counts = data['category'].value_counts()
print(category_counts)

fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x="category", data=data, ax=ax)
ax.set_xlabel("Category", fontsize=12)
ax.set_ylabel("Count", fontsize=12)

In [None]:
# Remove columns that are not needed:
#   - 'Unnamed: 0' duplicates the existing row index
#   - 'link' repeats the headline information, differing only by a unique url id

unused_columns = ['Unnamed: 0', 'link']
data = data.drop(columns=unused_columns)

In [None]:
# knowing structure of the data columns

data.info()

In [None]:
# knowing columns

data.columns

In [None]:
# knowing size of the dataset

data.shape

In [None]:
data.isnull().sum()

<span style="color:red">Step 3 (iii)</span>

In [None]:
# Step 3 i, ii are below.
# Many records contain nulls; dropping every row that has any null would shrink the dataset badly.
# So rows are removed only when a value is missing in the candidate independent columns
# ("headline", "short_description") or in the dependent column ("category").

required_columns = ['category', 'headline', 'short_description']
data = data.dropna(subset=required_columns)
data.isna().sum()

In [None]:
# replacing null author names with a blank value
# The filled column is assigned back explicitly. Calling fillna(inplace=True) on
# data["authors"] operates on a selected column (chained in-place assignment), which
# triggers warnings in recent pandas and does not update `data` under Copy-on-Write.

data["authors"] = data["authors"].fillna("")


In [None]:
# Remove rows that are exact duplicates of an earlier row (all columns equal).

data = data.drop_duplicates()

In [None]:
# adding CategoryId column for Categories to get it's numerical representation for ML modelling
# Home & Living denoted as 0
# Wellness denoted as 1
# An explicit mapping is used instead of factorize(): factorize numbers the labels in
# order of first appearance, so the ids above would silently depend on row order.

CATEGORY_IDS = {'HOME & LIVING': 0, 'WELLNESS': 1}

# Fail loudly if the data contains a category the mapping does not cover,
# rather than producing NaN ids.
unexpected_categories = set(data['category'].unique()) - set(CATEGORY_IDS)
if unexpected_categories:
    raise ValueError(f"Categories without a CategoryId: {sorted(unexpected_categories)}")

data['CategoryId'] = data['category'].map(CATEGORY_IDS)

In [None]:
#knowing shape after data preparation

data.shape

In [None]:
data.head(20)

## Data exploration

### Create and Fit Bag of Words Model to find most common words

<span style="color:red">Step 3 (i)</span>

In [None]:

# Finding words more common for each category using countvectorizer
# (module-level instance kept for backward compatibility; countVectorize no longer depends on it)
vectorizer = CountVectorizer(stop_words='english')

# Week D10W1 (combination of all steps in a function)
def countVectorize(desc, top_n=20):
    """Return the most frequent tokens in a collection of documents.

    Parameters
    ----------
    desc : iterable of str
        Documents to count tokens in.
    top_n : int, default 20
        Number of most frequent tokens to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'Token' and 'Count', sorted by 'Count' in descending order,
        holding at most ``top_n`` rows.
    """
    # A fresh vectorizer per call: the global name `vectorizer` is rebound later in
    # the notebook, so relying on it would make the result depend on cell run order.
    count_vectorizer = CountVectorizer(stop_words='english')
    counts = count_vectorizer.fit_transform(desc)

    df_tokens = pd.DataFrame({
        'Token': count_vectorizer.get_feature_names_out(),
        # Column sums of the document-term matrix = total occurrences of each token.
        'Count': np.asarray(counts.sum(axis=0)).ravel(),
    })
    df_tokens = df_tokens.sort_values("Count", ascending=False).reset_index(drop=True)

    #Most Popular Tokens
    return df_tokens.nlargest(columns="Count", n=top_n)


<span style="color:red">Step3 (ii) Column Selection</span>

In [None]:
# Reason:
# I think both the "headline" and "short description" columns contain important information to predict the correct topical category of the text.
# Therefore, combining them and storing them as a new column "description" to perform necessary NLP pre-processing on one column only.
# Also,
# I didn't include "authors" column as the same author can publish articles for both the categories and if I will perform tokenization on author name then many authors have same first names which will lead to incorrect analysis of categories.
# And didn't include "date" column as I cannot see a useful pattern in dates of publishing.

data["description"]=data["headline"]+" "+data["short_description"]
wellness=data[data["category"]=="WELLNESS"]["description"]
living=data[data["category"]=="HOME & LIVING"]["description"]

#Mean sentence length for both the categories
# (measured as the mean number of characters per description)
mean_len_wellness = wellness.str.len().mean()
mean_len_living = living.str.len().mean()

print("Mean sentence length: Wellness -> ",mean_len_wellness)
print("Mean sentence length: Home & Living -> ", mean_len_living)

# Derive the comparison from the computed means instead of printing a fixed
# conclusion, so the statement stays true if the data changes.
if mean_len_wellness > mean_len_living:
    print("WELLNESS > HOME & LIVING")
elif mean_len_wellness < mean_len_living:
    print("WELLNESS < HOME & LIVING")
else:
    print("WELLNESS = HOME & LIVING")

### Creating wordcloud representation

In [None]:
# Source: https://re-thought.com/creating-wordclouds-in-python/

def make_wordcloud(text):
    """Draw a 600x600 word cloud generated from ``text`` on a 10x10-inch figure with hidden axes."""
    cloud = WordCloud(width=600, height=600)
    cloud.generate(text)

    fig, axes = plt.subplots(figsize=(10, 10))
    axes.imshow(cloud)
    axes.axis("off")
    plt.show()

###  Most popular keywords in wellness category

In [None]:
# Compute the top tokens once and reuse them for the table, the CSV and the word cloud.
df_wellness = countVectorize(wellness)
print("Most popular keywords in wellness category:\n", df_wellness)

# Keep writing the CSV artifact, but do not bind its return value (to_csv returns None)
# and do not read it back: the file has no header row, so a default read_csv would
# consume the most frequent token as column names.
df_wellness.to_csv('wellness_df', header=None)

print("---------------------------------------------\nWellness Top Wordcloud:")
# Pass only the token strings; str(DataFrame) would also feed index numbers, counts
# and layout text into the word cloud.
make_wordcloud(" ".join(df_wellness["Token"]))

###  Most popular keywords in home and living category

In [None]:
# Compute the top tokens once and reuse them for the table, the CSV and the word cloud.
df_hliving = countVectorize(living)
print("Most popular keywords in home and living category:\n", df_hliving)

# Keep writing the CSV artifact, but do not bind its return value (to_csv returns None)
# and do not read it back: the file has no header row, so a default read_csv would
# consume the most frequent token as column names.
df_hliving.to_csv('df_hliving', header=None)

print("------------------------------------------------\nHome & Living Top Wordcloud:")
# Pass only the token strings; str(DataFrame) would also feed index numbers, counts
# and layout text into the word cloud.
make_wordcloud(" ".join(df_hliving["Token"]))

# Data Preparation 

### Declared Dependent and Independent Value

<span style="color:red">Step 4</span>

In [None]:

# Independent variable (X): the combined text column.
# Dependent variable (y): the numeric category id.
X = data["description"]
y = data["CategoryId"]


### Split the dataset into train, valid, test 

In [None]:
# The dataset is not very large, so I use the generally suggested split:
# training = 70%, validation = 15%, test = 15%.
# Source: https://www.v7labs.com/blog/train-validation-test-set
# Stage 1 holds out 15% of the data as the test set.
# Stage 2 divides the remaining 85% into train (0.70/0.85) and validation (0.15/0.85).

X_train_plus_valid, X_test, y_train_plus_valid, y_test = train_test_split(
    X, y,
    train_size=0.85,
    test_size=0.15,
    random_state=0,
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_plus_valid, y_train_plus_valid,
    train_size=0.7/0.85,
    test_size=0.15/0.85,
    random_state=0,
)

In [None]:
# Report the size of each split in the order train, validation, test.
for split_features in (X_train, X_valid, X_test):
    print(split_features.shape)

### Concatenating and saving csv files for train, validation, test dataset

In [None]:
# Put the training text and its labels side by side (column-wise) in one frame.
train_data = pd.concat([X_train,y_train], axis=1)
train_data

In [None]:
# Write the training frame to disk. The result is not assigned back: to_csv returns
# None when given a path, which would clobber `train_data` and make this cell fail on re-run.
train_data.to_csv('train.csv')

In [None]:
# Combine validation text and labels, then write them to disk.
valid_data = pd.concat([X_valid,y_valid], axis=1)
# Do not assign the result back: to_csv returns None when given a path.
valid_data.to_csv('valid.csv')

In [None]:
# Combine test text and labels, then write them to disk.
test_data=pd.concat([X_test,y_test], axis=1)
# Do not assign the result back: to_csv returns None when given a path.
test_data.to_csv('test.csv')

### Pre-processing steps
Our data is text input so to extract meaningful results we need to perform some necessary pre-processing. As better quality of data will generate better results from models. 
 - Removing digits: When I looked at the top words in each category, I found some numbers among them, which are not meaningful for predicting or differentiating the categories.
 - Removing tags: not useful
 - Removing special characters: not useful
 - Lowercasing text: helps with the consistency of expected output
 - Removing stopwords: do not provide meaningful information
 - Lemmatization: helps in grouping together the different inflected forms of a word, e.g. "homes" and "home" are both reduced to "home".

In [None]:
# Source: https://www.studytonight.com/python-howtos/remove-numbers-from-string-in-python

def remove_digits(text):
    """Return ``text`` with every ASCII digit (0-9) deleted."""
    digit_pattern = re.compile(r'[0-9]')
    return digit_pattern.sub('', text)


In [None]:
# Source: https://www.analyticsvidhya.com/blog/2021/12/text-classification-of-news-articles/

def remove_tags(text):
    """Strip HTML/XML-style tags from ``text``, then delete punctuation.

    Parameters
    ----------
    text : str
        Raw text that may contain markup such as ``<br>`` or ``<p>...</p>``.

    Returns
    -------
    str
        ``text`` without ``<...>`` tags and without any character that is
        neither a word character nor whitespace.
    """
    # Non-greedy match of everything from '<' to the next '>'. The pattern used to be
    # empty, which removed nothing and let tag names leak into the text as words.
    tag_pattern = re.compile(r'<.*?>')
    text = re.sub(tag_pattern, '', text)
    # Drop punctuation/symbols: anything that is not a word character or whitespace.
    text = re.sub(r'[^\w\s]', '', text)
    return text

def special_char(text):
    """Return ``text`` with every non-alphanumeric character replaced by a single space."""
    return ''.join(ch if ch.isalnum() else ' ' for ch in text)

def convert_lower(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

def remove_stopwords(text):
    """Tokenize ``text`` and return a list of the tokens that are not English stop words."""
    english_stopwords = set(stopwords.words('english'))
    kept_tokens = []
    for token in word_tokenize(text):
        if token not in english_stopwords:
            kept_tokens.append(token)
    return kept_tokens

def lemmatize_word(text):
    """Lemmatize each token in ``text`` (an iterable of tokens) and join them with single spaces."""
    lemmatizer = WordNetLemmatizer()
    lemmas = map(lemmatizer.lemmatize, text)
    return " ".join(lemmas)

### Loading train dataset to perform pre-processing steps on text 

<span style="color:red">Step 5</span>

In [None]:
# The reason and choices for performing the pre-processing on data is mentioned above.
# Reload the training split from the CSV file written earlier in the notebook.

train_data=pd.read_csv('train.csv')

In [None]:
train_data

In [None]:
# The CSV was written with its index, which comes back as the 'Unnamed: 0' column; discard it.
train_data = train_data.drop(columns=['Unnamed: 0'])

In [None]:
# Apply the cleaning functions to the training text, one after another, in this order.
cleaning_steps = (
    remove_digits,
    remove_tags,
    special_char,
    convert_lower,
    remove_stopwords,
    lemmatize_word,
)
for step in cleaning_steps:
    train_data['description'] = train_data['description'].apply(step)
train_data

### Loading evaluation dataset to perform pre-processing steps on text 

<span style="color:red">Step 5</span>

In [None]:
# The reason and choices for performing the pre-processing on data is mentioned above.
# Reload the validation split from the CSV file written earlier in the notebook.

valid_data=pd.read_csv('valid.csv')

In [None]:
valid_data

In [None]:
# The CSV was written with its index, which comes back as the 'Unnamed: 0' column; discard it.
valid_data = valid_data.drop(columns=['Unnamed: 0'])

In [None]:
# Apply the cleaning functions to the validation text, one after another, in this order.
cleaning_steps = (
    remove_digits,
    remove_tags,
    special_char,
    convert_lower,
    remove_stopwords,
    lemmatize_word,
)
for step in cleaning_steps:
    valid_data['description'] = valid_data['description'].apply(step)
valid_data

### Finding more common words for each target variable in training set

In [None]:
# to take a look at most common words after pre-processing
# Rows with CategoryId == 0 are selected; the heading below labels them as Home & Living.

print("Trainset - Home & Living popular keywords:")
countVectorize(train_data[train_data['CategoryId']==0]['description'])

In [None]:
# Most common words after pre-processing for rows with CategoryId == 1 (labelled Wellness below).
print("Trainset - Wellness popular keywords:")
countVectorize(train_data[train_data['CategoryId']==1]['description'])

### Create and Fit Bag of Words Model using Tf-idf Vectorizer
As the machine learning models work on numerically represented data therefore we need the vectorizer.
- I chose TF-IDF vectorizer because it not only focuses on the frequency of words present in the corpus but also provides the importance of the words. Also, this technique is good for text classification or for helping a machine read words in numbers.
- I didn't add max_features parameter to it because if I will reduce the value of max_features, there is a high chance that the majority of words chosen will be from the majority class (Wellness). This makes sense since TF-IDF is selecting features based on term frequency alone and (Wellness) records are in majority.  
    
    Source: https://www.analyticsvidhya.com/blog/2021/09/creating-a-movie-reviews-classifier-using-tf-idf-in-python/

In [None]:
# Feature Engineering
# The TF-IDF vectorizer is fitted on the training text only.
vectorizer = TfidfVectorizer().fit(train_data['description'])

# Both splits are then transformed with that same fitted vectorizer.
X_train_matrix = vectorizer.transform(train_data['description'])
X_valid_matrix = vectorizer.transform(valid_data['description'])


In [None]:
# checking size of train data matrix generated

X_train_matrix.shape

In [None]:
# checking size of valid data matrix generated

X_valid_matrix.shape

# Data Modelling

## Here are my chosen binary classifiers:
1. Support Vector Classifier (SVC): It is regarded as one of the best text classification algorithms according to the sources below, and it works best when the number of features is high compared to the number of data points in the dataset. Also, it is faster, takes less memory and works well on small datasets.


2. Logistic Regression: Again regarded as one of the best binary classifier. In chapter 5: Logistic Regression of "Speech and Language Processing" by Daniel Jurafsky & James H. Martin. It is mentioned that in the NLP world, it’s generally accepted that Logistic Regression is a great starter algorithm for text related classification.

Therefore, I chose to study both of these classifiers on my dataset.

Approach: Logistic Regression (LR) is a probabilistic classification model using the sigmoid function, whereas Support Vector Classifiers (SVC) are a more geometric approach that maximise the margins to each class. They are similar in that they both can divide the feature space with a decision boundary.

Source: https://towardsdatascience.com/multi-class-text-classification-model-comparison-and-selection-5eb066197568#:~:text=Linear%20Support%20Vector%20Machine%20is,the%20best%20text%20classification%20algorithms.

Source 2: https://medium.com/axum-labs/logistic-regression-vs-support-vector-machines-svm-c335610a3d16#:~:text=SVM%20tries%20to%20finds%20the,are%20near%20the%20optimal%20point.

<span style="color:red">Step 6</span>

## Fitting the model

### SVC
#### Parameters:
1. **Linear** kernel is the most basic type of kernel, one dimensional in nature. It proves to be the best function when there are lots of features. The linear kernel is mostly preferred for text-classification problems as most of these kinds of classification problems can be linearly separated.
2. Regularization parameter, **C** must be strictly positive, default = 1.0
3. **Random_State** is set to **0** to get the reproducible results.

In [None]:
# Fit a linear-kernel SVC on the TF-IDF training matrix.
# NOTE(review): the features were built from the reloaded train.csv while the labels are the
# in-memory y_train; this relies on both having the same row order — confirm.
svc_model = SVC(kernel="linear", C=1.0, random_state=0)
svc_model.fit(X_train_matrix, y_train)

### Logistic Regression

#### Parameters:
1. I used **Liblinear** solver because it performs faster and better for data with high dimensionality. Also, it was the default historical approach to work with small datasets. Also, according to the following source, liblinear solver gave better accuracy.
       Source: https://towardsdatascience.com/dont-sweat-the-solver-stuff-aea7cddc3451

2. **Random_State** is set to **0** to get the reproducible results.

In [None]:
# Fit logistic regression (liblinear solver) on the TF-IDF training matrix.
# NOTE(review): the features were built from the reloaded train.csv while the labels are the
# in-memory y_train; this relies on both having the same row order — confirm.
l_reg = LogisticRegression(solver='liblinear', random_state=0)
l_reg.fit(X_train_matrix,y_train)

<span style="color:red">Step 7</span>

## Deep Learning model: CNN


### Splitting the data into train, validation and test sets

In [None]:
# Same proportions as used for the classical models: Train 70%, Validation 15%, Test 15%.
# Stage 1 holds out 15% as the test set; stage 2 divides the remaining 85%
# into train (0.70/0.85) and validation (0.15/0.85).

X_train_plus_valid_1, X_test_1, y_train_plus_valid_1, y_test_1 = train_test_split(
    X, y,
    train_size=0.85,
    test_size=0.15,
    random_state=0,
)
X_train_1, X_valid_1, y_train_1, y_valid_1 = train_test_split(
    X_train_plus_valid_1, y_train_plus_valid_1,
    train_size=0.7/0.85,
    test_size=0.15/0.85,
    random_state=0,
)

In [None]:
# Report the size of each CNN split in the order train, validation, test.

for split_features in (X_train_1, X_valid_1, X_test_1):
    print(split_features.shape)

In [None]:
# Pair each split's text with its labels, column-wise, in one frame per split.

train_data_1, valid_data_1, test_data_1 = (
    pd.concat([features, target], axis=1)
    for features, target in (
        (X_train_1, y_train_1),
        (X_valid_1, y_valid_1),
        (X_test_1, y_test_1),
    )
)

### Pre-processing on X train, validation and test dataset

In [None]:
# Apply the cleaning functions to the CNN training text, one after another, in this order.

cleaning_steps = (
    remove_digits,
    remove_tags,
    special_char,
    convert_lower,
    remove_stopwords,
    lemmatize_word,
)
for step in cleaning_steps:
    train_data_1['description'] = train_data_1['description'].apply(step)
train_data_1

In [None]:
# Apply the cleaning functions to the CNN validation text, one after another, in this order.

cleaning_steps = (
    remove_digits,
    remove_tags,
    special_char,
    convert_lower,
    remove_stopwords,
    lemmatize_word,
)
for step in cleaning_steps:
    valid_data_1['description'] = valid_data_1['description'].apply(step)
valid_data_1

In [None]:
# Apply the cleaning functions to the CNN test text, one after another, in this order.

cleaning_steps = (
    remove_digits,
    remove_tags,
    special_char,
    convert_lower,
    remove_stopwords,
    lemmatize_word,
)
for step in cleaning_steps:
    test_data_1['description'] = test_data_1['description'].apply(step)
test_data_1

### Preparing word embedding for the deep learning model

In [None]:
# Tokenizer allows to vectorize a text corpus, by turning each text into either a sequence of integers(like index of token in a dict.) or into a vector where the coeficient of each token could be binary, based on word count, based on tf-idf
# Using keras tokenizer to build word embeddings and sequence preprocessing
# I am considering top 10,000 frequent words for tokenization and discard rare words

tokenizer = Tokenizer(num_words=10000)

# Fitting tokenizer on the *pre-processed* training descriptions.
# X_train_1 still holds the raw text (the cleaning was applied to train_data_1 only),
# so fitting on it would build the vocabulary from uncleaned text.
tokenizer.fit_on_texts(train_data_1['description'].tolist())

print(str(tokenizer.texts_to_sequences(["home is beautiful"])))

# The output depicts that most common words do not have a large index in our embedding space.
# Those whose occurrence is moderate will be given a moderate index value. Also, 0 value is reserved and won’t be provided to any text.


In [None]:
# Converting all the X, y data columns to list type for processing.
# The text is taken from the pre-processed frames (train_data_1 / valid_data_1 / test_data_1):
# X_*_1 from the split still hold the raw descriptions, so using them would bypass the
# cleaning steps applied for the CNN. Reading from the frames also keeps this cell
# re-runnable (calling .tolist() on an already converted list would fail).

X_train_1 = train_data_1['description'].tolist()
y_train_1 = train_data_1['CategoryId'].tolist()
X_valid_1 = valid_data_1['description'].tolist()
y_valid_1 = valid_data_1['CategoryId'].tolist()
X_test_1 = test_data_1['description'].tolist()
y_test_1 = test_data_1['CategoryId'].tolist()

In [None]:
# Converting words in the description to numerical sequences.
# The sequences are kept as plain lists of lists: they have different lengths, and wrapping
# such ragged data in np.array() raises a ValueError on recent NumPy versions.
# pad_sequences accepts the lists directly.

x_train = tokenizer.texts_to_sequences(X_train_1)
x_valid = tokenizer.texts_to_sequences(X_valid_1)
x_test = tokenizer.texts_to_sequences(X_test_1)

In [None]:
# The token sequences have different lengths, but the model needs one fixed length.
# pad_sequences() fills each sequence with zeros at the end (padding='post') up to maxlen.
# maxlen=100 is used because, after pre-processing, the meaningful text is not longer than 100 words.

maxlen = 100
Xcnn_train, Xcnn_valid, Xcnn_test = (
    pad_sequences(sequences, padding='post', maxlen=maxlen)
    for sequences in (x_train, x_valid, x_test)
)

In [None]:
print(Xcnn_train[0])

In [None]:
# Convert the integer class ids of each split into one-hot rows (one column per class)
# and keep them as NumPy arrays.

train_labels, valid_labels, test_labels = (
    np.asarray(keras.utils.to_categorical(class_ids))
    for class_ids in (y_train_1, y_valid_1, y_test_1)
)
train_labels[0]

In [None]:
# Wrap the padded sequences and their one-hot labels as tf.data datasets of (features, label)
# pairs, so they can be shuffled and batched when fitting the model.

train_ds=tf.data.Dataset.from_tensor_slices((Xcnn_train,train_labels))
valid_ds=tf.data.Dataset.from_tensor_slices((Xcnn_valid,valid_labels))

## Deep learning: Building the CNN model

Parameters
- vocab_size: The vocab_size is the same as the maximum number of words (10,000) kept by the tokenizer. The embedding layer is created with vocab_size + 1 rows because index 0 is reserved for padding and is never assigned to a word; words outside the most frequent ones are dropped by the tokenizer rather than given an index of their own.
- embedding_dim: for each input word index, the model learns a 64-dimensional embedding vector
- input_length: max_length of sequence of sentence as mentioned above
- embeddings_regularizer, kernel_regularizer, bias_regularizer (all L2): penalise large weights to limit overfitting, which matters here because the data is unbalanced
- Conv1D layer: 128 filters, kernel size=3 (i.e. for each iteration it will look at 3 words at a time), activation function Relu chosen for hidden layers as standard as it works faster as well as output good results.
- GlobalMaxPooling1D layer: Considers the maximum value out of all the word vectors so discards less weighted vectors.
- Last dense layer: Output 2 values as binary classification and to match the labels format, activation function Sigmoid used for binary classification

In [None]:
# Source Code: https://github.com/rsreetech/TextClassificationTensorFlowCNN/blob/master/TensorFlowTweetTextClassificationV1.ipynb


vocab_size=10000
embedding_dim = 64

# I am using the Sequential model. A Sequential model is appropriate for a plain stack of layers where each layer has exactly one input tensor and one output tensor. Source :https://www.tensorflow.org/guide/keras/sequential_model
textcnnmodel = Sequential()
# vocab_size+1 embedding rows: index 0 is reserved for padding in addition to the word indices.
textcnnmodel.add(layers.Embedding(vocab_size+1, embedding_dim, input_length=maxlen, embeddings_regularizer=regularizers.l2(0.0005)))
# 128 filters, each looking at a window of 3 consecutive tokens.
textcnnmodel.add(layers.Conv1D(128, 3, activation='relu',kernel_regularizer=regularizers.l2(0.0005),bias_regularizer=regularizers.l2(0.0005)))

# Global max pooling keeps, for each filter, only its maximum activation over the whole sequence.
textcnnmodel.add(layers.GlobalMaxPooling1D())
textcnnmodel.add(layers.Dense(10, activation='relu'))

# Dropout layers are important in training CNNs because they prevent overfitting on the training data.
textcnnmodel.add(layers.Dropout(0.5))

# The output layer uses a sigmoid activation function to output a value between 0 and 1 for the "Home & Living" and "Wellness" category.
textcnnmodel.add(layers.Dense(2, activation='sigmoid',kernel_regularizer=regularizers.l2(0.001),bias_regularizer=regularizers.l2(0.001)))

# Choice of optimizer: Stochastic Gradient Descent (SGD) with a learning rate of 0.1,
# as per the source - Source: https://towardsdatascience.com/learning-rate-schedules-and-adaptive-learning-rate-methods-for-deep-learning-2c8f433990d1
# `learning_rate` is the supported argument name; the old alias `lr` is deprecated and
# rejected by newer Keras versions.
opt = SGD(learning_rate = 0.1)

# We use a binary cross entropy loss function because the problem we are learning is a binary classification problem.
# NOTE(review): the labels are one-hot with two columns and the output layer has two sigmoid units;
# softmax with categorical_crossentropy is the more conventional pairing — confirm this is intended.
textcnnmodel.compile(optimizer = opt,
               loss = 'binary_crossentropy',
               metrics = ['accuracy','Precision','Recall'])
textcnnmodel.summary()

## Deep learning: Fitting the text CNN model

Parameters:
- epochs: first, I tried with 100 epochs but the distance between validation loss and training loss increased so to reduce it I went with 50 epochs.
- verbose:1 shows the performance of each epoch
- batch_size: 128, was a better batch size for this model
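- shuffle: the training dataset is shuffled with a buffer of 2,000 examples (`train_ds.shuffle(2000)`) before batching; no seed is set, so results can vary slightly between runs.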

In [None]:
# Train the CNN for 50 epochs. The training dataset is shuffled (buffer of 2000) and batched
# by 128; the validation dataset is batched by 128 without shuffling.
# The returned History object is kept in `history`.
history=textcnnmodel.fit(train_ds.shuffle(2000).batch(128),
                     epochs=50,
                     verbose=1,
                     validation_data=valid_ds.batch(128))

## Deep learning: Evaluations of model
For evaluation of this CNN model, I will focus on the evaluation metric **Recall**. From the code below, I can see that the training results show a perfect score, so they are better than the validation results.

In [None]:
# Evaluate the trained CNN on the training split, then on the validation split,
# printing accuracy, precision and recall for each.
evaluation_runs = [
    ("Training ==> Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}", Xcnn_train, train_labels),
    ("Validation ==> Accuracy:  {:.4f}, Precision: {:.4f}, Recall: {:.4f}", Xcnn_valid, valid_labels),
]
for report_template, features, labels in evaluation_runs:
    loss, accuracy, precision, recall = textcnnmodel.evaluate(features, labels, verbose=False)
    print(report_template.format(accuracy, precision, recall))

In [None]:
history.history.keys()

In [None]:
# Accuracy per epoch for the training and validation data.

fig, ax = plt.subplots()
for metric_key in ('accuracy', 'val_accuracy'):
    ax.plot(history.history[metric_key])
ax.set_title('Accuracy')
ax.set_ylabel('accuracy')
ax.set_xlabel('epoch')
ax.legend(['train', 'validation'], loc='lower right')
plt.show()

In [None]:
# Loss per epoch for the training and validation data.

fig, ax = plt.subplots()
for metric_key in ('loss', 'val_loss'):
    ax.plot(history.history[metric_key])
ax.set_title('Loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['train', 'validation'], loc='upper right')
plt.show()

### Observations:

- Based on the **model.evaluate** I can say that the model is performing well on classifying the data correctly. Talking about **recall**, both shows similar and good outcomes on both training and validation data.
- The accuracies for the both the training and validation data was constant for first 20 epochs and then increased at the same pace (parallel to each other) for the next 30 epochs, reaching to 95% and 93% respectively.
- Also, the model shows an improved performance on loss too. The loss for both the train and validation data keeps decreasing till the 50 epochs simultaneously and stabilizes at the end.

## Deep Learning: Predictions on train, validation and test data

In [None]:
print("******** Training Data ********")
# Score every training instance; y_pred holds one row of outputs per instance.
y_pred = textcnnmodel.predict(Xcnn_train)
print(y_pred.shape)

# The predicted class is the column with the highest output in each row.
predict_results = np.argmax(y_pred, axis=1)

In [None]:
print("******** Training Data ********")

# Classification report of the predicted class ids against the true training labels.
predicted_ids = predict_results.tolist()
print(metrics.classification_report(y_train_1, predicted_ids))

# Confusion matrix drawn as an annotated heatmap.
cm = confusion_matrix(y_train_1, predicted_ids)
class_names = ['Home & Living', 'Wellness']

fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(cm, annot=True, fmt='g', ax=ax, cmap="Blues")
ax.set(xlabel='Predicted labels', ylabel='True labels', title='Confusion Matrix')
ax.xaxis.set_ticklabels(class_names)
ax.yaxis.set_ticklabels(class_names);

In [None]:
print("******** Validation Data ********")
# Score every validation instance; y_pred holds one row of outputs per instance.
y_pred = textcnnmodel.predict(Xcnn_valid)
print(y_pred.shape)

# The predicted class is the column with the highest output in each row.
predict_results = np.argmax(y_pred, axis=1)

In [None]:
print("******** Validation Data ********")

# Classification report of the predicted class ids against the true validation labels.
predicted_ids = predict_results.tolist()
print(metrics.classification_report(y_valid_1, predicted_ids))

# Confusion matrix drawn as an annotated heatmap.
cm = confusion_matrix(y_valid_1, predicted_ids)
class_names = ['Home & Living', 'Wellness']

fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(cm, annot=True, fmt='g', ax=ax, cmap="Blues")
ax.set(xlabel='Predicted labels', ylabel='True labels', title='Confusion Matrix')
ax.xaxis.set_ticklabels(class_names)
ax.yaxis.set_ticklabels(class_names);

In [None]:
print("******** Test Data ********")
# Score every test instance; y_pred holds one row of outputs per instance.
y_pred = textcnnmodel.predict(Xcnn_test)
print(y_pred.shape)

# The predicted class is the column with the highest output in each row.
predict_results = np.argmax(y_pred, axis=1)

In [None]:
print("******** Test Data ********")

# Classification report of the predicted class ids against the true test labels.
predicted_ids = predict_results.tolist()
print(metrics.classification_report(y_test_1, predicted_ids))

# Confusion matrix drawn as an annotated heatmap.
cm = confusion_matrix(y_test_1, predicted_ids)
class_names = ['Home & Living', 'Wellness']

fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(cm, annot=True, fmt='g', ax=ax, cmap="Blues")
ax.set(xlabel='Predicted labels', ylabel='True labels', title='Confusion Matrix')
ax.xaxis.set_ticklabels(class_names)
ax.yaxis.set_ticklabels(class_names);

### Observations:

- The model performed well on all the train, validation and test data.
- In comparison, the test data showed better performance than the validation set, as the model misclassified 3 fewer data points for **Home & Living** than on the validation data. So, **recall** was slightly better in the case of the test data, i.e. 85%.
- Recall for **Wellness** didn't show any difference between the validation and test data. Also, precision was similar. Therefore, because of the improvement in the Recall score, the predictions were better on the test data.


<span style="color:red">Step 8</span>

# Data Evaluation: SVC & LR model

I have decided to choose **Confusion Matrix** as my evaluation metrics because of the following reasons:

1. Confusion matrices are not just useful in model evaluation but also model monitoring and model management.
2. General accuracy is often not enough information to allow you to decide on a model’s value.
3. Confusion matrices can help with side-by-side comparisons of different classification methods. You can see not only how accurate one model is over the other, but also see more granularly how a model does in sensitivity or specificity, as those might be more important factors than general accuracy itself. 
4. I will look at two factors mainly, __F1 score__ as it is the most common metrics used on unbalanced classification problems as compared to accuracy. Also, as F1 score is a harmonic mean between precision and recall. It will calculate the weighted average of both precision and recall.
5. Another one, I will be focussing on the **Recall** percentage which will be like my **primary metric** for minority category 0 (Home & Living). I will try to **increase Recall** score by making an effort to **decrease the false negatives** i.e. I will try to achieve this by classifying the documents which were not correctly assigned to the **Home & Living** category. I think these will be good benchmarks as both better precision and recall will lead to a better model. 

### SVC: Performance on training data

In [None]:
# Evaluate the fitted SVC on the training data.
X_cm = X_train_matrix
y_true_labels = y_train
model = svc_model
class_names = ['Home & Living', 'Wellness']

# Predict labels for the training matrix with the fitted model.
y_pred_train = model.predict(X_cm)
print("SVC ::\n")

print("Confusion matrix for training data:\n")
print(metrics.classification_report(y_true_labels, y_pred_train, target_names=class_names))

# True labels against predicted labels, drawn as an annotated heatmap.
cm = confusion_matrix(y_true_labels, y_pred_train)

fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(cm, annot=True, fmt='g', ax=ax, cmap="OrRd")
ax.set(xlabel='Predicted labels', ylabel='True labels', title='Confusion Matrix')
ax.xaxis.set_ticklabels(class_names)
ax.yaxis.set_ticklabels(class_names);

### SVC: Performance on validation data



In [None]:
# Evaluate the SVC (held in `model`) on the validation data.

X_cm = X_valid_matrix
y_true_labels = y_valid
class_names = ['Home & Living', 'Wellness']

# Predict labels for the validation matrix with the fitted model.
y_pred_val = model.predict(X_cm)
print("SVC ::\n")

print("Confusion matrix for validation data:\n")
print(metrics.classification_report(y_true_labels, y_pred_val, target_names=class_names))

# True labels against predicted labels, drawn as an annotated heatmap.
cm = confusion_matrix(y_true_labels, y_pred_val)

fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(cm, annot=True, fmt='g', ax=ax, cmap="OrRd")
ax.set(xlabel='Predicted labels', ylabel='True labels', title='Confusion Matrix')
ax.xaxis.set_ticklabels(class_names)
ax.yaxis.set_ticklabels(class_names);

### Logistic Regression: Performance on training data

In [None]:
# Evaluate the fitted Logistic Regression model on the training data.
X_cm = X_train_matrix
y_true_labels = y_train
model_1 = l_reg
class_names = ['Home & Living', 'Wellness']

# Predict labels for the training matrix with the fitted model.
y_pred_train_l = model_1.predict(X_cm)
print("Logistic Regression::\n")
print("Confusion matrix for training data:\n")
print(metrics.classification_report(y_true_labels, y_pred_train_l, target_names=class_names))

# True labels against predicted labels, drawn as an annotated heatmap.
cm = confusion_matrix(y_true_labels, y_pred_train_l)

fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(cm, annot=True, fmt='g', ax=ax, cmap="OrRd")
ax.set(xlabel='Predicted labels', ylabel='True labels', title='Confusion Matrix')
ax.xaxis.set_ticklabels(class_names)
ax.yaxis.set_ticklabels(class_names);

### Logistic Regression: Performance on validation data

In [None]:
# Evaluate the baseline Logistic Regression on the validation split.
# `model_1` is the estimator bound in the preceding training-data cell.
X_cm = X_valid_matrix
y_true_labels = y_valid
cm_tick_labels = ['Home & Living', 'Wellness']

# Predictions on data the model has not been fitted on.
y_pred_val_l = model_1.predict(X_cm)

print("Logistic Regression::\n")
print("Confusion matrix for validation data:\n")
print(
    metrics.classification_report(
        y_true_labels,
        y_pred_val_l,
        target_names=cm_tick_labels,
    )
)

# Heatmap of the confusion counts: y-axis = true labels, x-axis = predictions.
cm = confusion_matrix(y_true_labels, y_pred_val_l)
plt.figure(figsize=(12, 6))
ax = plt.subplot()
sns.heatmap(cm, ax=ax, annot=True, fmt='g', cmap="OrRd")
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(cm_tick_labels)
ax.yaxis.set_ticklabels(cm_tick_labels);

<span style="color:red">Step 9</span>

### Performance Comparison: SVC v/s LR 

- The **Confusion Matrix** for both models shows good results on the training and validation data, considering that the dataset is imbalanced. We can also see that precision is the major contributing factor behind the good f1-score: precision is high for both categories in both models. However, **recall** differs between the categories; it is better for the majority class "Wellness" than for the minority class "Home & Living".
- We can also see that "SVC" performance is **better** than "LR" on both the training and validation data. The reason is that SVM tries to find the “best” margin (the distance between the separating line and the support vectors) that separates the classes, which reduces the risk of error on the data, whereas logistic regression does not; instead it can have different decision boundaries with different weights that are near the optimal point.
Source: https://medium.com/axum-labs/logistic-regression-vs-support-vector-machines-svm-c335610a3d16#:~:text=SVM%20tries%20to%20finds%20the,are%20near%20the%20optimal%20point.
- Also, as there is not much difference between the training and validation **accuracies**, neither model appears to be **overfitting**. This may be due to the regularization parameter present in both models. However, as expected, the training f1-scores and accuracies are higher than those on the validation data.
Source: https://www.quora.com/How-do-we-know-whether-a-model-is-overfitting
- Another thing to notice from the classification report is that the **Recall** score for the "Home & Living" category is far lower for the "Logistic Regression" model than for the "SVC" model. In the SVC model, about 28 (Train) and 23 (Validation) data points were misclassified as Wellness; in the LR model the count rose to 255 (Train) and 51 (Validation) according to the confusion matrix, which supports the second point.
- Cumulative F1-scores for both the models on both the training and validation data is approximately **1**.

### Error Analysis

### SVC

#### On training set

In [None]:
# Attach the SVC training predictions, then keep only the rows whose true class
# is 0 (Home & Living) but which were predicted as 1 (Wellness).
train_data['category_pred'] = y_pred_train
train_analysis = train_data[
    (train_data['CategoryId'] == 0)
    & (train_data['category_pred'] == 1)
]
# Optional export for manual inspection:
# train_analysis.to_csv("err_train_svc.csv")

#### On validation set

In [None]:
# Attach the SVC validation predictions, then keep only the rows whose true
# class is 0 (Home & Living) but which were predicted as 1 (Wellness).
valid_data['category_pred'] = y_pred_val
val_analysis = valid_data[
    (valid_data['CategoryId'] == 0)
    & (valid_data['category_pred'] == 1)
]
# Optional export for manual inspection:
# val_analysis.to_csv("err_val_svc.csv")

### Logistic Regression

#### On training set

In [None]:
# Overwrite the prediction column with the Logistic Regression training
# predictions, then keep the rows whose true class is 0 (Home & Living) but
# which were predicted as 1 (Wellness).
train_data['category_pred'] = y_pred_train_l
train_analysis = train_data[
    (train_data['CategoryId'] == 0)
    & (train_data['category_pred'] == 1)
]
# Optional export for manual inspection:
# train_analysis.to_csv("err_train_lr.csv")

#### On validation set

In [None]:
# Overwrite the prediction column with the Logistic Regression validation
# predictions, then keep the rows whose true class is 0 (Home & Living) but
# which were predicted as 1 (Wellness).
valid_data['category_pred'] = y_pred_val_l
val_analysis = valid_data[
    (valid_data['CategoryId'] == 0)
    & (valid_data['category_pred'] == 1)
]
# Optional export for manual inspection:
# val_analysis.to_csv("err_val_lr.csv")

<span style="color:red">Step 10</span>

### Observations

- Almost all the records incorrectly predicted by the SVC model were also present in the Logistic Regression error-analysis CSV file. After studying those records, I found that many words in those sentences, such as "incandescent", "autumn", "shipping" and "nifty", were rare and unique. Therefore, I assume these records were misclassified because such words have a negligible count, so the records were assigned to the majority class, i.e. "Wellness", which carries more weight.
- Furthermore, records containing popular and common words like "new", "make", "day" and "time" were bound to be misclassified as "Wellness", because these words have a much higher count in that category than in "Home & Living", which has fewer records. Therefore, records were again misclassified due to the data imbalance. Refer to Step 3(i), or the most common words after pre-processing.
- We can say that both the models have potential to perform better if we reduce the imbalance.

<span style="color:red">Step 11</span>


### First Improvement: In Feature Engineering

I will set the **max_df** parameter of the TF-IDF vectorizer to 0.8 so that it does not consider the most popular common words like "new", "make" and "day", which occur in both categories. The vectorizer then builds a vocabulary that ignores terms whose document frequency is strictly higher than the given threshold, i.e. 0.8.

Furthermore, I will set **min_df** to 2, which means "ignore terms that appear in fewer than 2 documents", to remove rare words.

**Sublinear tf scaling** is set to True so that the raw term frequency $\mathrm{tf}$ is replaced with $1 + \log(\mathrm{tf})$, which scales down the weight of a term that occurs many times within a single document.

In [None]:
# TF-IDF with document-frequency cut-offs and sublinear term-frequency scaling.
vectorizer_1 = TfidfVectorizer(
    sublinear_tf=True,
    min_df=2,
    max_df=0.8,
)

# Learn the vocabulary/IDF from the training descriptions only and vectorize
# them in the same call; the validation descriptions are then transformed with
# that already-fitted vectorizer.
X_train_matrix_1 = vectorizer_1.fit_transform(train_data['description'])
X_valid_matrix_1 = vectorizer_1.transform(valid_data['description'])

### SVC

In [None]:
# Linear-kernel SVC (C = 1.0) trained on the re-engineered TF-IDF features.
svc_model_1 = SVC(
    kernel="linear",
    C=1.0,
    random_state=0,
)
svc_model_1.fit(X=X_train_matrix_1, y=y_train)

#### SVC: Training data performance

In [None]:
# Evaluate the SVC trained on the vectorizer_1 features, on the training split.
X_cm = X_train_matrix_1
y_true_labels = y_train
model = svc_model_1
cm_tick_labels = ['Home & Living', 'Wellness']

# Predictions of the fitted model on the data it was trained on.
y_pred_train = model.predict(X_cm)

print("SVC ::\n")
print("Confusion matrix for training data:\n")
print(
    metrics.classification_report(
        y_true_labels,
        y_pred_train,
        target_names=cm_tick_labels,
    )
)

# Heatmap of the confusion counts: y-axis = true labels, x-axis = predictions.
cm = confusion_matrix(y_true_labels, y_pred_train)
plt.figure(figsize=(12, 6))
ax = plt.subplot()
sns.heatmap(cm, ax=ax, annot=True, fmt='g', cmap='Spectral')
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(cm_tick_labels)
ax.yaxis.set_ticklabels(cm_tick_labels);

#### SVC: Validation data performance

In [None]:
# Evaluate the same SVC on the validation split. `model` is the estimator bound
# in the preceding training-data cell.
X_cm = X_valid_matrix_1
y_true_labels = y_valid
cm_tick_labels = ['Home & Living', 'Wellness']

# Predictions on data the model has not been fitted on.
y_pred_val = model.predict(X_cm)

print("SVC ::\n")
print("Confusion matrix for validation data:\n")
print(
    metrics.classification_report(
        y_true_labels,
        y_pred_val,
        target_names=cm_tick_labels,
    )
)

# Heatmap of the confusion counts: y-axis = true labels, x-axis = predictions.
cm = confusion_matrix(y_true_labels, y_pred_val)
plt.figure(figsize=(12, 6))
ax = plt.subplot()
sns.heatmap(cm, ax=ax, annot=True, fmt='g', cmap='Spectral')
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(cm_tick_labels)
ax.yaxis.set_ticklabels(cm_tick_labels);

### Logistics Regression

In [None]:
# Logistic Regression (liblinear solver) trained on the re-engineered TF-IDF
# features.
l_reg_1 = LogisticRegression(
    solver='liblinear',
    random_state=0,
)
l_reg_1.fit(X=X_train_matrix_1, y=y_train)

#### Logistic Regression: Training data performance

In [None]:
# Evaluate the Logistic Regression trained on the vectorizer_1 features, on the
# training split.
X_cm = X_train_matrix_1
y_true_labels = y_train
model_1 = l_reg_1
cm_tick_labels = ['Home & Living', 'Wellness']

# Predictions of the fitted model on the data it was trained on.
y_pred_train_l = model_1.predict(X_cm)

print("Logistic Regression::\n")
print("Confusion matrix for training data:\n")
print(
    metrics.classification_report(
        y_true_labels,
        y_pred_train_l,
        target_names=cm_tick_labels,
    )
)

# Heatmap of the confusion counts: y-axis = true labels, x-axis = predictions.
cm = confusion_matrix(y_true_labels, y_pred_train_l)
plt.figure(figsize=(12, 6))
ax = plt.subplot()
sns.heatmap(cm, ax=ax, annot=True, fmt='g', cmap='Spectral')
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(cm_tick_labels)
ax.yaxis.set_ticklabels(cm_tick_labels);

#### Logistic Regression: Validation data performance

In [None]:
# Evaluate the same Logistic Regression on the validation split. `model_1` is
# the estimator bound in the preceding training-data cell.
X_cm = X_valid_matrix_1
y_true_labels = y_valid
cm_tick_labels = ['Home & Living', 'Wellness']

# Predictions on data the model has not been fitted on.
y_pred_val_l = model_1.predict(X_cm)

print("Logistic Regression::\n")
print("Confusion matrix for validation data:\n")
print(
    metrics.classification_report(
        y_true_labels,
        y_pred_val_l,
        target_names=cm_tick_labels,
    )
)

# Heatmap of the confusion counts: y-axis = true labels, x-axis = predictions.
cm = confusion_matrix(y_true_labels, y_pred_val_l)
plt.figure(figsize=(12, 6))
ax = plt.subplot()
sns.heatmap(cm, ax=ax, annot=True, fmt='g', cmap='Spectral')
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(cm_tick_labels)
ax.yaxis.set_ticklabels(cm_tick_labels);

### Observations

- Changing the parameters in the TF-IDF vectorizer helped to increase the **Recall** percentage of the "Home & Living" category in the **Logistic Regression** model for both the training and validation datasets. Initially, it was 82% and 83% for train and val respectively; however, after adding min_df and max_df as parameters it improved to 82% and 83% respectively. This improved the number of correct predictions by a small margin, i.e. a decrease in **false negatives** for the Home & Living category.

- Furthermore, the **Recall** percentage for **SVC** on the training and validation data each reduced by 1% for the "Home & Living" category. Earlier, it misclassified 23 records, and after the parameter change this increased to 25, with no changes in the other predictions related to the Wellness category. Even with this slight difference in recall and precision, the model is still producing a good f1-score.

<span style="color:red">Step 11</span>

### Second Improvement: Changes in the model parameters

### SVC

#### Parameters
- I changed the **C** parameter, i.e. the penalty/regularization, to **2** to choose a slightly smaller margin so that the model can avoid misclassifying data. I increased it by only 1 because I do not want the model to over-fit. I can decide on a good C value after cross-validation, if SVC turns out to be the better model.

Source: https://stackoverflow.com/questions/12809633/parameter-c-in-svm-standard-to-find-best-parameter
- I added the **class_weight** parameter to counter the data imbalance issue. Setting it to **balanced** is, in effect, like replicating the smaller class until it has as many samples as the larger one, but done implicitly.
The argument class_weight='balanced' penalizes mistake on the minority class by an amount proportional to how under-represented it is.

Source: https://stackoverflow.com/questions/30972029/how-does-the-class-weight-parameter-in-scikit-learn-work

In [None]:
# Re-bind svc_model_1 to a linear SVC with a larger C (2.0) and balanced class
# weights, trained on the same vectorizer_1 features as before.
svc_model_1 = SVC(
    kernel="linear",
    C=2.0,
    class_weight='balanced',
    random_state=0,
)
svc_model_1.fit(X=X_train_matrix_1, y=y_train)

#### Performance on training data

In [None]:
# Evaluate the class-weighted SVC on the training split.
X_cm = X_train_matrix_1
y_true_labels = y_train
model = svc_model_1
cm_tick_labels = ['Home & Living', 'Wellness']

# Predictions of the fitted model on the data it was trained on.
y_pred_train = model.predict(X_cm)

print("SVC ::\n")
print("Confusion matrix for training data:\n")
print(
    metrics.classification_report(
        y_true_labels,
        y_pred_train,
        target_names=cm_tick_labels,
    )
)

# Heatmap of the confusion counts: y-axis = true labels, x-axis = predictions.
cm = confusion_matrix(y_true_labels, y_pred_train)
plt.figure(figsize=(12, 6))
ax = plt.subplot()
sns.heatmap(cm, ax=ax, annot=True, fmt='g')
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(cm_tick_labels)
ax.yaxis.set_ticklabels(cm_tick_labels);

#### Performance on validation data

In [None]:
# Evaluate the class-weighted SVC on the validation split. `model` is the
# estimator bound in the preceding training-data cell.
X_cm = X_valid_matrix_1
y_true_labels = y_valid
cm_tick_labels = ['Home & Living', 'Wellness']

# Predictions on data the model has not been fitted on.
y_pred_val = model.predict(X_cm)

print("SVC ::\n")
print("Confusion matrix for validation data:\n")
print(
    metrics.classification_report(
        y_true_labels,
        y_pred_val,
        target_names=cm_tick_labels,
    )
)

# Heatmap of the confusion counts: y-axis = true labels, x-axis = predictions.
cm = confusion_matrix(y_true_labels, y_pred_val)
plt.figure(figsize=(12, 6))
ax = plt.subplot()
sns.heatmap(cm, ax=ax, annot=True, fmt='g')
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(cm_tick_labels)
ax.yaxis.set_ticklabels(cm_tick_labels);

### Logistic Regression

#### Parameters:
- I added the **class_weight** parameter to counter the data imbalance issue. Setting it to **balanced** is, in effect, like replicating the smaller class until it has as many samples as the larger one, but done implicitly.
The argument class_weight='balanced' penalizes mistake on the minority class by an amount proportional to how under-represented it is.

Source: https://stackoverflow.com/questions/30972029/how-does-the-class-weight-parameter-in-scikit-learn-work


In [None]:
# Re-bind l_reg_1 to a Logistic Regression with balanced class weights, trained
# on the same vectorizer_1 features as before.
l_reg_1 = LogisticRegression(
    solver='liblinear',
    class_weight='balanced',
    random_state=0,
)
l_reg_1.fit(X=X_train_matrix_1, y=y_train)

#### Performance on training data

In [None]:
# Evaluate the class-weighted Logistic Regression on the training split.
X_cm = X_train_matrix_1
y_true_labels = y_train
model_1 = l_reg_1
cm_tick_labels = ['Home & Living', 'Wellness']

# Predictions of the fitted model on the data it was trained on.
y_pred_train_l = model_1.predict(X_cm)

print("Logistic Regression::\n")
print("Confusion matrix for training data:\n")
print(
    metrics.classification_report(
        y_true_labels,
        y_pred_train_l,
        target_names=cm_tick_labels,
    )
)

# Heatmap of the confusion counts: y-axis = true labels, x-axis = predictions.
cm = confusion_matrix(y_true_labels, y_pred_train_l)
plt.figure(figsize=(12, 6))
ax = plt.subplot()
sns.heatmap(cm, ax=ax, annot=True, fmt='g')
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(cm_tick_labels)
ax.yaxis.set_ticklabels(cm_tick_labels);

#### Performance on validation data

In [None]:
# Evaluate the class-weighted Logistic Regression on the validation split.
# `model_1` is the estimator bound in the preceding training-data cell.
X_cm = X_valid_matrix_1
y_true_labels = y_valid
cm_tick_labels = ['Home & Living', 'Wellness']

# Predictions on data the model has not been fitted on.
y_pred_val_l = model_1.predict(X_cm)

print("Logistic Regression::\n")
print("Confusion matrix for validation data:\n")
print(
    metrics.classification_report(
        y_true_labels,
        y_pred_val_l,
        target_names=cm_tick_labels,
    )
)

# Heatmap of the confusion counts: y-axis = true labels, x-axis = predictions.
cm = confusion_matrix(y_true_labels, y_pred_val_l)
plt.figure(figsize=(12, 6))
ax = plt.subplot()
sns.heatmap(cm, ax=ax, annot=True, fmt='g')
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(cm_tick_labels)
ax.yaxis.set_ticklabels(cm_tick_labels);

### Observations

- Adding the **class weight = 'balanced'** parameter improved the performance greatly, as shown above, and the **recall percentage increased** for both models in the classification report. Before this, **SVC** was performing better than the Logistic Regression model, but now both show similar f1-scores (differing by 1% for Home & Living). This means that adding the class weight has helped the models classify better than before, as it addresses the imbalance. The misclassifications of "Home and Living" in the confusion matrix reduced from 25 to 21.
- The **Logistic Regression** model improved more as compared to the SVC model. The **recall** probability score for "Home and Living" category rose to 94% from 83% and also without much compromising the precision value i.e. **false negatives** reduced from 48 to 16.
- I can say that the benchmark was achieved as I was able to get a better performance of the **Confusion Matrix** by increasing the recall score which indeed increase the f1- score for both the models.

### Saving both the models

In [None]:
# Persist the class-weighted SVC to disk in binary pickle format.
with open('svc_model.pkl', mode='wb') as f:
    pickle.dump(obj=svc_model_1, file=f)

In [None]:
# Persist the class-weighted Logistic Regression to disk in binary pickle
# format; it is loaded again later for the test-set evaluation.
with open('lr_model.pkl', mode='wb') as f:
    pickle.dump(obj=l_reg_1, file=f)

<span style="color:red">Step 12</span>

### Cross Validation on Train+Validation data

I am using the K-Fold technique because it is popular and easy to understand, and it generally results in a less biased model compared to other methods, as it ensures that every observation from the original dataset has the chance of appearing in both the training and the validation set.
This is one of the best approaches when the input data is limited.

In [None]:
# Stack the training and validation frames into one frame with a fresh
# 0..n-1 index (the original row labels are discarded).
train_val_data = pd.concat(
    [train_data, valid_data],
    ignore_index=True,
)

In [None]:
# Remove the prediction column that the error-analysis cells added to
# train_data / valid_data, so the combined frame holds no model output.
# The drop is done in place, so a second run of this cell would otherwise raise
# KeyError because the column is already gone; errors='ignore' keeps the cell
# safely re-runnable.
train_val_data.drop('category_pred', axis=1, inplace=True, errors='ignore')
train_val_data

In [None]:
# 10-fold splitter passed as `cv` to both cross-validation runs below (SVC and
# Logistic Regression). With 10 splits each fold trains on 9/10 of the rows.
# NOTE(review): shuffle is left at its default here — confirm the row order of
# train_val_data is not grouped by class, or consider a stratified splitter
# given the class imbalance.
folder = KFold(n_splits=10)

In [None]:
# Same TF-IDF settings as vectorizer_1, but fitted on train + validation.
vectorizer_2 = TfidfVectorizer(
    sublinear_tf=True,
    min_df=2,
    max_df=0.8,
)

# Learn the vocabulary/IDF from the combined descriptions and vectorize them
# in the same call.
# NOTE(review): the vectorizer sees every row before the K-fold split, so each
# fold's held-out rows contribute to the IDF statistics — confirm this is
# acceptable for the cross-validation estimate.
X_train_val_matrix = vectorizer_2.fit_transform(train_val_data['description'])

#### SVC: Cross Validation

In [None]:
# Unfitted SVC with the same settings as the tuned svc_model_1; cross_val_score
# fits it per fold.
svc = SVC(
    kernel="linear",
    C=2.0,
    class_weight='balanced',
    random_state=0,
)


In [None]:
# One score per fold for the SVC. No `scoring` is passed; the following cells
# treat the returned values as per-fold accuracy.
results = cross_val_score(
    estimator=svc,                      # model
    X=X_train_val_matrix,               # X data
    y=train_val_data['CategoryId'],     # labels (y)
    cv=folder,                          # the KFold object
)
results

In [None]:
# Tabulate the SVC fold scores: one row per fold, single column.
output = pd.DataFrame({"Accuracy_per_fold": results})
output

In [None]:
# Mean of the SVC fold scores, expressed as a percentage.
av_val = output["Accuracy_per_fold"].mean() * 100
print(
    "SVC: The average accuracy across folds is {}%".format(round(av_val, 2))
)

#### Logistic Regression: Cross Validation

In [None]:
# Unfitted Logistic Regression with the same settings as the tuned l_reg_1;
# cross_val_score fits it per fold.
l_reg_3 = LogisticRegression(
    solver='liblinear',
    class_weight='balanced',
    random_state=0,
)

In [None]:
# One score per fold for the Logistic Regression, using the same KFold object
# as the SVC run. No `scoring` is passed; the following cells treat the
# returned values as per-fold accuracy.
results_lr = cross_val_score(
    estimator=l_reg_3,                  # model
    X=X_train_val_matrix,               # X data
    y=train_val_data['CategoryId'],     # labels (y)
    cv=folder,                          # the KFold object
)
results_lr

In [None]:
# Tabulate the Logistic Regression fold scores (re-uses the name `output`,
# replacing the SVC table).
output = pd.DataFrame({"Accuracy_per_fold": results_lr})
output

In [None]:
# Mean of the Logistic Regression fold scores, expressed as a percentage.
av_val_1 = output["Accuracy_per_fold"].mean() * 100
print(
    "Logistic Regression: The average accuracy across folds is {}%".format(
        round(av_val_1, 2)
    )
)

<span style="color:red">Step 13</span>

### Observations

After performing cross-validation on both classifiers, I found that the logistic regression model performs slightly better than the SVC classifier. One of the SVC folds reaches only 93.9% accuracy, while none of the Logistic Regression fold accuracies fall more than 2 percent below its maximum fold accuracy.

Therefore, I will apply the **Logistic Regression** model on test data.

### Loading and processing test data

In [None]:
# Load the held-out test split from the working directory.
test_data = pd.read_csv('test.csv')

In [None]:
# Drop the 'Unnamed: 0' column read from the CSV
# (presumably a saved index column — TODO confirm).
# The drop is done in place, so a second run of this cell would otherwise raise
# KeyError because the column is already gone; errors='ignore' keeps the cell
# safely re-runnable.
test_data.drop('Unnamed: 0', axis=1, inplace=True, errors='ignore')

In [None]:
# Apply the same pre-processing steps to the test descriptions. Each step
# receives the output of the previous one, so the order matters.
test_data['description'] = (
    test_data['description']
    .apply(remove_digits)
    .apply(remove_tags)
    .apply(special_char)
    .apply(convert_lower)
    .apply(remove_stopwords)
    .apply(lemmatize_word)
)

In [None]:
# Separate the cleaned text (features) from the numeric category labels.
X_test = test_data['description']
y_test = test_data['CategoryId']

In [None]:
# Vectorize the test descriptions with vectorizer_1 (fitted on the training
# descriptions only). The pickled Logistic Regression loaded in the next cell
# was trained on features from this same vectorizer.
X_test_matrix = vectorizer_1.transform(X_test)

In [None]:
# Reload the Logistic Regression saved earlier in this notebook.
# NOTE: pickle.load can execute arbitrary code; only load pickle files that
# were produced by a trusted source such as this notebook.
with open('lr_model.pkl', mode='rb') as f:
    log_reg = pickle.load(file=f)

In [None]:
# Evaluate the reloaded Logistic Regression (trained on the training split
# only) on the held-out test split.
X_cm = X_test_matrix
y_true_labels = y_test
model = log_reg
cm_tick_labels = ['Home & Living', 'Wellness']

# Predictions on the test features produced by vectorizer_1.
y_pred_test = model.predict(X_cm)

print("Logistic Regression ::\n")
print("Confusion matrix for test data:\n")
print(
    metrics.classification_report(
        y_true_labels,
        y_pred_test,
        target_names=cm_tick_labels,
    )
)

# Heatmap of the confusion counts: y-axis = true labels, x-axis = predictions.
cm = confusion_matrix(y_true_labels, y_pred_test)
plt.figure(figsize=(12, 6))
ax = plt.subplot()
sns.heatmap(cm, ax=ax, annot=True, fmt='g')
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(cm_tick_labels)
ax.yaxis.set_ticklabels(cm_tick_labels);

### Observations: Validation v/s Test Performance

The test data results such as precision, f1-score and accuracy were similar to those on the validation data, with some slight variation: scores on the validation data were slightly better than on the test data. For instance, where accuracy on validation is 97%, on test it is 96%, and the same holds for precision and f1-scores.

However, if I consider the **false negatives** in the confusion matrix (28 data points are not predicted as Home & Living even though they are), the picture is quite different. As shown above, 16 data points of the Home & Living category were misclassified as Wellness for the validation data, whereas 28 data points were misclassified as Wellness for the test data. Therefore, the model showed a **better Recall** score on the **validation dataset as compared to the test dataset**. However, the cumulative accuracies and f1-scores don't vary much.

I think it's because the model's hyperparameters were tuned specifically for the validation dataset. Overall, I will consider them as good results.

<span style="color:red">Step 14</span>

### Re-training LR model on Train+Validation data

In [None]:
# Final Logistic Regression: same settings as the tuned model, but trained on
# the combined train + validation features from vectorizer_2.
l_reg_final = LogisticRegression(
    solver='liblinear',
    class_weight='balanced',
    random_state=0,
)
l_reg_final.fit(X=X_train_val_matrix, y=train_val_data['CategoryId'])

In [None]:
# Vectorize the test descriptions with vectorizer_2 (fitted on train +
# validation), the same vectorizer that produced l_reg_final's training features.
X_test_matrix_2 = vectorizer_2.transform(test_data['description'])

In [None]:
# Evaluate the Logistic Regression re-trained on train + validation data, on
# the held-out test split.
X_cm = X_test_matrix_2
y_true_labels = y_test
model = l_reg_final
cm_tick_labels = ['Home & Living', 'Wellness']

# Predictions on the test features produced by vectorizer_2.
y_pred_test = model.predict(X_cm)

print("Logistic Regression ::\n")
print("Confusion matrix for test data:\n")
print(
    metrics.classification_report(
        y_true_labels,
        y_pred_test,
        target_names=cm_tick_labels,
    )
)

# Heatmap of the confusion counts: y-axis = true labels, x-axis = predictions.
cm = confusion_matrix(y_true_labels, y_pred_test)
plt.figure(figsize=(12, 6))
ax = plt.subplot()
sns.heatmap(cm, ax=ax, annot=True, fmt='g', cmap="Blues")
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(cm_tick_labels)
ax.yaxis.set_ticklabels(cm_tick_labels);

### Observations

I didn't find much difference in the classification report metrics for test data evaluations when tested on model fitted with **train data vs train+validation** data. The only changes which I can notice is that **Recall** score for "Home & Living" has increased when tested against train+validation data. Earlier it was 91% and now it's 92%.

Moreover, in the confusion matrix you can see that after using more data, the model correctly classifies 3 more records of the minority class (Home&Living) without affecting the majority class (Wellness). Hence the **increased recall** score for the Home & Living category.

### Let's predict category of new articles

In [None]:
# Home & Living example: https://www.huffpost.com/entry/how-to-make-home-fancy-hotel_l_62b21aa0e4b06594c1dc6306
# Headline + Short Description

example = "How To Make Your Home Feel Like A Fancy Hotel Want to re-create that relaxing vacation environment back home? Follow these expert-backed tips."

# Clean the raw text with the same steps, in the same order, that are applied
# to the 'description' column of the test data before vectorizing. Lower-casing
# alone would leave digits, punctuation, stop-words and un-lemmatized word forms
# in the text, so many tokens would not match the vocabulary the vectorizer
# learned from the cleaned descriptions.
example = remove_digits(example)
example = remove_tags(example)
example = special_char(example)
example = convert_lower(example)
example = remove_stopwords(example)
example = lemmatize_word(example)

In [None]:
# Vectorize the single example (transform expects an iterable of documents,
# hence the one-element list), then convert the result to a dense array.
my_vec = vectorizer_2.transform([example])
text_arr = my_vec.toarray()

In [None]:
# Predict the class of the first example article. Class 0 = Home & Living,
# which is the correct category for this article.

l_reg_final.predict(text_arr)

In [None]:
# Wellness category example: https://www.huffpost.com/entry/habits-causing-back-pain_l_62c864ece4b0d740198339e8
# Headline + Short Description

example_2 = "10 Mindless Habits That May Be Causing You Back Pain If your upper, middle or lower back aches, these behaviors might be the culprit."

# Clean the raw text with the same steps, in the same order, that are applied
# to the 'description' column of the test data before vectorizing. Lower-casing
# alone would leave digits, punctuation, stop-words and un-lemmatized word forms
# in the text, so many tokens would not match the vocabulary the vectorizer
# learned from the cleaned descriptions.
example_2 = remove_digits(example_2)
example_2 = remove_tags(example_2)
example_2 = special_char(example_2)
example_2 = convert_lower(example_2)
example_2 = remove_stopwords(example_2)
example_2 = lemmatize_word(example_2)

In [None]:
# Vectorize the second example (transform expects an iterable of documents,
# hence the one-element list), then convert the result to a dense array.
my_vec = vectorizer_2.transform([example_2])
text_arr_2 = my_vec.toarray()

In [None]:
# Predict the class of the second example article. Class 1 = Wellness,
# which is the correct category for this article.

l_reg_final.predict(text_arr_2)