# Test Your Model

In [1]:
import pickle
import pandas as pd
import re, string, nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report 
# reload your model and use it to make predictions for test text
# you should adjust the code so as to load to your saved model/components
def test_trained_model(model_path, test_text):
    saved_model_dic = pickle.load(open(model_path,"rb"))
    saved_clf = saved_model_dic['model']
    saved_vectorizer = saved_model_dic['vectorizer']
    print(len(saved_vectorizer.vocabulary))
    new_test_vecs = saved_vectorizer.fit_transform(test_text)
    return saved_clf.predict(new_test_vecs)

# Preprocessing functions

As like the training steps, I have created a preprocessing function that will do the same stuff as the training steps with a pattern and stripping whitespace with converting to lowercase and replacing opostrophes with punctatuation and stopword removal.

In [2]:
def text_preprocessor(data):
    
    data_copy = data
    
    pattern = '<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});'
    data = [re.sub(re.compile(pattern), '', new_line) for new_line in data_copy]

    # strip whitespace and converting to lower case
    lower_case = [new_line.strip().lower() for new_line in data_copy]
    
    # Replace apostrophes with words
    processed = []
    for new_line in lower_case:
        new_line = new_line.replace("-", " ")
        words = [negate_handle[new_word] if new_word in negate_handle else new_word for new_word in new_line.split()]
        processed.append(" ".join(words))
        
    #remove punctuation
    processed = [new_line.translate(str.maketrans('', '', string.punctuation)) for new_line in processed] 
    
    # removing stopwords
    stops = set(stopwords.words("english"))
    processed = [" ".join([new_word for new_word in new_line.split() if new_word not in stops]) for new_line in processed]
    
    return processed

def stem_lemmatize(data):
    
    data_copy = data
    
    # lemmatization
    lemmatizer=WordNetLemmatizer()
    processed = [" ".join([lemmatizer.lemmatize(new_word) for new_word in new_line.split()]) for new_line in data_copy]
    
    return processed

# Getting data to work with 

Basically, when I created the training algorithm and saved it, I have saved the classifier and the handle that I worked with the feature names and everything needed so in this step I am retrieving the information with transforming the sentiment column to 1 and 0. After that, getting the saved classifier with the handle and transforming the data text to list and getting 5000 data from it

In [3]:
# load sample test data
test_data = pd.read_csv('coursework1_train.csv')
if('Unnamed: 0' in test_data.columns):
    del test_data['Unnamed: 0']
    
test_data['sentiment'] = test_data.sentiment.map(lambda x: int(1) if x =='pos' else int(0))

# preprocessing unseen data
with open("sample_trained_model.pickle", "rb") as f:
    saved_file_comp = pickle.load(f)
    
saved_clf = saved_file_comp['model']
negate_handle = saved_file_comp["negate_handle"]
feature = saved_file_comp["feature_names"]
data_processed = text_preprocessor(test_data['text'].tolist())
data_processed = stem_lemmatize(data_processed)
test_data['text'] = data_processed
    
test_text = test_data['text'].tolist()[-5000:]
y_test = test_data['sentiment'].tolist()[-5000:]

print('the size of the data is:', len(y_test))

the size of the data is: 5000


# Testing the model

in the final step, I have tested the model and getting the accuracy of it

In [4]:
from sklearn.metrics import accuracy_score
pred_for_new_test = test_trained_model("sample_trained_model.pickle", test_text)
acc = accuracy_score(y_test, pred_for_new_test)
print('the accuracy is:', acc)

# confusion matrix
print(confusion_matrix(y_test, pred_for_new_test))

# classification_report
print(classification_report(y_test, pred_for_new_test))

5000
the accuracy is: 0.8458
[[2052  438]
 [ 333 2177]]
              precision    recall  f1-score   support

           0       0.86      0.82      0.84      2490
           1       0.83      0.87      0.85      2510

    accuracy                           0.85      5000
   macro avg       0.85      0.85      0.85      5000
weighted avg       0.85      0.85      0.85      5000

