# Model inference: context-base SVM sentiment predictions on the Maharlika Fund comments

In [24]:
import joblib
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
from classes import preprocess

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Saved artifacts follow the naming scheme
#   '<folder_name>/<model>_<kind>_context_base_<df_file>.pkl'
# where <kind> is 'model', 'vectorizer' or 'label_encoder'.
folder_name = 'context base model v4/svm'
model = 'svm'
df_file = 'model'

# Numeric sentiment code -> sentiment name.
sentiment_dict = {
    0: 'negative',
    1: 'neutral',
    2: 'positive',
}


def _artifact_path(kind):
    """Build the path of one saved artifact.

    Parameters
    ----------
    kind : str
        Middle part of the file name: 'model', 'vectorizer' or
        'label_encoder'.

    Returns
    -------
    str
        Path assembled from ``folder_name``, ``model`` and ``df_file``.
    """
    return '{}/{}_{}_context_base_{}.pkl'.format(folder_name, model, kind, df_file)


# ---------------------------------------------------------------------------
# Preprocess the input text
# ---------------------------------------------------------------------------
# `preprocess` is a project class; its exact cleaning steps are not visible
# here. The code below only relies on it exposing a DataFrame as `.df` with a
# 'features_string_format' column.
maharlika = preprocess(df='raw data/Maharlika Fund.csv', added_stopwords='filipino_stopwords.csv', comment_column='Comment', dups=1)

x_test = maharlika.df['features_string_format']

# ---------------------------------------------------------------------------
# Load the persisted model, vectorizer and label encoder
# ---------------------------------------------------------------------------
# NOTE(review): joblib.load unpickles its input, which can execute arbitrary
# code. Only load artifacts that were produced by a trusted training run.
svm_model = joblib.load(_artifact_path('model'))
vectorizer = joblib.load(_artifact_path('vectorizer'))
label_encoder = joblib.load(_artifact_path('label_encoder'))

# ---------------------------------------------------------------------------
# Vectorize, predict, decode
# ---------------------------------------------------------------------------
# Use transform (not fit_transform) so the saved vectorizer is applied as-is.
xtest_vectors = vectorizer.transform(x_test)

# Cast to int because the label encoder is inverted on integer class indices.
predictions = svm_model.predict(xtest_vectors).astype(int)

# Map encoded class indices back to the labels the encoder was fitted on.
decoded_predictions = label_encoder.inverse_transform(predictions)

# Store the decoded (still numeric) labels first, so that they remain
# available for inspection if the validation below fails.
maharlika.df['context_base_prediction'] = decoded_predictions

# Translate numeric codes into sentiment names. Series.map yields NaN for any
# value that is not a key of the dict, so check for that explicitly instead of
# letting unmapped rows silently vanish from later value_counts() calls.
predicted_sentiments = maharlika.df['context_base_prediction'].map(sentiment_dict)
unmapped = predicted_sentiments.isna()
if unmapped.any():
    unexpected = maharlika.df.loc[unmapped, 'context_base_prediction'].unique().tolist()
    raise ValueError(
        'Decoded predictions contain labels missing from sentiment_dict: {}'.format(unexpected)
    )

maharlika.df['context_base_prediction'] = predicted_sentiments

Column Headers:  ['Comment', 'Comment Validation', 'Tally for total']
shape of raw dataframe:  (531, 3)
shape of dataframe when null comments were dropped:  (531, 5)
shape of dataframe when preprocessed and duplicated values where dropped:  (492, 5)
shape of final dataframe when rows that have null values where dropped:  (491, 5)


In [25]:
# Preview up to the first 20 rows of the preprocessed frame, including the
# 'context_base_prediction' column added by the inference cell.
maharlika.df.head(20)

Unnamed: 0,Comment,Comment Validation,Tally for total,features,features_string_format,context_base_prediction
0,hi sen pia sna po pauwiin nyu n si robin wla n...,Negative,,"[pia, sna, pauwiin, nyu, robin, wla, namn, ata...",pia sna pauwiin nyu robin wla namn ata ngagawa...,negative
1,Lage Naman,Negative,,[lage],lage,positive
2,"Correct ka jan, kasalanan ni bbm na Naman hahaha",Negative,,"[correct, jan, kasalanan, bbm]",correct jan kasalanan bbm,negative
3,kasalanan to ni PBBM,Negative,,"[kasalanan, pbbm]",kasalanan pbbm,negative
4,huh kwento mo sa pagong,Negative,,"[huh, kwento, pagong]",huh kwento pagong,neutral
5,d puro project lang yarn,Negative,,"[puro, project, yarn]",puro project yarn,negative
6,Sus puro pasikat lang,Negative,,"[sus, puro, pasikat]",sus puro pasikat,negative
7,dont even get me started on robin padilla,Negative,,"[dont, even, get, started, robin, padilla]",dont even get started robin padilla,negative
8,Puro kase dilawan ang nasa Senado,Negative,,"[puro, kase, dilawan, nasa, senado]",puro kase dilawan nasa senado,negative
9,reject the maharlika funds,Negative,,"[reject, maharlika, funds]",reject maharlika funds,negative


In [27]:
# Count how often each value occurs in the 'Comment Validation' column.
# NOTE(review): presumably these are the human-assigned sentiment labels —
# confirm. Their values are capitalized ('Negative'), unlike the lowercase
# predicted labels, so normalize case before comparing the two columns.
maharlika.df['Comment Validation'].value_counts()

Neutral     419
Positive     37
Negative     35
Name: Comment Validation, dtype: int64

In [28]:
# Count how often each predicted sentiment occurs in 'context_base_prediction'.
# value_counts() excludes NaN by default, so rows without a mapped sentiment
# would not appear in this tally.
maharlika.df['context_base_prediction'].value_counts()

negative    296
positive    172
neutral      23
Name: context_base_prediction, dtype: int64

In [None]:
# 2 stands for positive, 1 for neutral, 0 for negative