Model Playground

In [20]:
# Imports (the saved models themselves are loaded in the next cell)
import pickle
from sklearn.feature_extraction.text import CountVectorizer

In [30]:
# Load the two pickled classifier pipelines from disk.
# SECURITY: pickle.load can execute arbitrary code while deserializing, so
# only load .pkl files that come from a trusted source.
def _load_pickled_pipeline(path):
    """Unpickle and return the object stored in the file at ``path``.

    The file is opened in binary mode and closed again before returning.
    Errors from ``open`` or ``pickle.load`` (for example a missing file)
    propagate to the caller unchanged.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)


cus_pipeline = _load_pickled_pipeline('results/cus_sw_nb_classifier.pkl')
us_pipeline = _load_pickled_pipeline('results/us_nb_classifier.pkl')

In [47]:
import pandas as pd

def weighted_prediction(input_data, input_type='string', column_name='review',
                        weights=(0.6, 0.4),
                        output_path='output_with_predictions.csv',
                        pipelines=None):
    """Label text as "negative"/"positive" from a weighted blend of models.

    Each model's ``predict_proba`` output is multiplied by its weight, the
    weighted arrays are summed, and the column with the highest blended
    score decides the label of each text.

    Parameters
    ----------
    input_data : str or pandas.DataFrame
        The text itself (``input_type='string'``), a path readable by
        ``pd.read_csv`` (``'csv'``), or a DataFrame (``'dataframe'``).
    input_type : {'string', 'csv', 'dataframe'}
        How ``input_data`` is interpreted.
    column_name : str
        Column holding the text for the ``'csv'`` and ``'dataframe'`` types.
    weights : sequence of float
        One weight per pipeline, in the same order as ``pipelines``.
        Defaults to ``(0.6, 0.4)``.
    output_path : str
        Where the ``'csv'`` type writes the input rows plus a
        ``'predictions'`` column (written without the index).
    pipelines : sequence, optional
        Objects exposing ``predict_proba(texts)``. When omitted, the
        notebook-level ``cus_pipeline`` and ``us_pipeline`` are used.

    Returns
    -------
    list of str or pandas.DataFrame
        For ``'string'`` and ``'csv'``: the list of labels. For
        ``'dataframe'``: a copy of ``input_data`` with a ``'predictions'``
        column added; the caller's DataFrame is left untouched.

    Raises
    ------
    ValueError
        If ``input_type`` is unknown, no pipelines are given, or the number
        of weights does not match the number of pipelines.
    """
    # Mapping from probability-column index to string label.
    # NOTE(review): assumes every pipeline orders its probability columns as
    # class 0 ("negative") then class 1 ("positive") -- confirm against the
    # fitted models' classes_.
    mapping = {0: "negative", 1: "positive"}

    # Normalise the supported input kinds into a list of texts, keeping the
    # DataFrame (if any) that the predictions should be attached to.
    if input_type == 'string':
        frame = None
        texts = [input_data]
    elif input_type == 'csv':
        frame = pd.read_csv(input_data)
        texts = frame[column_name].tolist()
    elif input_type == 'dataframe':
        # Work on a copy so the caller's DataFrame is not modified in place.
        frame = input_data.copy()
        texts = frame[column_name].tolist()
    else:
        raise ValueError(f"Unknown input_type: {input_type}")

    if pipelines is None:
        # Looked up at call time so the function can be defined before the
        # notebook-level models are loaded.
        pipelines = (cus_pipeline, us_pipeline)
    pipelines = tuple(pipelines)
    weights = tuple(weights)
    if not pipelines:
        raise ValueError("At least one pipeline is required")
    if len(weights) != len(pipelines):
        raise ValueError(
            f"Expected {len(pipelines)} weights, got {len(weights)}")

    if texts:
        # Blend the per-class probabilities of all models.
        weighted_proba = sum(
            weight * pipeline.predict_proba(texts)
            for weight, pipeline in zip(weights, pipelines)
        )
        # The highest blended score picks the class index for each text.
        string_predictions = [
            mapping[int(index)] for index in weighted_proba.argmax(axis=1)
        ]
    else:
        # Nothing to classify (e.g. an empty DataFrame): skip the models
        # instead of handing them zero samples.
        string_predictions = []

    if input_type == 'csv':
        # Persist the rows together with their labels; the list of labels is
        # still what gets returned.
        frame['predictions'] = string_predictions
        frame.to_csv(output_path, index=False)
    elif input_type == 'dataframe':
        frame['predictions'] = string_predictions
        return frame

    return string_predictions


In [57]:
# Predict the label for a single review string.
# weighted_prediction returns a list, so the single label is element 0.
prediction = weighted_prediction("this is a great product")
print(prediction[0])


positive


In [50]:
# Load the sample reviews from CSV and display them
df = pd.read_csv('data/test.csv')
df

Unnamed: 0,review
0,worst produt is waste of money
1,"good deal , nice product"
2,make my skin smooth
3,long lasting fragrance
4,quick delivery thank you
5,"rude delivery agent, not staified with the exp..."


In [59]:
# Predict with a DataFrame: the result is a DataFrame with a 'predictions' column
result_df = weighted_prediction(df, input_type='dataframe')
print(result_df)

                                              review predictions
0                     worst produt is waste of money    negative
1                           good deal , nice product    positive
2                                make my skin smooth    positive
3                             long lasting fragrance    positive
4                          quick delivery thank you     positive
5  rude delivery agent, not staified with the exp...    negative


In [55]:
# Predict with a CSV path: returns the list of labels and also writes the
# rows plus a 'predictions' column to 'output_with_predictions.csv'
print(weighted_prediction("data/test.csv", input_type='csv'))

['negative', 'positive', 'positive', 'positive', 'positive', 'negative']
