# trying different zero shot methods and different classification methods afterwards

2. Improving sentiment analysis 
  - add various zero shot methods + various after model (logistic regression, SVM etc) 
  - create a service with the above for easier integration with existing solution 
  - develop model performance visualisation 
    - design in figma
    - implement the design 
  - create a model download page 
     - create in figma 
     - implement the design 

In [4]:
import pandas as pd
from transformers import pipeline

def sentiment_analysis(df, text_column, model_name):
    classifier = pipeline("zero-shot-classification", model=model_name)
    labels = ["positive", "neutral", "negative"]

    def classify_text(text):
        if pd.isna(text):
            return {"labels": [None], "scores": [None]}
        return classifier(text, labels)

    results = df[text_column].apply(classify_text)
    df['predicted_label'] = results.apply(lambda x: x['labels'][0])
    df['scores'] = results.apply(lambda x: x['scores'][0])

    return df

df = pd.DataFrame({
    'Review Text': ["I love the new design of your website!", "The service was okay, nothing special.", "I am not happy with the product quality.", None]
})

# DeBERTa
df_deberta = sentiment_analysis(df, 'Review Text', 'microsoft/deberta-large-mnli')
print("Results using DeBERTa:")
print(df_deberta)

# BART
df_bart = sentiment_analysis(df, 'Review Text', 'facebook/bart-large-mnli')
print("\nResults using BART:")
print(df_bart)

# Ernie
df_ernie = sentiment_analysis(df, 'Review Text', 'MoritzLaurer/ernie-m-large-mnli-xnli')
print("\nResults using ernie:")
print(df_ernie)

RuntimeError: Failed to import transformers.pipelines because of the following error (look up to see its traceback):
partially initialized module 'torchvision' has no attribute 'extension' (most likely due to a circular import)

In [None]:
cloth_df = pd.read_csv('/Users/sriyan/Documents/techjam/final-sentiment-analysis/data/unlabelled_clothes_reviews.csv')
cloth_df.head()

Unnamed: 0.1,Unnamed: 0,Review Text
0,0,Absolutely wonderful - silky and sexy and comf...
1,1,Love this dress! it's sooo pretty. i happene...
2,2,I had such high hopes for this dress and reall...
3,3,"I love, love, love this jumpsuit. it's fun, fl..."
4,4,This shirt is very flattering to all due to th...


In [None]:
cloth_df.drop(columns=['Unnamed: 0'], inplace=True)
cloth_df.head()

Unnamed: 0,Review Text
0,Absolutely wonderful - silky and sexy and comf...
1,Love this dress! it's sooo pretty. i happene...
2,I had such high hopes for this dress and reall...
3,"I love, love, love this jumpsuit. it's fun, fl..."
4,This shirt is very flattering to all due to th...


In [None]:
# using only Deberta for now 
cloth_df_label = sentiment_analysis(cloth_df, 'Review Text', 'microsoft/deberta-large-mnli')
cloth_df_label.head()

Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


KeyboardInterrupt: 

In [47]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np
import scipy.stats as stats

# Extended sample dataframe with text data
df_deberta = pd.DataFrame({
    'Review Text': [
        "I love the new design of your website!",
        "The service was okay, nothing special.",
        "I am not happy with the product quality.",
        "The delivery was fast and the packaging was great.",
        "Customer support was not helpful.",
        "Amazing experience, will definitely come back!",
        "The price is too high for the value provided.",
        "The user interface is very intuitive and easy to use.",
        "I had a bad experience with the return process.",
        "The features offered are exactly what I needed.",
        "The quality of the product exceeded my expectations.",
        "Terrible customer service, very disappointed.",
        "The app crashes frequently, needs improvement.",
        "Good value for money.",
        "The staff were friendly and accommodating.",
        "The product arrived damaged.",
        "I love the color options available.",
        "The software is full of bugs.",
        "Fast and efficient service.",
        "The warranty policy is not clear.",
        "Very satisfied with the purchase.",
        "The website is slow and unresponsive.",
        "Excellent customer service!",
        "The product stopped working after a week.",
        "I am very pleased with the overall experience."
    ],
    'predicted_label': [
        "positive", "neutral", "negative", "positive", "negative",
        "positive", "negative", "positive", "negative", "positive",
        "positive", "negative", "negative", "positive", "positive",
        "negative", "positive", "negative", "positive", "negative",
        "positive", "negative", "positive", "negative", "positive"
    ]
})

# Preprocess the text data to remove empty texts
df_deberta['Review Text'] = df_deberta['Review Text'].fillna('')
df_deberta = df_deberta[df_deberta['Review Text'].str.strip() != '']

# Ensure the correct column name
text_column = 'Review Text'

# Sample initial labeled dataset
initial_labeled_data = df_deberta.sample(frac=0.2, random_state=42)
unlabeled_data = df_deberta.drop(initial_labeled_data.index)

# Vectorize text data
vectorizer = TfidfVectorizer(stop_words='english')
X_initial = vectorizer.fit_transform(initial_labeled_data[text_column]).toarray()
y_initial = initial_labeled_data['predicted_label']

# Train initial logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_initial, y_initial)

# Function to calculate entropy
def calculate_entropy(probs):
    return stats.entropy(probs, axis=1)

# Active learning cycle
num_cycles = 5
samples_per_cycle = 3

for cycle in range(num_cycles):
    # Check if there are enough samples in unlabeled_data
    if len(unlabeled_data) == 0:
        print("No more samples to label.")
        break

    # Vectorize the unlabeled data
    X_unlabeled = vectorizer.transform(unlabeled_data[text_column]).toarray()
    
    # Predict probabilities for the unlabeled data
    probs = model.predict_proba(X_unlabeled)
    
    # Calculate confidence and entropy
    confidence = np.max(probs, axis=1)
    entropy = calculate_entropy(probs)
    
    # Select samples with lowest confidence or highest entropy
    selection_indices = np.argsort(confidence)[:samples_per_cycle]
    # selection_indices = np.argsort(entropy)[-samples_per_cycle:]
    
    # Add selected samples to labeled dataset
    selected_samples = unlabeled_data.iloc[selection_indices]
    X_selected = vectorizer.transform(selected_samples[text_column]).toarray()
    y_selected = selected_samples['predicted_label']
    
    # Remove selected samples from unlabeled dataset
    unlabeled_data = unlabeled_data.drop(selected_samples.index)
    
    # Update training data
    X_initial = np.vstack([X_initial, X_selected])
    y_initial = np.hstack([y_initial, y_selected])
    
    # Retrain the model with the updated training data
    model.fit(X_initial, y_initial)
    
    # Optionally, evaluate the model
    accuracy = accuracy_score(y_initial, model.predict(X_initial))
    print(f"Cycle {cycle + 1}/{num_cycles} - Training Accuracy: {accuracy:.4f}")

# Final model evaluation
print("Final model training complete.")


Cycle 1/5 - Training Accuracy: 0.7500
Cycle 2/5 - Training Accuracy: 0.7273
Cycle 3/5 - Training Accuracy: 0.7857
Cycle 4/5 - Training Accuracy: 0.7059
Cycle 5/5 - Training Accuracy: 0.7500
Final model training complete.
