In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

In [None]:
from tqdm import tqdm
from transformers import pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [None]:
df = pd.read_csv('/content/IMDB Dataset.csv')

In [None]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [None]:
df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [None]:
df['label'] = (df['sentiment'] == 'positive').astype(int)

In [None]:
sentiment_pipeline = pipeline("sentiment-analysis", device=0)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Device set to use cuda:0


In [None]:
preds = []
for text in tqdm(df['review'].tolist()):
    result = sentiment_pipeline(text[:512])[0]
    preds.append(1 if result['label'] == 'POSITIVE' else 0)

  0%|          | 10/50000 [00:00<45:38, 18.26it/s] You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 50000/50000 [04:40<00:00, 178.52it/s]


In [None]:
y_true = df['label'].tolist()
accuracy = accuracy_score(y_true, preds)
f1 = f1_score(y_true, preds)

In [None]:
print(f"Accuracy: {accuracy:.4f}")
print(f"F1-score: {f1:.4f}")
print("\nClassification Report:")
print(classification_report(y_true, preds, target_names=['Negative', 'Positive']))

Accuracy: 0.8275
F1-score: 0.8228

Classification Report:
              precision    recall  f1-score   support

    Negative       0.81      0.85      0.83     25000
    Positive       0.85      0.80      0.82     25000

    accuracy                           0.83     50000
   macro avg       0.83      0.83      0.83     50000
weighted avg       0.83      0.83      0.83     50000



In [None]:
test_texts = [
    "I love this movie so much!",
    "I do not like this at all",
    "This is the worst film I've ever seen",
    "Absolutely fantastic and thrilling!"
]

for text in test_texts:
    result = sentiment_pipeline(text)[0]
    print(text, " --> ", result)

I love this movie so much!  -->  {'label': 'POSITIVE', 'score': 0.9998809099197388}
I do not like this at all  -->  {'label': 'NEGATIVE', 'score': 0.9968847632408142}
This is the worst film I've ever seen  -->  {'label': 'NEGATIVE', 'score': 0.9997496008872986}
Absolutely fantastic and thrilling!  -->  {'label': 'POSITIVE', 'score': 0.9998816251754761}


In [None]:
import gradio as gr
from transformers import pipeline

In [None]:
sentiment_pipeline = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0


In [None]:
def predict_sentiment(text):
    result = sentiment_pipeline(text)[0]
    return f"{result['label']}"

In [None]:
with gr.Blocks(theme=gr.themes.Soft()) as interface:
    gr.Markdown(
        """
        <h1 style='text-align: center;'>Sentiment Analysis App</h1>
        <p style='text-align: center;'>Analyze the sentiment of any review or short text. The model will classify it as <strong>Positive</strong> or <strong>Negative</strong>.</p>
        """,
    )

    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                label="Enter your text",
                placeholder="Type your review here...",
                lines=4
            )
            submit_btn = gr.Button("Analyze")

        with gr.Column():
            output_label = gr.Textbox(
                label="Prediction",
                interactive=False
            )

    submit_btn.click(predict_sentiment, inputs=input_text, outputs=output_label)

interface.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://416a86cdfcc645035c.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


