In [1]:
import pandas as pd
import re
import torch
from textblob import TextBlob
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW



In [2]:
# Load Data
email_data = pd.read_csv("/content/Gmail export.csv")
print(email_data.head())


                 ID            Thread                                   Date  \
0  18ad23ced22eecb4  18ad23ced22eecb4        Tue, 26 Sep 2023 16:05:22 +0000   
1  18ad239a7220b8b2  18ad239a7220b8b2          Tue, 26 Sep 2023 16:01:46 GMT   
2  18ad21dcd5f30755  18ad1afc743ff47b  Tue, 26 Sep 2023 21:01:21 +0530 (IST)   
3  18ad1e48122b4e59  18ad1e48122b4e59        Tue, 26 Sep 2023 14:28:47 +0000   
4  18ad1dbf2bb5bddb  18ad106f967770d3        Tue, 26 Sep 2023 07:19:26 -0700   

                                            From  \
0     Mary from Mailmeteor <mary@mailmeteor.com>   
1          Google <no-reply@accounts.google.com>   
2                          no-reply@mail.1mg.com   
3                   reverselogistics-jpr@1mg.com   
4  noreply-apps-scripts-notifications@google.com   

                                                  To  \
0                               rajat.tiwari@1mg.com   
1                               rajat.tiwari@1mg.com   
2                               rajat.

In [3]:
# Data Cleaning and Preprocessing
def clean_text(data):
    stopwords_list = [
        'the', 'to', 'is', 'Hi', 'for', '\u200c', 'Please', 'of', 'at', 'and', 'On', ',', '͏',
        'you', '2023', 'a', 'by', 'wrote:', 'Team,', 'your', '-', 'this', 'PM', 'in', 'with',
        'This', 'Dear', 'Name', ':', 'on', 'have', 'has', '&amp;', 'PRM', 'details', 'Google',
        'our', 'We', 'Code', 'Kumar', 'mail', 'Singh', 'Regards,', 'The', 'as', 'are', 'Item',
        '2022', 'script,', 'Store', 'initiated', '–', 'Rajat', 'number', 'from', '1mg', 'find',
        'All,', 'please', 'will', 'been', 'Your', 'we', 'that', 'invited', 'Team', 'Anubhav',
        'Tiwari', 'Join', 'Malik', 'be', 'us', 'Mon,', 'an', 'team', 'Thanks', 'or', 'attached',
        'below', 'You', 'shared', 'update', 'Meet', 'following', 'join', 'AM', 'not', 'A', 'very',
        'Happy', 'Wed,', 'To', 'For', 'Sep', 'below.', 'share', 'colleagues', 'Reason', 'wishing', 'India'
    ]

    if isinstance(data, str):
        data = data.lower()
        data = re.sub('re:', '', data)
        data = re.sub('-', '', data)
        data = re.sub('_', '', data)
        data = re.sub(r'\[[^]]*\]', '', data)
        data = re.sub(r'[^a-z\s]', '', data)
        data = ' '.join([word for word in data.split() if word not in stopwords_list])
        data = data.strip()
        return data
    return 'No Content'


In [4]:
email_data['cleaned_snippet'] = email_data['Snippet'].apply(clean_text)

# Sentiment Analysis using TextBlob
def get_sentiment(text):
    analysis = TextBlob(text)
    if analysis.sentiment.polarity > 0:
        return 'Positive'
    elif analysis.sentiment.polarity < 0:
        return 'Negative'
    else:
        return 'Neutral'

email_data['sentiment'] = email_data['cleaned_snippet'].apply(get_sentiment)


In [5]:
# BERT Tokenization and Data Preparation
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

input_ids = []
attention_masks = []

for snippet in email_data['cleaned_snippet']:
    encoded_data = tokenizer.encode_plus(
        snippet,
        add_special_tokens=True,
        max_length=256,
        truncation=True,
        padding='max_length',
        return_attention_mask=True
    )
    input_ids.append(encoded_data['input_ids'])
    attention_masks.append(encoded_data['attention_mask'])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [6]:
input_ids = torch.tensor(input_ids)
attention_masks = torch.tensor(attention_masks)
labels = torch.tensor(email_data['sentiment'].replace({'Positive': 2, 'Neutral': 1, 'Negative': 0}).values)

dataset = TensorDataset(input_ids, attention_masks, labels)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False)


In [7]:


# Check GPU availability
if torch.cuda.is_available():
    device = 'cuda'
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    device = 'cpu'
    print("Using CPU")

# Modeling
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
    output_attentions=False,
    output_hidden_states=False
)
model.to(device)


Using GPU: Tesla T4


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [8]:
# Training & Evaluation
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
loss_fn = torch.nn.CrossEntropyLoss().to(device)

epochs = 3
for epoch in range(epochs):
    model.train()
    for step, batch in enumerate(train_dataloader):
        input_ids, attention_masks, labels = tuple(t.to(device) for t in batch)
        model.zero_grad()
        outputs = model(input_ids, attention_mask=attention_masks)
        logits = outputs[0]
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/{epochs} Finished")

print("Training Complete!")




Epoch 1/3 Finished
Epoch 2/3 Finished
Epoch 3/3 Finished
Training Complete!


In [9]:



def predict_sentiment(email_snippet):

    cleaned_snippet = clean_text(email_snippet)

    # Tokenize the cleaned snippet
    encoded_data = tokenizer.encode_plus(
        cleaned_snippet,
        add_special_tokens=True,
        max_length=256,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'  # Return PyTorch tensors
    )

    # Get input data
    input_ids = encoded_data['input_ids'].to(device)
    attention_masks = encoded_data['attention_mask'].to(device)

    # Put model in evaluation mode
    model.eval()

    # Predict
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_masks)

    # Get the predicted label
    logits = outputs[0]
    predicted_label = torch.argmax(logits, dim=1).item()

    # Convert label to sentiment
    sentiment_mapping = {2: 'Positive', 1: 'Neutral', 0: 'Negative'}
    sentiment = sentiment_mapping[predicted_label]

    return sentiment


In [14]:
#function
email_snippet = "I hate this product its really mis leading and waste of money"
predicted_sentiment = predict_sentiment(email_snippet)
print(predicted_sentiment)


Negative


In [22]:
#function
email_snippet = "I hate this product its really mis leading and waste of money"
predicted_sentiment = predict_sentiment(email_snippet)
print(predicted_sentiment)


def generate_reply(email_snippet):
    # Predict the sentiment of the email
    sentiment = predict_sentiment(email_snippet)

    # Predefined replies for each sentiment
    replies = {
        'Positive': [
            "Thank you for your kind words! We're delighted to hear that you had a positive experience.",
            "We appreciate your positive feedback and look forward to serving you again.",
            "Thanks for sharing your thoughts with us. We're always here to help!"
        ],
        'Neutral': [
            "Thank you for reaching out to us. Let us know if there's any way we can assist you further.",
            "We've received your message. Please let us know if you have any questions.",
            "Thanks for getting in touch. Is there something specific you'd like to discuss?"
        ],
        'Negative': [
            "We're truly sorry for any inconvenience. Please let us know how we can make things right.",
            "Thank you for bringing this to our attention. We'll look into it and get back to you as soon as possible.",
            "We apologize for not meeting your expectations. Your feedback helps us improve. Please share more details about your experience."
        ]
    }

  # Select a random reply from the appropriate list based on sentiment
    import random
    reply = random.choice(replies[sentiment])

    return reply



In [13]:

# Test the function
email_snippet = "I wasn't very satisfied with the service."
generated_reply = generate_reply(email_snippet)
print(generated_reply)



Thanks for getting in touch. Is there something specific you'd like to discuss?


In [None]:

# Test the function
email_snippet = "I wasn't very satisfied with the service."
generated_reply = generate_reply(email_snippet)
print(generated_reply)


In [None]:
def gr_interface(email_snippet):
    return generate_reply(email_snippet)

iface = gr.Interface(
    fn=gr_interface,
    inputs=gr.inputs.Textbox(lines=5, placeholder="Enter Your Email Snippet Here..."),
    outputs="text",
    title="Email Response Generator",
    description="Predicts sentiment of your email snippet and generates a reply based on it."
)
if __name__ == "__main__":
    iface.launch()
