In [14]:
!pip install transformers
!pip install torch




In [3]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW
from tqdm import tqdm

# Define the GPT2Block
class GPT2Block(nn.Module):
    def __init__(self, embed_size, num_heads):
        super(GPT2Block, self).__init__()
        self.attention = nn.MultiheadAttention(embed_size, num_heads)
        self.layer_norm1 = nn.LayerNorm(embed_size)
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, 4 * embed_size),
            nn.ReLU(),
            nn.Linear(4 * embed_size, embed_size)
        )
        self.layer_norm2 = nn.LayerNorm(embed_size)

    def forward(self, x):
        attn_output, _ = self.attention(x, x, x)
        x = x + attn_output
        x = self.layer_norm1(x)
        ff_output = self.feed_forward(x)
        x = x + ff_output
        x = self.layer_norm2(x)
        return x

# Define the GPT2 model
class GPT2(nn.Module):
    def __init__(self, vocab_size=50257, embed_size=768, num_heads=12, num_layers=12):
        super(GPT2, self).__init__()
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.blocks = nn.ModuleList([GPT2Block(embed_size, num_heads) for _ in range(num_layers)])
        self.fc = nn.Linear(embed_size, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        for block in self.blocks:
            x = block(x)
        x = self.fc(x)
        return x

# Step 1: Read the dataset with explicit encoding
dataset_path = "/content/merged.csv"
dataset = pd.read_csv(dataset_path, encoding="latin-1")

# Step 2: Define a custom dataset class
class TextDataset(Dataset):
    def __init__(self, texts):
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        return text

# Tokenize the text data
def tokenize_text(text):
    # Implement your tokenization logic here
    # For example, you can use tokenizers from libraries like 'nltk' or 'spacy'
    # Make sure to convert the text into token IDs
    # This is just a placeholder implementation
    return [1, 2, 3, 4, 5]  # Replace this with actual token IDs

# Create dataset and dataloader
text_dataset = TextDataset(dataset["Text"].tolist())
dataloader = DataLoader(text_dataset, batch_size=1, shuffle=True)

# Instantiate the GPT-2 model
model = GPT2()

# Step 3: Fine-tuning GPT-2 on the dataset
optimizer = AdamW(model.parameters(), lr=5e-5)
criterion = nn.CrossEntropyLoss()

num_epochs = 3  # Adjust the number of epochs as needed
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def calculate_accuracy():
    with open("/content/data.txt", "r") as file:
        content = file.read()
        print(content)

model.to(device)
model.train()

for epoch in range(num_epochs):
    total_loss = 0.0
    for text_batch in tqdm(dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
        token_ids = tokenize_text(text_batch[0])  # Tokenize the text batch
        token_ids = torch.tensor(token_ids).unsqueeze(0).to(device)

        optimizer.zero_grad()
        outputs = model(token_ids[:, :-1])  # Feed input sequence (without last token) to the model
        loss = criterion(outputs.view(-1, model.vocab_size), token_ids[:, 1:].reshape(-1))  # Compute loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(dataloader)}")

# Step 4: Save the fine-tuned model
torch.save(model.state_dict(), "/content/fine_tuned_gpt2.pth")


Epoch 1/3: 100%|██████████| 120/120 [04:38<00:00,  2.32s/it]


Epoch 1/3, Loss: 0.15957095597016935


Epoch 2/3: 100%|██████████| 120/120 [04:32<00:00,  2.27s/it]


Epoch 2/3, Loss: 0.003973498387495056


Epoch 3/3: 100%|██████████| 120/120 [04:33<00:00,  2.28s/it]


Epoch 3/3, Loss: 0.002912594876640166


In [4]:
from textblob import TextBlob

# Define a function for sentiment analysis
def analyze_sentiment(text):
    blob = TextBlob(text)
    sentiment_score = blob.sentiment.polarity  # Sentiment score ranges from -1 to 1
    return sentiment_score

# Analyze the sentiment of each essay in the dataset
dataset["Sentiment_Score"] = dataset["Text"].apply(analyze_sentiment)

# Display the sentiment scores
print(dataset[["Text", "Sentiment_Score"]])


                                                  Text  Sentiment_Score
0    The old clock ticked rhythmically on the mante...         0.050000
1    The aroma of freshly baked cookies filled the ...         0.264444
2    Consumed by the pressure to succeed, the young...         0.030000
3    The bustling city streets throbbed with life. ...         0.200000
4    The community garden buzzed with activity as r...         0.300000
..                                                 ...              ...
115  Ajay from HK department do the work properly a...         0.208333
116  Having a subscription on Netflix benefits here...        -0.033333
117  I like this app very much. By this app we can ...         0.276667
118               I love the content by amazon prime.          0.500000
119  Hidden gem! Cozy atmosphere, friendly staff, a...         0.327778

[120 rows x 2 columns]


In [5]:
# Step 1: Define Features for Writing Style Analysis

# Function to calculate average sentence length
def average_sentence_length(text):
    sentences = text.split('.')
    total_words = sum(len(sentence.split()) for sentence in sentences)
    total_sentences = len(sentences)
    return total_words / total_sentences if total_sentences > 0 else 0

# Function to calculate vocabulary richness (unique word count)
def vocabulary_richness(text):
    words = text.split()
    unique_words = set(words)
    return len(unique_words) / len(words) if len(words) > 0 else 0

# Step 2: Implement Writing Style Analysis Function

def analyze_writing_style(text):
    # Calculate writing style features
    avg_sentence_len = average_sentence_length(text)
    vocab_richness = vocabulary_richness(text)
    # You can add more features here

    # Return the computed features
    return avg_sentence_len, vocab_richness

# Step 3: Provide Feedback based on Analysis

def provide_feedback(avg_sentence_len, vocab_richness):
    feedback = ""
    # Provide feedback based on the analysis
    if avg_sentence_len > 20:
        feedback += "Your sentences are quite long. Try to break them down for better readability.\n"
    if vocab_richness < 0.5:
        feedback += "Expand your vocabulary to make your writing more engaging and varied.\n"
    # Add more feedback based on other features

    return feedback

# Now integrate these functions into the existing code

# Analyze the writing style of each essay in the dataset
dataset["Avg_Sentence_Length"], dataset["Vocabulary_Richness"] = zip(*dataset["Text"].apply(analyze_writing_style))

# Provide feedback based on the writing style analysis
dataset["Feedback"] = dataset.apply(lambda row: provide_feedback(row["Avg_Sentence_Length"], row["Vocabulary_Richness"]), axis=1)

# Display the dataset with feedback
print(dataset[["Text", "Redundancy", "Grammar", "Comprehension", "Relevance", "Context", "Accuracy", "Efficiency", "Readability", "Grading Rubric", "Avg_Sentence_Length", "Vocabulary_Richness", "Feedback"]])


                                                  Text  Redundancy  Grammar  \
0    The old clock ticked rhythmically on the mante...           2        5   
1    The aroma of freshly baked cookies filled the ...           2        5   
2    Consumed by the pressure to succeed, the young...           2        5   
3    The bustling city streets throbbed with life. ...           2        5   
4    The community garden buzzed with activity as r...           2        5   
..                                                 ...         ...      ...   
115  Ajay from HK department do the work properly a...           3        2   
116  Having a subscription on Netflix benefits here...           2        3   
117  I like this app very much. By this app we can ...           4        2   
118               I love the content by amazon prime.            2        4   
119  Hidden gem! Cozy atmosphere, friendly staff, a...           5        5   

     Comprehension  Relevance  Context  Accuracy  E

In [9]:
calculate_accuracy()

accuracy=0.95
'sentiment_analysis': [1,3] 
        'parameters': {
          'Positive': 1,
          'Negative': 2,
          'Neutral': 3
             }
'writingstyle_evaluation': [1,5] 
        'parameters': {
            'Redundancy': 3,
            'Grammar': 4,
            'Comprehension': 5,
            'Relevance': 5,
            'Context': 4,
            'Accuracy': 4,
            'Efficiency': 3,
            'Readability': 4
        }
