In [None]:
# Smoke test of the HuggingFace pipeline API on its own.
# Doesn't work without logging in before starting the kernel.
import time

import torch
import transformers

model_id = "meta-llama/Meta-Llama-3-8B"

# Weights are requested in bfloat16 through model_kwargs.
pipeline = transformers.pipeline(
    task="text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)
pipeline("Hey how are you doing today?")

In [None]:
#importing libraries and reading the train dataset
import torch
from transformers import BartForConditionalGeneration, BartTokenizer, Trainer, TrainingArguments
from datasets import load_dataset, Dataset

import pandas as pd
import numpy as np
import warnings

from tqdm import tqdm

# Technology-domain training split; its 'text' column feeds every paraphrasing cell below.
traindf = pd.read_csv(filepath_or_buffer='./technology_train.csv')

In [None]:
# Llama 3.2 Instruct (3 billion parameters) is the generator used for our experiments.

import torch
from transformers import pipeline

model_id = "meta-llama/Llama-3.2-3B-Instruct"

# Text-generation pipeline with the weights requested in bfloat16.
pipe = pipeline(task="text-generation", model=model_id, torch_dtype=torch.bfloat16, device_map="auto")

Loading checkpoint shards: 100%|██████████| 2/2 [00:56<00:00, 28.50s/it]


In [None]:
import random

def select_random_line(paragraph, rng=None):
    """Return one randomly chosen non-blank line of ``paragraph``.

    The text is split with ``str.splitlines``, so a "line" is whatever sits
    between line breaks -- it is not necessarily a single sentence.

    Args:
        paragraph: Text to sample from. Anything that is not a ``str``
            (for example ``None`` or a float NaN standing in for a missing
            cell) is treated as containing no lines.
        rng: Optional object exposing ``choice`` (e.g. ``random.Random(seed)``)
            used to pick the line, which makes the selection reproducible.
            Defaults to the module-level ``random`` generator.

    Returns:
        The selected line exactly as it appears in the text (not stripped),
        or ``None`` when there is no non-blank line to choose from.
    """
    # Guard against non-string input, which would otherwise fail on .splitlines().
    if not isinstance(paragraph, str):
        return None

    # Blank and whitespace-only lines are never candidates.
    candidates = [line for line in paragraph.splitlines() if line.strip()]
    if not candidates:
        return None

    chooser = random if rng is None else rng
    return chooser.choice(candidates)

In [None]:
# Sample input text: the 'text' value of the training row labelled 0
traindf.loc[0, 'text']

'There are two traditional views of how Americans can be happiest in retirement. In one, they retrofit their homes with grab bars and wheelchair-accessible bathrooms and "age in place" while maintaining long-time community, church and family ties. In the other, they say farewell to friends and frigid winters and head off to Florida or Sun City for a life of leisurely golf and bridge games.\n\nBut a new study from the Center for Retirement Research at Boston College suggests there is another road that leads to happiness. Older Americans who move are somewhat happier and wealthier than those who stay put, but most of their moves are short-distance ones--some made as part of a retirement plan and some in response to a shock, such as being widowed or divorced, the study found.\n\nDespite their differing circumstances, both groups ended up with at least modest gains in happiness from the move. Those who moved after a shock were a little less unhappy than old folks in comparable circumstance

In [None]:
# Paraphrasing the sample input text from the technology domain into the sports domain
seed_line = select_random_line(traindf['text'][0])

system_prompt = "You are a sports reporter for ESPN, and you write about the latest sports from the world of sports. All of your content is completely new, and you should not write content you have ever read before."
user_prompt = "Write an imaginary 330 word article on sports, incorporating the following words: " + seed_line

messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_prompt},
]
outputs = pipe(messages, max_new_tokens=500)

# Keep the content of the last message in the returned conversation
# (presumably the model's reply -- confirm against the pipeline's chat output format).
op = outputs[0]["generated_text"][-1]['content']
op

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


'"Unpacking the Great American Move: How Sports Can Relate to the Nation\'s Migration Patterns"\n\nIn a recent study, it was revealed that nearly a third of American homeowners made at least one move over the past 12 years, with a staggering 7% of them making multiple moves within any two-year span. This staggering data has left many to wonder: what\'s behind the great American move? As we delve into the world of sports, we\'ll explore some unexpected parallels between the nation\'s migration patterns and the world of athletics.\n\nOne striking similarity is the concept of "short-distance" moves, where 60% of all relocations were within a 20-mile radius. This phenomenon is eerily reminiscent of the "short-game" mentality in golf, where a well-placed chip shot or a precise bunker shot can make all the difference in a game. Just as a skilled golfer can make a significant impact on the course without leaving the fairway, many homeowners can find new opportunities and improve their quality

#### Paraphrasing to Food Target Domain

In [None]:
# Paraphrase technology samples into the food domain and summarize each result.
# Capped at two thousand samples due to compute constraints.
import time  # imported here so this cell does not rely on the pipeline-test cell having run

start = time.time()
ops = []
opssumm = []
skipped = 0

# Clamp to the size of the training frame so a shorter CSV cannot index out of range.
n_samples = min(2000, len(traindf))

for i in range(n_samples):
    print(f"{i}/{n_samples}")

    source_text = traindf['text'].iloc[i]  # positional lookup, independent of index labels
    # Non-string cells and blank articles give no seed line; skip them instead of
    # crashing on `str + None` part-way through a long run. Skipped rows are left
    # out of both lists, so `ops` and `opssumm` stay aligned with each other.
    seed_line = select_random_line(source_text) if isinstance(source_text, str) else None
    if seed_line is None:
        skipped += 1
        continue

    messages = [
        {"role": "system", "content": "You are a food reporter for TLC, and you write about the latest restaurants and food reviews and culinary events. All of your content is completely new, and you should not write content you have ever read before."},
        {"role": "user", "content": "Write a 330 word food or culinary news article, incorporating the following word: " + seed_line},
    ]
    outputs = pipe(
        messages,
        max_new_tokens=500,
    )
    # Content of the last message in the returned conversation.
    op = outputs[0]["generated_text"][-1]['content']
    ops.append(op)

    # NOTE(review): the full article is sent in both the system and the user message,
    # which doubles the prompt length -- confirm this is intended before changing it,
    # since altering the prompt alters the generated data.
    messagessumm = [
        {"role": "system", "content": "Summarize the given text in a single sentence under 20 words: " + op},
        {"role": "user", "content": "Summarize the given text in a single sentence under 20 words: " + op},
    ]
    outputssumm = pipe(
        messagessumm,
        max_new_tokens=25,
    )
    opsumm = outputssumm[0]["generated_text"][-1]['content']
    opssumm.append(opsumm)

end = time.time()
print(end - start)
if skipped:
    print(f"skipped {skipped} rows with no usable text")


In [None]:
# Pair each generated article with its generated summary and write the set to CSV.
# `ops` / `opssumm` hold whatever the most recently run paraphrasing loop produced,
# so run this cell right after the food loop.
food_columns = {'text': ops, 'summary': opssumm}
dffood = pd.DataFrame(data=food_columns)
dffood.to_csv(path_or_buf="food-aug.csv")

#### Paraphrasing to Sports Target Domain

In [None]:
# Paraphrase technology samples into the sports domain and summarize each result.
# Capped at two thousand samples due to compute constraints.
import time  # imported here so this cell does not rely on the pipeline-test cell having run

start = time.time()
ops = []
opssumm = []
skipped = 0

# Clamp to the size of the training frame so a shorter CSV cannot index out of range.
n_samples = min(2000, len(traindf))

for i in range(n_samples):
    print(f"{i}/{n_samples}")

    source_text = traindf['text'].iloc[i]  # positional lookup, independent of index labels
    # Non-string cells and blank articles give no seed line; skip them instead of
    # crashing on `str + None` part-way through a long run. Skipped rows are left
    # out of both lists, so `ops` and `opssumm` stay aligned with each other.
    seed_line = select_random_line(source_text) if isinstance(source_text, str) else None
    if seed_line is None:
        skipped += 1
        continue

    messages = [
        {"role": "system", "content": "You are a sports reporter for ESPN, and you write about the latest sports from the world of sports. All of your content is completely new, and you should not write content you have ever read before."},
        {"role": "user", "content": "Write an imaginary 330 word article on sports, incorporating the following words: " + seed_line}
    ]
    outputs = pipe(
        messages,
        max_new_tokens=500,
    )
    # Content of the last message in the returned conversation.
    op = outputs[0]["generated_text"][-1]['content']
    ops.append(op)

    # NOTE(review): the full article is sent in both the system and the user message,
    # which doubles the prompt length -- confirm this is intended before changing it,
    # since altering the prompt alters the generated data.
    messagessumm = [
        {"role": "system", "content": "Summarize the given text in a single sentence under 20 words: " + op},
        {"role": "user", "content": "Summarize the given text in a single sentence under 20 words: " + op},
    ]
    outputssumm = pipe(
        messagessumm,
        max_new_tokens=25,
    )
    opsumm = outputssumm[0]["generated_text"][-1]['content']
    opssumm.append(opsumm)

end = time.time()
print(end - start)
if skipped:
    print(f"skipped {skipped} rows with no usable text")


In [None]:
# Pair each generated article with its generated summary and write the set to CSV.
# `ops` / `opssumm` hold whatever the most recently run paraphrasing loop produced,
# so run this cell right after the sports loop.
sports_columns = {'text': ops, 'summary': opssumm}
dfsports = pd.DataFrame(data=sports_columns)
dfsports.to_csv(path_or_buf="sports-aug.csv")

#### Paraphrasing to Architecture Target Domain

In [None]:
# Paraphrase technology samples into the architecture domain and summarize each result.
# Capped at two thousand samples due to compute constraints.
import time  # imported here so this cell does not rely on the pipeline-test cell having run

start = time.time()
ops = []
opssumm = []
skipped = 0

# Clamp to the size of the training frame so a shorter CSV cannot index out of range.
n_samples = min(2000, len(traindf))

for i in range(n_samples):
    print(f"{i}/{n_samples}")

    source_text = traindf['text'].iloc[i]  # positional lookup, independent of index labels
    # Non-string cells and blank articles give no seed line; skip them instead of
    # crashing on `str + None` part-way through a long run. Skipped rows are left
    # out of both lists, so `ops` and `opssumm` stay aligned with each other.
    seed_line = select_random_line(source_text) if isinstance(source_text, str) else None
    if seed_line is None:
        skipped += 1
        continue

    messages = [
        {"role": "system", "content": "You are an expert architectural journalist who writes imaginative and engaging content about modern and classical architecture. Ensure all your content is original, creative, and avoids using information from existing sources."},
        {"role": "user", "content": "Write an imaginary 330-word article on architecture, incorporating the following words: " + seed_line}
    ]
    outputs = pipe(
        messages,
        max_new_tokens=500,
    )
    # Content of the last message in the returned conversation.
    op = outputs[0]["generated_text"][-1]['content']
    ops.append(op)

    # NOTE(review): the full article is sent in both the system and the user message,
    # which doubles the prompt length -- confirm this is intended before changing it,
    # since altering the prompt alters the generated data.
    messagessumm = [
        {"role": "system", "content": "Summarize the given text in a single sentence under 20 words: " + op},
        {"role": "user", "content": "Summarize the given text in a single sentence under 20 words: " + op},
    ]
    outputssumm = pipe(
        messagessumm,
        max_new_tokens=25,
    )
    opsumm = outputssumm[0]["generated_text"][-1]['content']
    opssumm.append(opsumm)

end = time.time()
print(end - start)
if skipped:
    print(f"skipped {skipped} rows with no usable text")


In [None]:
# Pair each generated article with its generated summary and write the set to CSV.
# `ops` / `opssumm` hold whatever the most recently run paraphrasing loop produced,
# so run this cell right after the architecture loop.
arch_columns = {'text': ops, 'summary': opssumm}
dfarch = pd.DataFrame(data=arch_columns)
dfarch.to_csv(path_or_buf="arch-aug.csv")

#### Paraphrasing to Entertainment Target Domain

In [None]:
# Paraphrase technology samples into the entertainment domain and summarize each result.
# Capped at two thousand samples due to compute constraints.
import time  # imported here so this cell does not rely on the pipeline-test cell having run

start = time.time()
ops = []
opssumm = []
skipped = 0

# Clamp to the size of the training frame so a shorter CSV cannot index out of range.
n_samples = min(2000, len(traindf))

for i in range(n_samples):
    print(f"{i}/{n_samples}")

    source_text = traindf['text'].iloc[i]  # positional lookup, independent of index labels
    # Non-string cells and blank articles give no seed line; skip them instead of
    # crashing on `str + None` part-way through a long run. Skipped rows are left
    # out of both lists, so `ops` and `opssumm` stay aligned with each other.
    seed_line = select_random_line(source_text) if isinstance(source_text, str) else None
    if seed_line is None:
        skipped += 1
        continue

    messages = [
        {"role": "system", "content": "You are an entertainment journalist who writes captivating and imaginative articles about movies, TV shows, celebrities, and pop culture. All your content should be original, creative, and not derived from existing sources."},
        {"role": "user", "content": "Write an imaginary 330-word article on entertainment, incorporating the following words: " + seed_line}
    ]
    outputs = pipe(
        messages,
        max_new_tokens=500,
    )
    # Content of the last message in the returned conversation.
    op = outputs[0]["generated_text"][-1]['content']
    ops.append(op)

    # NOTE(review): the full article is sent in both the system and the user message,
    # which doubles the prompt length -- confirm this is intended before changing it,
    # since altering the prompt alters the generated data.
    messagessumm = [
        {"role": "system", "content": "Summarize the given text in a single sentence under 20 words: " + op},
        {"role": "user", "content": "Summarize the given text in a single sentence under 20 words: " + op},
    ]
    outputssumm = pipe(
        messagessumm,
        max_new_tokens=25,
    )
    opsumm = outputssumm[0]["generated_text"][-1]['content']
    opssumm.append(opsumm)

end = time.time()
print(end - start)
if skipped:
    print(f"skipped {skipped} rows with no usable text")


In [None]:
# Pair each generated article with its generated summary and write the set to CSV.
# `ops` / `opssumm` hold whatever the most recently run paraphrasing loop produced,
# so run this cell right after the entertainment loop.
ent_columns = {'text': ops, 'summary': opssumm}
dfent = pd.DataFrame(data=ent_columns)
dfent.to_csv(path_or_buf="ent-aug.csv")