# Data Augmentation

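Here we balance the class distribution of the dataset by generating additional examples for the under-represented sentiment classes with the free [Groq](https://groq.com/) API.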

In [1]:
import os
import pandas as pd
from groq import Groq
import time

In [2]:
# Instantiate the Groq API client. The key is looked up in the GROQ_API_KEY
# environment variable (None is passed through when the variable is unset).
client = Groq(api_key=os.environ.get('GROQ_API_KEY'))
print(client)

<groq.Groq object at 0x000001D1BE7AA650>


In [3]:
# Load the raw dataset. `names=` supplies the column labels, so the first line of
# the file is read as data rather than as a header.
# The path is a raw string: in a normal string literal "\D", "\P", "\M" and "\d"
# are invalid escape sequences, which raise a SyntaxWarning on Python 3.12+.
df = pd.read_csv(r"D:\Data\PyCharmProjects\MAS-sentiment-analysis\data/financial_news.csv",
                 names=['sentiment', 'news'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4846 entries, 0 to 4845
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  4846 non-null   object
 1   news       4846 non-null   object
dtypes: object(2)
memory usage: 75.8+ KB


In [4]:
# Class distribution before any augmentation
df['sentiment'].value_counts()

sentiment
neutral     2879
positive    1363
negative     604
Name: count, dtype: int64

In [7]:
# Headlines of the under-represented class ('negative'); these seed the augmentation
negative_mask = df['sentiment'] == 'negative'
underrepresented_texts = df.loc[negative_mask, 'news'].tolist()

# How many 'negative' examples the dataset currently holds
current_rows_label_0 = len(underrepresented_texts)

# Desired size of the 'negative' class: four times its current size
target_rows_label_0 = current_rows_label_0 * 4

# Number of new examples that have to be generated to reach the target
needed_examples = target_rows_label_0 - current_rows_label_0

In [10]:
# Preview the first ten 'negative' headlines collected above
underrepresented_texts[:10]

['The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers , the daily Postimees reported .',
 'A tinyurl link takes users to a scamming site promising that users can earn thousands of dollars by becoming a Google ( NASDAQ : GOOG ) Cash advertiser .',
 'Compared with the FTSE 100 index , which rose 36.7 points ( or 0.6 % ) on the day , this was a relative price change of -0.2 % .',
 'Compared with the FTSE 100 index , which rose 94.9 points ( or 1.6 % ) on the day , this was a relative price change of -0.4 % .',
 'One of the challenges in the oil production in the North Sea is scale formation that can plug pipelines and halt production .',
 'Jan. 6 -- Ford is struggling in the face of slowing truck and SUV sales and a surfeit of up-to-date , gotta-have cars .',
 'Peer Peugeot fell 0.81 pct as its sales rose only 6.3 pct from the same period last yea

In [11]:
# Augment the underrepresented class with new examples
augmented_texts = []
for i in range(needed_examples):
    # Select a random text from the underrepresented class to augment
    text = underrepresented_texts[i % current_rows_label_0]  # Cycle through available texts if needed
    
    response = client.chat.completions.create(
        messages=[
            {"role": "system", "content": "You are a data augmentation assistant."},
            {"role": "user", "content": f"Generate a headline similar to: {text} and reply with response only without quotes"},
        ],
        model="llama3-8b-8192"
    )
    
    # Get the augmented text from the response
    augmented_data = response.choices[0].message.content
    augmented_texts.append(augmented_data)

                                                   news sentiment
0     Global IT giant undertakes unexpected downsizi...  negative
1     Mysterious Online Platform Guarantees Quick Ri...  negative
2     Here is the generated headline:\n\nCompared wi...  negative
3     Here is a generated headline:\n\nCompared with...  negative
4     One of the key issues affecting offshore gas e...  negative
...                                                 ...       ...
1807  AMSTERDAM Bloomberg - Shares in Damen Shiprepa...  negative
1808  LONDON MarketWatch -- U.S. Equities Index Gain...  negative
1809  Net Loss Widens to EUR 32.5 mn as Reduced Vess...  negative
1810  Quarterly Net Sales of Paper Segment Dip to EU...  negative
1811  Industrial Production in Italy Falls by 12.8% ...  negative

[1812 rows x 2 columns]


In [17]:
# Every requested example must have a generated text. A mismatch means the
# augmentation loop did not run to completion, so fail with a clear message.
assert len(augmented_texts) == needed_examples, (
    f"expected {needed_examples} augmented texts, got {len(augmented_texts)}"
)

# Wrap the generated texts in a DataFrame, labelled with the class they augment
augmented_df = pd.DataFrame({
    'news': augmented_texts,
    'sentiment': ['negative'] * len(augmented_texts),
})

# Append the generated rows to the original data (concat aligns columns by name)
balanced_df = pd.concat([df, augmented_df], ignore_index=True)

# Show the generated rows
print(augmented_df)

                                                   news sentiment
0     Global IT giant undertakes unexpected downsizi...  negative
1     Mysterious Online Platform Guarantees Quick Ri...  negative
2     Here is the generated headline:\n\nCompared wi...  negative
3     Here is a generated headline:\n\nCompared with...  negative
4     One of the key issues affecting offshore gas e...  negative
...                                                 ...       ...
1807  AMSTERDAM Bloomberg - Shares in Damen Shiprepa...  negative
1808  LONDON MarketWatch -- U.S. Equities Index Gain...  negative
1809  Net Loss Widens to EUR 32.5 mn as Reduced Vess...  negative
1810  Quarterly Net Sales of Paper Segment Dip to EU...  negative
1811  Industrial Production in Italy Falls by 12.8% ...  negative

[1812 rows x 2 columns]


In [18]:
# Print the full text of one generated example (row 3)
print(augmented_df['news'].iloc[3])

Here is a generated headline:

Compared with the S&P 500 index, which fell 15.2 points (or 0.6%) on the day, this was a relative price change of 2.1%.


In [19]:
# Summarise the combined frame: row count, columns and non-null counts
balanced_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6658 entries, 0 to 6657
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  6658 non-null   object
 1   news       6658 non-null   object
dtypes: object(2)
memory usage: 104.2+ KB


In [21]:
# Class distribution after adding the generated 'negative' examples
balanced_df['sentiment'].value_counts()

sentiment
neutral     2879
negative    2416
positive    1363
Name: count, dtype: int64

In [22]:
# Persist the result without the index column. The path is a raw string because
# "\D", "\P", "\M" and "\d" are invalid escape sequences in a normal string
# literal (SyntaxWarning on Python 3.12+).
balanced_df.to_csv(r'D:\Data\PyCharmProjects\MAS-sentiment-analysis\data/balanced_negative.csv', index=False)

Next, we wrap the steps above in a reusable function (`groq_augment.augment_sentiment`) and apply it to the 'positive' sentiment class.



In [7]:
# Reload the dataset saved after the 'negative' augmentation. The path is a raw
# string because "\D", "\P", "\M" and "\d" are invalid escape sequences in a
# normal string literal (SyntaxWarning on Python 3.12+).
balanced_df = pd.read_csv(r'D:\Data\PyCharmProjects\MAS-sentiment-analysis\data/balanced_negative.csv')

In [8]:
# Class distribution of the reloaded data
balanced_df['sentiment'].value_counts()

sentiment
neutral     2879
negative    2416
positive    1363
Name: count, dtype: int64

In [11]:
# Work on an independent (deep) copy so balanced_df itself is left untouched
df2 = balanced_df.copy(deep=True)

In [12]:
import groq_augment

In [13]:
# Run the augmentation for the 'positive' class and report the wall-clock time
start_time = time.time()
balanced_df_pos, augmented_df_pos = groq_augment.augment_sentiment(
    df2,
    target_ratio=2,
    sentiment_class='positive',
)
end_time = time.time()

elapsed_minutes = round((end_time - start_time) / 60, 2)
print(f"Time taken: {elapsed_minutes} minutes")

Time taken: 58.64 minutes


In [14]:
# Summarise the frame returned first by augment_sentiment: row count, columns, non-null counts
balanced_df_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8021 entries, 0 to 8020
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  8021 non-null   object
 1   news       8021 non-null   object
dtypes: object(2)
memory usage: 125.5+ KB


In [15]:
# Check the type of the first value returned by augment_sentiment
type(balanced_df_pos)

pandas.core.frame.DataFrame

In [16]:
# Inspect the second value returned by augment_sentiment
# (presumably only the newly generated 'positive' rows — verify in groq_augment)
augmented_df_pos

Unnamed: 0,news,sentiment
0,"Factory Expansion Accelerates Production, Enha...",positive
1,Finextra: Swedish fintech company sets ambitio...,positive
2,Electronics Component Manufacturer Sets Sight ...,positive
3,"Componenta Reports Significant Growth in 2022,...",positive
4,"In the latest fiscal period, revenue expanded ...",positive
...,...,...
1358,The newly developed residential complex will b...,positive
1359,"The oral medication, administered at doses up ...",positive
1360,Nordea B-S NORdea 130.2 DKR improved 2.9% from...,positive
1361,Staying ahead of the curve through our dedicat...,positive


In [18]:
# Look at the full text of one returned example (row 7)
augmented_df_pos['news'].iloc[7]

'Here is a generated headline:\n\nVeon Ventures announces strategic investment to enhance mobile services, aligning with its core business expansion strategy.'

In [19]:
# Class distribution after the 'positive' augmentation
balanced_df_pos['sentiment'].value_counts()

sentiment
neutral     2879
positive    2726
negative    2416
Name: count, dtype: int64

In [25]:
# Persist the final dataset without the index column. The path is a raw string
# because "\D", "\P", "\M" and "\d" are invalid escape sequences in a normal
# string literal (SyntaxWarning on Python 3.12+).
balanced_df_pos.to_csv(r'D:\Data\PyCharmProjects\MAS-sentiment-analysis\data/balanced_negative_positive.csv', index=False)