In [1]:
!pip install nlpaug

Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/410.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━[0m [32m256.0/410.5 kB[0m [31m7.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: nlpaug
Successfully installed nlpaug-1.1.11


In [2]:
import pandas as pd

In [3]:
# Read the training split from disk into the working DataFrame.
train_csv_path = 'data/ML/ML_train.csv'
df = pd.read_csv(train_csv_path)

In [4]:
# Preview the first rows of the freshly loaded training data.
df.head()

Unnamed: 0.1,Unnamed: 0,Text,Sentiment
0,0,movie is funny suitable age is definitely fami...,6
1,1,old commercial blank audio cassette tag line w...,7
2,2,cinemascope color cinematography leon shamroy ...,4
3,3,get film possible will find really good perfor...,6
4,4,soundtrack is bit dated story is relevant yous...,6


In [5]:
# 'Unnamed: 0' looks like a row index that was written out with the CSV
# (its values mirror the row numbers in the preview) — TODO confirm with the writer.
# errors='ignore' keeps this cell safe to re-run: `df` is rebound here, so a
# second execution would otherwise raise KeyError because the column is gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [6]:
df.head()

Unnamed: 0,Text,Sentiment
0,movie is funny suitable age is definitely fami...,6
1,old commercial blank audio cassette tag line w...,7
2,cinemascope color cinematography leon shamroy ...,4
3,get film possible will find really good perfor...,6
4,soundtrack is bit dated story is relevant yous...,6


In [7]:
# (rows, columns) after removing the 'Unnamed: 0' column.
df.shape

(64784, 2)

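Because this dataset was previously balanced by random oversampling, the rows that oversampling added are exact duplicates of existing reviews, so we drop them to get back to the unique samples.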

In [8]:
# Keep only the first occurrence of every fully identical row.
is_repeat = df.duplicated(keep='first')
df = df[~is_repeat]

In [9]:
# (rows, columns) once duplicate rows have been removed.
df.shape

(39727, 2)

In [16]:
# Column dtypes of the de-duplicated frame.
df.dtypes

Text         object
Sentiment     int64
dtype: object

In [17]:
import pandas as pd
import nlpaug.augmenter.word as naw

# Expects the de-duplicated DataFrame `df` from the cells above:
# 'Text' holds the review text and 'Sentiment' holds the rating label.

# Seed for the row sampling only; the augmenters' own randomness is not seeded here.
AUG_SEED = 42

# Ratio of augmented rows to original rows, per augmenter.
synonym_aug_ratio = 0.3          # WordNet synonym replacement
text_generation_aug_ratio = 0.4  # BERT contextual word insertion

# Initialize augmenters for synonym replacement and text generation.
aug_synonym = naw.SynonymAug(aug_src='wordnet')
aug_text_generation = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="insert")


def _augment_sample(frame, augmenter, frac, seed):
    """Return (text, sentiment) tuples for one augmented copy of a random subset of rows.

    Parameters
    ----------
    frame : pd.DataFrame
        Source data with 'Text' and 'Sentiment' columns.
    augmenter : object
        Augmenter exposing ``augment(text, n=...)``.
    frac : float
        Fraction of the rows of ``frame`` to augment.
    seed : int
        ``random_state`` passed to ``DataFrame.sample`` so the same rows are
        picked on every run.

    Returns
    -------
    list[tuple]
        One ``(augmented_text, sentiment)`` pair per augmented variant produced.
    """
    sampled = frame.sample(frac=frac, random_state=seed)
    rows = []
    for review, sentiment in zip(sampled['Text'], sampled['Sentiment']):
        # Skip missing / non-string / blank reviews instead of crashing on them.
        if not isinstance(review, str) or not review.strip():
            continue
        # `n` is the number of augmented variants to generate for this input
        # (it is not a fraction of words), so one variant per sampled row.
        variants = augmenter.augment(review, n=1)
        # Normalise in case the library hands back a bare string rather than a list.
        if isinstance(variants, str):
            variants = [variants]
        rows.extend((variant, sentiment) for variant in variants)
    return rows


# Collect augmented rows from both augmenters; different seeds so the two
# augmenters do not draw the exact same subset of reviews.
augmented_data = []
augmented_data.extend(_augment_sample(df, aug_synonym, synonym_aug_ratio, AUG_SEED))
augmented_data.extend(
    _augment_sample(df, aug_text_generation, text_generation_aug_ratio, AUG_SEED + 1)
)

# Convert augmented data to a DataFrame.
augmented_df = pd.DataFrame(augmented_data, columns=['Text', 'Sentiment'])

# Combine original and augmented data.
# NOTE(review): rows are sampled regardless of label, so this enlarges the data
# set but does not by itself equalise the class counts; the `balanced_df` name
# is kept because later cells refer to it.
balanced_df = pd.concat([df, augmented_df], ignore_index=True)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


KeyboardInterrupt: 

In [None]:
# Preview the first rows of the combined original + augmented data.
balanced_df.head()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of how many rows carry each sentiment label in the combined frame.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=balanced_df, x='Sentiment', ax=ax)
ax.set_title('Distribution of Sentiments in Balanced Dataset')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
# Tilt the x tick labels by 45 degrees.
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

# Write the combined frame to CSV without the index column, then confirm on stdout.
export_path = 'balanced_movie_reviews.csv'
balanced_df.to_csv(export_path, index=False)
print("Balanced dataset exported as 'balanced_movie_reviews.csv'")