In [6]:
!pip install pyspellchecker

Collecting pyspellchecker
  Downloading pyspellchecker-0.8.2-py3-none-any.whl.metadata (9.4 kB)
Downloading pyspellchecker-0.8.2-py3-none-any.whl (7.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m70.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.2


In [7]:
!pip install tqdm



# Loading & Preprocessing scale_data Dataset

In [1]:
import os
import pandas as pd

base_path = '/kaggle/input/nlp-sentiment-analysis-project/scale_data/scaledata'

critics = ['Dennis+Schwartz', 'James+Berardinelli', 'Scott+Renshaw', 'Steve+Rhodes']

all_dfs = []

for critic in critics:
    critic_path = os.path.join(base_path, critic)
    
    id_df = pd.read_csv(os.path.join(critic_path, f'id.{critic}'), header=None, names=['id'])
    label_3class_df = pd.read_csv(os.path.join(critic_path, f'label.3class.{critic}'), header=None, names=['label_3class'])
    label_4class_df = pd.read_csv(os.path.join(critic_path, f'label.4class.{critic}'), header=None, names=['label_4class'])
    rating_df = pd.read_csv(os.path.join(critic_path, f'rating.{critic}'), header=None, names=['rating'])
    
    with open(os.path.join(critic_path, f'subj.{critic}'), 'r', encoding='utf-8') as f:
        subj_lines = f.read().splitlines()
    subj_df = pd.DataFrame(subj_lines, columns=['subjectivity'])
    
    critic_df = pd.concat([id_df, label_3class_df, label_4class_df, rating_df, subj_df], axis=1)
    critic_df['critic_name'] = critic

    all_dfs.append(critic_df)


full_df = pd.concat(all_dfs, ignore_index=True)

print("Shape of full dataset:", full_df.shape)
print("\n Columns:", full_df.columns.tolist())


print("\n First 5 rows:\n", full_df.head())


print("\n Value counts for 3-class labels:\n", full_df['label_3class'].value_counts())
print("\n Unique 3-class labels:", full_df['label_3class'].unique())

print("\n Value counts for 4-class labels:\n", full_df['label_4class'].value_counts())
print("\n Unique 4-class labels:", full_df['label_4class'].unique())


print("\n Missing values per column:\n", full_df.isnull().sum())


📌 Shape of full dataset: (5006, 6)

📌 Columns: ['id', 'label_3class', 'label_4class', 'rating', 'subjectivity', 'critic_name']

📌 First 5 rows:
       id  label_3class  label_4class  rating  \
0  29420             0             0     0.1   
1  17219             0             0     0.2   
2  18406             0             0     0.2   
3  18648             0             0     0.2   
4  20021             0             0     0.2   

                                        subjectivity      critic_name  
0  in my opinion , a movie reviewer's most import...  Dennis+Schwartz  
1  you can watch this movie , that is based on a ...  Dennis+Schwartz  
2  this is asking a lot to believe , and though i...  Dennis+Schwartz  
3  no heroes and no story are the main attributes...  Dennis+Schwartz  
4  this is not an art movie , yet i saw it an art...  Dennis+Schwartz  

📌 Value counts for 3-class labels:
 label_3class
1    1915
2    1894
0    1197
Name: count, dtype: int64

📌 Unique 3-class labels: [0

In [2]:
df = full_df[['subjectivity', 'label_3class']].copy()

df = df.rename(columns={
    'subjectivity': 'text',
    'label_3class': 'label'
})

df['label'] = df['label'].map(lambda x: 0 if x == 0 else 1)

print(f"✅ Final shape: {df.shape}")
print(f"\n✅ First 5 rows:\n{df.head()}")
print(f"\n✅ Label distribution:\n{df['label'].value_counts()}")


✅ Final shape: (5006, 2)

✅ First 5 rows:
                                                text  label
0  in my opinion , a movie reviewer's most import...      0
1  you can watch this movie , that is based on a ...      0
2  this is asking a lot to believe , and though i...      0
3  no heroes and no story are the main attributes...      0
4  this is not an art movie , yet i saw it an art...      0

✅ Label distribution:
label
1    3809
0    1197
Name: count, dtype: int64


# Loading & Preprocessing rt-polaritydata Dataset

In [3]:
neg_file = '/kaggle/input/nlp-sentiment-analysis-project/rt-polaritydata/rt-polaritydata/rt-polarity.neg'
pos_file = '/kaggle/input/nlp-sentiment-analysis-project/rt-polaritydata/rt-polaritydata/rt-polarity.pos'

with open(neg_file, 'r', encoding='latin-1') as f:
    neg_lines = f.readlines()

with open(pos_file, 'r', encoding='latin-1') as f:
    pos_lines = f.readlines()

df_rt_polarity = pd.DataFrame({
    'text': [line.strip() for line in neg_lines + pos_lines],
    'label': [0]*len(neg_lines) + [1]*len(pos_lines)
})

print(df_rt_polarity.head())
print(df_rt_polarity['label'].value_counts())


                                                text  label
0                   simplistic , silly and tedious .      0
1  it's so laddish and juvenile , only teenage bo...      0
2  exploitative and largely devoid of the depth o...      0
3  [garbus] discards the potential for pathologic...      0
4  a visually flashy but narratively opaque and e...      0
label
0    5331
1    5331
Name: count, dtype: int64


# Loading & Preprocessing review_polarity Dataset

In [4]:
import os
import pandas as pd

base_path = '/kaggle/input/nlp-sentiment-analysis-project/review_polarity/txt_sentoken'
neg_path = os.path.join(base_path, 'neg')
pos_path = os.path.join(base_path, 'pos')

neg_reviews = []
for file in os.listdir(neg_path):
    with open(os.path.join(neg_path, file), 'r', encoding='utf-8') as f:
        neg_reviews.append(f.read())

pos_reviews = []
for file in os.listdir(pos_path):
    with open(os.path.join(pos_path, file), 'r', encoding='utf-8') as f:
        pos_reviews.append(f.read())

df_txt_sentoken = pd.DataFrame({
    'text': neg_reviews + pos_reviews,
    'label': [0]*len(neg_reviews) + [1]*len(pos_reviews)
})

print(df_txt_sentoken.head())
print(df_txt_sentoken['label'].value_counts())


                                                text  label
0  words i thought i'd never write : the sequel t...      0
1   " tina ! ! ! fetch me the axe ! ! ! " \na fav...      0
2  hav plenty , as we are told in the beginning a...      0
3  the first scene of operation condor has jackie...      0
4  the title is taken from the writings of ralph ...      0
label
0    1000
1    1000
Name: count, dtype: int64


# Text Preprocessing for Sentiment Analysis

In [11]:
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
import string
import re
from spellchecker import SpellChecker
import emoji

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
spell = SpellChecker()

slang_dict = {
    "gr8": "great",
    "luv": "love",
    "bff": "best friend",
    "omg": "oh my god",
    "ttyl": "talk to you later",
    "brb": "be right back",
    "idk": "i don't know",
    "smh": "shaking my head"
}

def get_wordnet_pos(word):
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {
        'J': wordnet.ADJ, 
        'N': wordnet.NOUN, 
        'V': wordnet.VERB, 
        'R': wordnet.ADV
    }
    return tag_dict.get(tag, wordnet.NOUN)  # Default to noun if unknown

def handle_negations(text):
    negations = ["not", "no", "never", "none", "n't"]
    words = text.split()
    for i, word in enumerate(words):
        if word in negations and i + 1 < len(words):
            words[i + 1] = f"not_{words[i + 1]}"  # Prefix with 'not_'
    return " ".join(words)

def handle_emojis(text):
    return emoji.demojize(text)

def correct_spelling(text):
    words = text.split()
    corrected_words = []
    for word in words:
        # If the word is not in the stopwords list, correct the spelling
        if word not in stop_words:
            corrected_word = spell.correction(word)
            if corrected_word is not None:
                corrected_words.append(corrected_word)
            else:
                corrected_words.append(word)  # If no correction found, keep the word as is
        else:
            corrected_words.append(word)
    return " ".join(corrected_words)

def handle_slang(text):
    words = text.split()
    return " ".join([slang_dict.get(word, word) for word in words])

def preprocess_text(text):
    # 1. Lowercase
    text = text.lower()
    
    # 2. Remove HTML Tags
    text = re.sub(r'<.*?>', '', text)
    
    # 3. Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # 4. Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # 5. Handle Emojis
    text = handle_emojis(text)
    
    # 6. Handle Negations
    text = handle_negations(text)
    
    # 7. Correct spelling
    #text = correct_spelling(text)
    
    # 8. Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    # 9. Remove Hashtags and Mentions (for social media data)
    text = re.sub(r'@\w+|#\w+', '', text)  # Removes mentions and hashtags
    
    # 10. Handle Slang
    text = handle_slang(text)
    
    # 11. Tokenize
    tokens = word_tokenize(text)
    
    # 12. Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    
    # 13. Lemmatization with correct POS tag
    tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in tokens]
    
    return ' '.join(tokens)

from tqdm import tqdm

tqdm.pandas()

combined_df['clean_text'] = combined_df['text'].progress_apply(preprocess_text)
final_df = combined_df[['clean_text', 'label']]

print(final_df.head())



[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
100%|██████████| 17668/17668 [02:03<00:00, 143.48it/s]

                                          clean_text  label
0                  film really not_so much bad bland      0
1  sometimes incisive sensitive portrait undercut...      0
2  kind nervous film either give mild headache ex...      1
3  falsehood pile undermine movie reality stifle ...      0
4  hoffman notch nuance pain smart edgy voice wad...      1





# Saving the final dataset

In [17]:
import pandas as pd
import os



save_folder = "cleaned_dataset__"
os.makedirs(save_folder, exist_ok=True)

final_df.to_csv(os.path.join(save_folder, "data.csv"), index=False)

print(f"Dataset saved locally inside '{save_folder}/data.csv'. You can upload it manually to Kaggle later.")


Dataset saved locally inside 'cleaned_dataset__/data.csv'. You can upload it manually to Kaggle later.


In [5]:
import pandas as pd
import os


full_df_binary = full_df[['subjectivity', 'label_3class']].copy()
full_df_binary.rename(columns={'subjectivity': 'text'}, inplace=True)

full_df_binary['label'] = full_df_binary['label_3class'].map(lambda x: 0 if x == 0 else 1)


full_df_binary.drop(columns=['label_3class'], inplace=True)

print(f"Processed full_df shape: {full_df_binary.shape}")



txt_sentoken_base = '/kaggle/input/nlp-sentiment-analysis-project/review_polarity/txt_sentoken'
neg_path = os.path.join(txt_sentoken_base, 'neg')
pos_path = os.path.join(txt_sentoken_base, 'pos')

neg_reviews = []
for file in os.listdir(neg_path):
    with open(os.path.join(neg_path, file), 'r', encoding='utf-8') as f:
        neg_reviews.append(f.read())

pos_reviews = []
for file in os.listdir(pos_path):
    with open(os.path.join(pos_path, file), 'r', encoding='utf-8') as f:
        pos_reviews.append(f.read())

df_txt_sentoken = pd.DataFrame({
    'text': neg_reviews + pos_reviews,
    'label': [0]*len(neg_reviews) + [1]*len(pos_reviews)
})

print(f"Processed txt_sentoken shape: {df_txt_sentoken.shape}")


neg_file = '/kaggle/input/nlp-sentiment-analysis-project/rt-polaritydata/rt-polaritydata/rt-polarity.neg'
pos_file = '/kaggle/input/nlp-sentiment-analysis-project/rt-polaritydata/rt-polaritydata/rt-polarity.pos'

with open(neg_file, 'r', encoding='latin-1') as f:
    neg_lines = f.readlines()

with open(pos_file, 'r', encoding='latin-1') as f:
    pos_lines = f.readlines()

df_rt_polarity = pd.DataFrame({
    'text': [line.strip() for line in neg_lines + pos_lines],
    'label': [0]*len(neg_lines) + [1]*len(pos_lines)
})

print(f"Processed rt_polaritydata shape: {df_rt_polarity.shape}")


combined_df = pd.concat([full_df_binary, df_txt_sentoken, df_rt_polarity], axis=0, ignore_index=True)

print(f"✅ Final combined dataset shape: {combined_df.shape}")

combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)

combined_df.to_csv('final_sentiment_dataset.csv', index=False)

print("🎯 Dataset ready with two columns: text and label!")
print(combined_df.head())


Processed full_df shape: (5006, 2)
Processed txt_sentoken shape: (2000, 2)
Processed rt_polaritydata shape: (10662, 2)
✅ Final combined dataset shape: (17668, 2)
🎯 Dataset ready with two columns: text and label!
                                                text  label
0      the film is really not so much bad as bland .      0
1  a sometimes incisive and sensitive portrait th...      0
2  the kind of nervous film that will either give...      1
3  falsehoods pile up , undermining the movie's r...      0
4  hoffman notches in the nuances of pain , but h...      1


In [16]:
# Assuming your DataFrame is called `df`
value_counts = final_df['label'].value_counts()

print(value_counts)


label
1    10140
0     7528
Name: count, dtype: int64
