## Load Data

In [1]:
import pandas as pd

# Load the dataset from data.csv, then print its first rows followed by
# summary statistics (one print call, one item per line).
df = pd.read_csv("data.csv")
print(df.head(), df.describe(), sep="\n")

                                            Sentence Sentiment
0  The GeoSolutions technology will leverage Bene...  positive
1  $ESI on lows, down $1.50 to $2.50 BK a real po...  negative
2  For the last quarter of 2010 , Componenta 's n...  positive
3  According to the Finnish-Russian Chamber of Co...   neutral
4  The Swedish buyout firm has sold its remaining...   neutral
                                                 Sentence Sentiment
count                                                5842      5842
unique                                               5322         3
top     Net sales decreased to EUR 220.5 mn from EUR 4...   neutral
freq                                                    2      3130


## Data Preprocessing and Cleaning

In [3]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Remove rows whose 'Sentence' value is missing, modifying df in place.
# preprocess_text below passes each value straight to re.sub, which needs a string.
df.dropna(subset=['Sentence'], inplace=True)

# Compiled once. The flags are passed to re.compile by keyword because the
# fourth positional argument of re.sub is `count`, not `flags`; passing
# re.I|re.A there silently limits the number of substitutions to 258 and
# applies no flags at all.
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]', flags=re.I | re.A)

# Lazily filled cache so the stop-word list and stemmer are built once
# instead of once per sentence.
_TEXT_TOOLS = {}


def _get_text_tools():
    """Return the (stop-word set, stemmer) pair, creating it on first use."""
    if not _TEXT_TOOLS:
        _TEXT_TOOLS['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_TOOLS['stemmer'] = PorterStemmer()
    return _TEXT_TOOLS['stop_words'], _TEXT_TOOLS['stemmer']


def preprocess_text(text: str) -> str:
    """Normalise a sentence into a space-joined string of stemmed tokens.

    Steps, in order:
      1. Delete every character that is not an ASCII letter or ASCII whitespace.
      2. Lower-case the text.
      3. Tokenise with ``word_tokenize``.
      4. Drop English stop words.
      5. Stem each remaining token with ``PorterStemmer``.

    Parameters
    ----------
    text : str
        The raw sentence.

    Returns
    -------
    str
        The stemmed, stop-word-free tokens joined by single spaces
        (an empty string if no tokens remain).
    """
    stop_words, stemmer = _get_text_tools()

    # Remove all non-letter characters (no substitution limit).
    letters_only = _NON_LETTER_RE.sub('', text)

    tokens = word_tokenize(letters_only.lower())

    stemmed_tokens = [
        stemmer.stem(word) for word in tokens if word not in stop_words
    ]

    return " ".join(stemmed_tokens)

# Run preprocess_text on every sentence and keep the result in a new column.
df['processed_sentence'] = df['Sentence'].map(preprocess_text)
