## Load Data

In [1]:
import pandas as pd

df = pd.read_csv("data.csv")
print(df.head())
print(df.describe())

                                            Sentence Sentiment
0  The GeoSolutions technology will leverage Bene...  positive
1  $ESI on lows, down $1.50 to $2.50 BK a real po...  negative
2  For the last quarter of 2010 , Componenta 's n...  positive
3  According to the Finnish-Russian Chamber of Co...   neutral
4  The Swedish buyout firm has sold its remaining...   neutral
                                                 Sentence Sentiment
count                                                5842      5842
unique                                               5322         3
top     Net sales decreased to EUR 220.5 mn from EUR 4...   neutral
freq                                                    2      3130


## Data Preprocessing and Cleaning

In [3]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

df.dropna(subset=['Sentence'], inplace=True)

def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)
    
    text = text.lower()
    
    tokens = word_tokenize(text)
    
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
    
    return " ".join(stemmed_tokens)

df['processed_sentence'] = df['Sentence'].apply(preprocess_text)


## Feature Extraction

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X = tfidf_vectorizer.fit_transform(df['processed_sentence'])

## Sentiment Analysis

In [17]:
from textblob import TextBlob

def get_sentiment(text):
    analysis = TextBlob(text)
    
    return analysis.sentiment.polarity

df['predicted_score'] = df['processed_sentence'].apply(get_sentiment)

def get_sentiment_label(score):
    if score > 0.1:
        return 'positive'
    elif score < -0.1:
        return 'negative'
    else:
        return 'neutral'

df['predicted_label'] = df['predicted_score'].apply(get_sentiment_label)

In [18]:
print(df.head())

                                            Sentence Sentiment  \
0  The GeoSolutions technology will leverage Bene...  positive   
1  $ESI on lows, down $1.50 to $2.50 BK a real po...  negative   
2  For the last quarter of 2010 , Componenta 's n...  positive   
3  According to the Finnish-Russian Chamber of Co...   neutral   
4  The Swedish buyout firm has sold its remaining...   neutral   

                                  processed_sentence  predicted_score  \
0  geosolut technolog leverag benefon gp solut pr...        -0.331818   
1                            esi low bk real possibl         0.100000   
2  last quarter componenta net sale doubl eurm eu...         0.000000   
3  accord finnishrussian chamber commerc major co...         0.062500   
4  swedish buyout firm sold remain percent stake ...        -0.100000   

  predicted_label  
0        negative  
1         neutral  
2         neutral  
3         neutral  
4         neutral  


In [19]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(df['Sentiment'], df['predicted_label'])

print(accuracy)

0.5095857583019514


In [20]:
from sklearn.metrics import classification_report

report = classification_report(df['Sentiment'], df['predicted_label'])

print(report)

              precision    recall  f1-score   support

    negative       0.19      0.08      0.11       860
     neutral       0.55      0.78      0.65      3130
    positive       0.44      0.25      0.32      1852

    accuracy                           0.51      5842
   macro avg       0.39      0.37      0.36      5842
weighted avg       0.46      0.51      0.46      5842

