In [1]:
# 1. Data Cleaning
# 2. EDA
# 3. Text Preprocessing
# 4. Model Building
# 5. Evaluation
# 6. Improvement depending on evaluation
# 7. Website
# 8. Deploy

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('Sentiment.csv')
df

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral
...,...,...
5837,RISING costs have forced packaging producer Hu...,negative
5838,Nordic Walking was first used as a summer trai...,neutral
5839,"According shipping company Viking Line , the E...",neutral
5840,"In the building and home improvement trade , s...",neutral


In [4]:
df.head()

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral


In [5]:
df.sample(20)

Unnamed: 0,Sentence,Sentiment
3784,Cash flow from operations totalled EUR 2.71 mn...,positive
2058,Residents access to the block is planned to be...,neutral
5285,"$AAPL bounce right now has no vol, more vol wh...",positive
4570,Stora Enso 's third-quarter pre-tax profit dou...,positive
3397,Finnlines has six ships under construction in ...,neutral
3589,"SysOpen Digia Plc , Press release , 7 February...",positive
1153,"Last July , the group said it intended to relo...",neutral
2810,Central Europe is an important market area for...,neutral
990,Talvivaara Mining Company Plc Talvivaara Minin...,neutral
251,RBS will reportedly appoint Howard Davies as i...,neutral


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5842 entries, 0 to 5841
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Sentence   5842 non-null   object
 1   Sentiment  5842 non-null   object
dtypes: object(2)
memory usage: 91.4+ KB


In [7]:
df.describe()

Unnamed: 0,Sentence,Sentiment
count,5842,5842
unique,5322,3
top,Managing Director 's comments : `` Net sales f...,neutral
freq,2,3130


In [8]:
df['Sentiment'].value_counts()

neutral     3130
positive    1852
negative     860
Name: Sentiment, dtype: int64

In [9]:
df.shape

(5842, 2)

# data cleaning

In [10]:
df = df[~df['Sentiment'].isin(['neutral'])].reset_index(drop=True)

In [11]:
df

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,$SPY wouldn't be surprised to see a green close,positive
4,Shell's $70 Billion BG Deal Meets Shareholder ...,negative
...,...,...
2707,Operating profit fell to EUR 38.1 mn from EUR ...,negative
2708,HSBC Says Unit to Book $585 Million Charge on ...,negative
2709,Daily Mail parent company in talks with potent...,positive
2710,RISING costs have forced packaging producer Hu...,negative


In [12]:
df.isnull().sum()
# df.dropna(inplace=True)         # Drop null values

Sentence     0
Sentiment    0
dtype: int64

In [13]:
# df.drop(columns=['Extra_Column1','Extra_Column2'],inplace=True)           # for drop extra columns

In [14]:
df.duplicated().sum()
# df.drop_duplicates(keep='first',inplace=True)            # To remove duplicates values

0

In [15]:
df.shape

(2712, 2)

In [16]:
df['Sentiment'].value_counts()

positive    1852
negative     860
Name: Sentiment, dtype: int64

In [17]:
df

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,$SPY wouldn't be surprised to see a green close,positive
4,Shell's $70 Billion BG Deal Meets Shareholder ...,negative
...,...,...
2707,Operating profit fell to EUR 38.1 mn from EUR ...,negative
2708,HSBC Says Unit to Book $585 Million Charge on ...,negative
2709,Daily Mail parent company in talks with potent...,positive
2710,RISING costs have forced packaging producer Hu...,negative


# Balancing the data

In [18]:
# select the positive data
positive = df[df['Sentiment']=='positive']
positive.head()

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,$SPY wouldn't be surprised to see a green close,positive
6,Kone 's net sales rose by some 14 % year-on-ye...,positive
7,Circulation revenue has increased by 5 % in Fi...,positive


In [19]:
# select the negetive data
negative = df[df['Sentiment']=='negative']
negative.head()

Unnamed: 0,Sentence,Sentiment
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
4,Shell's $70 Billion BG Deal Meets Shareholder ...,negative
5,SSH COMMUNICATIONS SECURITY CORP STOCK EXCHANG...,negative
8,$SAP Q1 disappoints as #software licenses down...,negative
19,$AAPL afternoon selloff as usual will be bruta...,negative


In [20]:
# checking the shape of both
positive.shape, negative.shape

((1852, 2), (860, 2))

In [21]:
negative.shape[0]
# Now we have to select 2254 samples from neutral to balance the data

860

In [22]:
positive = positive.sample(negative.shape[0])

In [23]:
positive.shape

(860, 2)

In [24]:
positive.shape, negative.shape

((860, 2), (860, 2))

In [25]:
# Append spam data into spam data
data = positive.append(negative, ignore_index=True)

  data = positive.append(negative, ignore_index=True)


In [26]:
data

Unnamed: 0,Sentence,Sentiment
0,`` The CHF is a great product .,positive
1,$MXWL Oh snap. Just went long again. LT,positive
2,"EBIT excluding non-recurring items , totalled ...",positive
3,According to Deputy MD Pekka Silvennoinen the ...,positive
4,$CRM Sep 40 calls are +35% since entry #BANG h...,positive
...,...,...
1715,"$SBUX down PM, from $DB downgrade.. PT cut fro...",negative
1716,Finnish developer and manufacturer of mobile p...,negative
1717,Operating profit fell to EUR 38.1 mn from EUR ...,negative
1718,HSBC Says Unit to Book $585 Million Charge on ...,negative


In [27]:
data.head()

Unnamed: 0,Sentence,Sentiment
0,`` The CHF is a great product .,positive
1,$MXWL Oh snap. Just went long again. LT,positive
2,"EBIT excluding non-recurring items , totalled ...",positive
3,According to Deputy MD Pekka Silvennoinen the ...,positive
4,$CRM Sep 40 calls are +35% since entry #BANG h...,positive


In [28]:
data.shape

(1720, 2)

# Data Preprocessing

In [29]:
#1. Lower case
#2. Tokenization
#3. Remove special characters
#4. Removing stop words and pucntuation
#5. Stemming

In [30]:
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
# Initialize PorterStemmer and stop words
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))


In [31]:
def transform_text(text):
    text = text.lower()
    
    # Tokenize text
    text = nltk.word_tokenize(text)

    y = []
    for i in text:
        if i.isalnum():
            y.append(i)

    text = y[:] 
    y.clear()   

    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation:
            y.append(i)

    text = y[:]
    y.clear()   

    for i in text:
        y.append(ps.stem(i))

    return " ".join(y)

In [32]:
data['Sentence']

0                         `` The CHF is a great product .
1                 $MXWL Oh snap. Just went long again. LT
2       EBIT excluding non-recurring items , totalled ...
3       According to Deputy MD Pekka Silvennoinen the ...
4       $CRM Sep 40 calls are +35% since entry #BANG h...
                              ...                        
1715    $SBUX down PM, from $DB downgrade.. PT cut fro...
1716    Finnish developer and manufacturer of mobile p...
1717    Operating profit fell to EUR 38.1 mn from EUR ...
1718    HSBC Says Unit to Book $585 Million Charge on ...
1719    RISING costs have forced packaging producer Hu...
Name: Sentence, Length: 1720, dtype: object

In [33]:
data['Sentence'].apply(transform_text)

0                                       chf great product
1                               mxwl oh snap went long lt
2                    ebit exclud item total eur mn eur mn
3       accord deputi md pekka silvennoinen aim doubl ...
4                    crm sep 40 call sinc entri bang http
                              ...                        
1715                     sbux pm db downgrad pt cut 70 64
1716    finnish develop manufactur mobil phone charger...
1717                  oper profit fell eur mn eur mn 2007
1718      hsbc say unit book 585 million charg settlement
1719    rise cost forc packag produc huhtamaki axe 90 ...
Name: Sentence, Length: 1720, dtype: object

# Split the data into train and text sets

In [34]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data['Sentence'], data['Sentiment'], test_size=0.3, random_state=42)

from sklearn.pipeline import Pipeline
#there will be lot of repeated processes for training and testing the dataset separately, to avoid that we 
# are using pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
#we are importing Tfidfvectorizer to utilize bag of words model in sklearn
from sklearn.ensemble import RandomForestClassifier

In [35]:
classifier = Pipeline([('tfidf', TfidfVectorizer()), ('classifier', RandomForestClassifier(n_estimators=100))])
classifier.fit(x_train, y_train)

In [36]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
y_pred = classifier.predict(x_test)

# confusion matrix
confusion_matrix(y_test, y_pred)
confusion_matrix

#classification_report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

    negative       0.82      0.70      0.75       260
    positive       0.73      0.84      0.78       256

    accuracy                           0.77       516
   macro avg       0.78      0.77      0.77       516
weighted avg       0.78      0.77      0.77       516



In [37]:
accuracy_score(y_test, y_pred)

0.7693798449612403

In [38]:
# predict a real message 
classifier.predict(['Some situations are beyond control.'])

array(['negative'], dtype=object)

In [39]:
# predict a real message 
classifier.predict(['Life is full of opportunities to grow.'])

array(['positive'], dtype=object)