In [3]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import numpy as np 
import pandas as pd
from collections import Counter

### Original Data + mapped label

In [4]:
df = pd.read_csv("../Dataset/df_drop.csv", encoding="ISO-8859-1") # heading = none
#df = df.set_axis(['sentiment', 'news'], axis=1, inplace=False) # for all-data dataset

In [22]:
np.unique(df['sentiment'], return_counts=True),

((array(['negative', 'neutral', 'positive'], dtype=object),
  array([ 604, 2873, 1363], dtype=int64)),)

In [23]:
# MAP sentiment
label = {"positive": 2, "neutral": 1, "negative": 0}
df["sentiment"]=df["sentiment"].map(label)

### CLEANING Data

In [46]:
#import re

In [47]:
#Remove Punctuation & Emoji
for index, row  in enumerate(df.news):
    df["news"][index] = re.sub(r'[^\w\s]', '', row) # punctuation

In [48]:
df

Unnamed: 0,sentiment,news
0,1,According to Gran the company has no plans to...
1,1,Technopolis plans to develop in stages an area...
2,0,The international electronic industry company ...
3,2,With the new production plant the company woul...
4,2,According to the company s updated strategy fo...
...,...,...
4835,0,LONDON MarketWatch Share prices ended lower i...
4836,1,Rinkuskiai s beer sales fell by 65 per cent to...
4837,0,Operating profit fell to EUR 354 mn from EUR 6...
4838,0,Net sales of the Paper segment decreased to EU...


In [49]:
df.to_csv("../../Dataset/df_label.csv", index=False)

### Random Undersampling - df_rus

In [29]:
from imblearn.under_sampling import RandomUnderSampler

In [50]:
df = pd.read_csv("../../Dataset/df_label.csv", encoding="ISO-8859-1")

In [51]:
df.head()

Unnamed: 0,sentiment,news
0,1,According to Gran the company has no plans to...
1,1,Technopolis plans to develop in stages an area...
2,0,The international electronic industry company ...
3,2,With the new production plant the company woul...
4,2,According to the company s updated strategy fo...


In [52]:
# Splitting the training set in X and y
X_df = df.copy()
y_df = df['sentiment']
X_df = X_df.drop(columns='sentiment')

In [53]:
rus = RandomUnderSampler(random_state=42)
X_df_rus, y_df_rus = rus.fit_resample(X_df, y_df)

In [54]:
print('Original dataset shape %s' % Counter(y_df))
print('Resampled dataset shape %s' % Counter(y_df_rus))

Original dataset shape Counter({1: 2873, 2: 1363, 0: 604})
Resampled dataset shape Counter({0: 604, 1: 604, 2: 604})


In [55]:
df_rus = X_df_rus
df_rus['sentiment'] = y_df_rus

In [56]:
df_rus['sentiment'].value_counts()

0    604
1    604
2    604
Name: sentiment, dtype: int64

In [57]:
df_rus.to_csv("../../Dataset/df_rus.csv", index=False)

### Undersampling with % - df_under

In [58]:
df.sentiment.value_counts()

1    2873
2    1363
0     604
Name: sentiment, dtype: int64

In [59]:
# {2: 2873, 1: 1363, 0: 604}
print('Negative class ratio: ',   604/(2873+1363+604))
print('Positive class ratio: ',  1363/(2873+1363+604))
print('Neutral class ratio: ',   2879/(2873+1363+604))
print('0.35 of Neutral class: ',  int(2873*0.35))
print('0.55 of Positive class: ', int(1363*0.55))
print('1.0 of Negative class: ',  int(604))

Negative class ratio:  0.12479338842975207
Positive class ratio:  0.2816115702479339
Neutral class ratio:  0.5948347107438017
0.35 of Neutral class:  1005
0.55 of Positive class:  749
1.0 of Negative class:  604


In [60]:
rus = RandomUnderSampler(
    sampling_strategy={
        0: 604,
        1: 749,
        2: 1005
    }, 
    random_state=42)
X_under, y_under = rus.fit_resample(X_df, y_df)
print('Resampled dataset shape %s' % Counter(y_under))

Resampled dataset shape Counter({2: 1005, 1: 749, 0: 604})


In [61]:
print('Negative class ratio: ', 604/(1005+749+604))
print('Positive class ratio: ', 749/(1005+749+604))
print('Neutral class ratio: ', 1005/(1005+749+604))

Negative class ratio:  0.2561492790500424
Positive class ratio:  0.3176420695504665
Neutral class ratio:  0.4262086513994911


In [62]:
df_under = X_under
df_under['sentiment'] = y_under

In [63]:
df_under['sentiment'].value_counts()

2    1005
1     749
0     604
Name: sentiment, dtype: int64

In [64]:
df_under.to_csv("../../Dataset/df_under.csv", index=False)