In [15]:
import pandas as pd
import numpy as np

import warnings
# NOTE(review): with no category or module filter this silences every warning
# for the rest of the session, including any raised by the model-fitting cells
# further down. Consider narrowing it once the notebook is stable.
warnings.filterwarnings('ignore')

In [16]:
# Load the CSV and label its columns `Sentiment` and `News` via `names`.
data = pd.read_csv(
    '../Data/all-data.csv',
    names=['Sentiment', "News"],
    encoding='ISO-8859-1',
)

In [17]:
# Show the loaded frame (pandas abbreviates the middle rows in the output).
data

Unnamed: 0,Sentiment,News
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...
...,...,...
4841,negative,LONDON MarketWatch -- Share prices ended lower...
4842,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4844,negative,Net sales of the Paper segment decreased to EU...


In [None]:
######## STEPS ######### 
# 1. Tokenization and Lowering - Done
# 2. Stopword Removal + Stemming / Lemmatization - Done
# 3. Train Test Split - Done
# 4. Word2Vec Training on X_train - Done
# 5. AvgWord2Vec on X_train and X_test - Done
# 6. Baseline modelling - Logistic Regression - Done
# 7. Train a list of candidate models - find the best one
# 8. Perform cross-validation to check that results are consistent
# 9. Perform hyperparameter tuning for more accurate results

In [19]:
import re

# Replace every character that is not an ASCII letter or digit with a space.
# The vectorised `.str.replace` applies the same regular expression as a
# per-row `re.sub` call, without a Python-level lambda, and propagates missing
# values instead of raising TypeError on them.
data['News'] = data['News'].str.replace(r'[^a-zA-Z0-9]', ' ', regex=True)

In [20]:
# Lower-case the text in the `News` column.
data['News'] = data['News'].str.lower()

In [21]:
from nltk.tokenize import word_tokenize

# Tokenise each row of `News`; `word_tokenize` is handed to `apply` directly,
# so it receives every cell value as its only argument.
data['News'] = data['News'].apply(word_tokenize)

In [22]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# English stop words, held in a set because `stop_lemma` tests membership for
# every token.
stop_word_dict = set(stopwords.words('english'))
# Single lemmatizer instance that is passed into `stop_lemma`.
lemmatizer = WordNetLemmatizer()

def stop_lemma(x, stop_word_dict, lemmatizer):
    """Drop stop words from a token sequence and lemmatize what remains.

    Args:
        x: Iterable of tokens.
        stop_word_dict: Container of tokens to discard. Each token is checked
            against it *before* lemmatization.
        lemmatizer: Object whose ``lemmatize(word)`` method is applied to every
            token that is kept.

    Returns:
        list: The lemmatized tokens, in their original order.
    """
    kept = []
    for token in x:
        if token in stop_word_dict:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return kept

# Run `stop_lemma` over every token list; the stop-word set and the lemmatizer
# are forwarded as extra positional arguments through `args`.
data['News'] = data['News'].apply(stop_lemma, args=(stop_word_dict, lemmatizer))

In [24]:
# Encode the sentiment labels as integers.
# `Series.map` turns any value missing from the mapping into NaN without
# raising. That also happens when this cell is run a second time on the
# already-encoded column. Validate before assigning so the column is never
# overwritten with NaN and the failure surfaces here rather than in a later cell.
sentiment_codes = {'neutral' : 0, 'positive' : 1, 'negative' : -1}
encoded_sentiment = data['Sentiment'].map(sentiment_codes)
if encoded_sentiment.isna().any():
    raise ValueError(
        "Sentiment contains values outside "
        f"{sorted(sentiment_codes)} (or the column was already encoded)"
    )
data['Sentiment'] = encoded_sentiment

In [236]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The split is stratified on the label
# column and uses a fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['News'],
    data['Sentiment'],
    test_size=0.2,
    random_state=42,
    stratify=data['Sentiment'],
)

In [237]:
from gensim.models import Word2Vec

# Train skip-gram (sg=1) embeddings on the training token lists only, so no
# test-set text influences the vectors.
# Reproducibility: gensim trains with several worker threads by default, and
# thread scheduling makes the learned vectors differ from run to run. Pinning
# the seed and using a single worker makes training repeatable within one
# interpreter session (across sessions PYTHONHASHSEED must also be fixed).
model = Word2Vec(
    X_train.tolist(),
    min_count=1,
    sg=1,
    vector_size=600,
    window=2,
    seed=42,
    workers=1,
)

In [238]:
def AvgWord2Vec(model, words):
    """Average the embedding vectors of the known words in ``words``.

    Args:
        model: Object exposing ``wv`` (supports ``word in wv`` and
            ``wv[word]``) and an integer ``vector_size``.
        words: Iterable of tokens to look up.

    Returns:
        numpy.ndarray: Element-wise mean of the vectors found in ``model.wv``.
        If none of the words is present, a zero vector of length
        ``model.vector_size`` is returned instead.
    """
    embeddings = model.wv
    known = []
    for token in words:
        if token in embeddings:
            known.append(embeddings[token])

    # Nothing to average: fall back to the all-zero vector.
    if not known:
        return np.zeros(model.vector_size)
    return np.mean(known, axis=0)


In [239]:
# Replace every token list with its averaged embedding. `model` must still be
# the trained Word2Vec instance when this cell runs.
X_train = X_train.apply(lambda tokens: AvgWord2Vec(model, tokens))
X_test = X_test.apply(lambda tokens: AvgWord2Vec(model, tokens))

In [240]:
# Discard the row labels inherited from `data` and renumber both series from 0.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

In [241]:
# Expand the per-row vectors into a 2-D frame: one row per document, one
# column per embedding dimension.
X_test = pd.DataFrame(X_test.to_list())
X_train = pd.DataFrame(X_train.to_list())

In [242]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier, constructed with class_weight='balanced'.
# NOTE(review): this rebinds `model`, which until now held the Word2Vec
# embeddings used by the averaging cell. That cell cannot be re-run after this
# one unless Word2Vec is trained again first.
model = LogisticRegression(class_weight='balanced')

model.fit(X_train, y_train)

In [243]:
# Show the test-set feature frame (one column per embedding dimension).
X_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,590,591,592,593,594,595,596,597,598,599
0,0.086994,0.003112,0.060419,0.107827,0.015891,-0.053303,0.015671,0.157400,-0.000550,-0.013291,...,0.067567,0.014031,0.083730,-0.051580,0.058685,0.088639,-0.049269,-0.117671,0.022796,-0.044851
1,0.084314,0.002017,0.056924,0.102776,0.014346,-0.048562,0.012855,0.149453,0.002110,-0.013974,...,0.063780,0.011374,0.080100,-0.052085,0.055614,0.085410,-0.049008,-0.111905,0.021474,-0.044230
2,0.073395,0.005208,0.028690,0.067343,0.013578,0.001604,-0.000061,0.106923,0.014545,-0.019526,...,0.051584,-0.010209,0.055137,-0.036361,0.038917,0.051766,-0.044436,-0.073624,0.003473,-0.033771
3,0.076986,0.005415,0.030422,0.072123,0.014241,0.001934,-0.000129,0.113377,0.015599,-0.020379,...,0.053817,-0.010964,0.058174,-0.038377,0.040904,0.054518,-0.046848,-0.078003,0.003993,-0.035707
4,0.090677,-0.005486,0.096206,0.146692,0.011542,-0.126077,0.038156,0.200143,-0.024547,0.002242,...,0.075668,0.049308,0.111450,-0.063286,0.079325,0.130580,-0.046595,-0.156516,0.042795,-0.050409
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
965,0.078422,0.006102,0.030653,0.071478,0.014725,0.003005,-0.000961,0.114144,0.015787,-0.021954,...,0.055216,-0.011684,0.058164,-0.038679,0.041482,0.054217,-0.047617,-0.079373,0.003757,-0.035135
966,0.072975,0.004543,0.031493,0.068948,0.013511,-0.003143,0.001705,0.109893,0.011978,-0.018648,...,0.051242,-0.006683,0.057235,-0.036844,0.039941,0.054489,-0.043026,-0.076702,0.005543,-0.033911
967,0.089401,0.001812,0.073805,0.123288,0.012973,-0.082039,0.023038,0.172931,-0.011393,-0.008312,...,0.071989,0.027007,0.097225,-0.055304,0.064711,0.106658,-0.052177,-0.134039,0.032716,-0.049044
968,0.063725,0.003781,0.025194,0.058561,0.012502,0.001535,0.000504,0.093174,0.012194,-0.016879,...,0.044470,-0.008118,0.047134,-0.030931,0.033688,0.045254,-0.038189,-0.065060,0.003582,-0.029767


In [220]:
# Predict a label for every row of the test features with the fitted `model`.
y_pred = model.predict(X_test)

In [221]:
from sklearn.metrics import classification_report

In [208]:
# NOTE(review): this cell's execution count (208) is lower than that of the
# prediction cell (220), so the report shown beneath it comes from an earlier
# run and may not match the current `y_pred`.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.31      0.54      0.40       121
           0       0.69      0.76      0.73       576
           1       0.32      0.15      0.20       273

    accuracy                           0.56       970
   macro avg       0.44      0.48      0.44       970
weighted avg       0.54      0.56      0.54       970



In [222]:
# Per-class precision, recall and F1 of `y_pred` against `y_test`.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.31      0.53      0.39       121
           0       0.69      0.75      0.72       576
           1       0.30      0.15      0.20       273

    accuracy                           0.55       970
   macro avg       0.43      0.48      0.44       970
weighted avg       0.53      0.55      0.53       970

