# Import libraries and read the data

In [None]:
import numpy as np
import pandas as pd

In [None]:
import nltk
# Fetch the NLTK resources this notebook relies on.
# 'wordnet' is the corpus behind the WordNetLemmatizer used in preprocessing.
# NOTE(review): 'punkt' serves word_tokenize, which is imported below but never
# called in the visible cells — confirm the download is still needed.
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Read the header-less CSV and name its two columns explicitly.
data = pd.read_csv(
    "all-data.csv",
    sep=',',
    encoding='latin-1',
    header=None,
    names=['Sentiment', 'Message'],
)
data

Unnamed: 0,Sentiment,Message
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...
...,...,...
4841,negative,LONDON MarketWatch -- Share prices ended lower...
4842,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4844,negative,Net sales of the Paper segment decreased to EU...


In [None]:
! wget https://nlp.stanford.edu/data/glove.6B.zip

--2023-04-24 17:04:17--  https://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-04-24 17:04:18--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2023-04-24 17:06:57 (5.17 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]



In [None]:
!unzip glove*.zip

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


# preprocessing

In [None]:
from nltk.tokenize import word_tokenize

In [None]:
#library that contains punctuation
import string
string.punctuation

# Strip punctuation from a piece of text.
def remove_punctuation(text):
    """Return ``text`` with every character listed in ``string.punctuation`` removed."""
    # A translation table that maps each punctuation character to "delete".
    drop_table = str.maketrans('', '', string.punctuation)
    return text.translate(drop_table)

# Keep the punctuation-free text in its own column and preview the result.
data['clean_msg'] = data['Message'].apply(remove_punctuation)
data.head()

Unnamed: 0,Sentiment,Message,clean_msg
0,neutral,"According to Gran , the company has no plans t...",According to Gran the company has no plans to...
1,neutral,Technopolis plans to develop in stages an area...,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...,The international electronic industry company ...
3,positive,With the new production plant the company woul...,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...,According to the company s updated strategy fo...


In [None]:
# Lower-case the cleaned text so that matching later on is case-insensitive.
data['msg_lower'] = data['clean_msg'].apply(str.lower)
data.head()

Unnamed: 0,Sentiment,Message,clean_msg,msg_lower
0,neutral,"According to Gran , the company has no plans t...",According to Gran the company has no plans to...,according to gran the company has no plans to...
1,neutral,Technopolis plans to develop in stages an area...,Technopolis plans to develop in stages an area...,technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...,The international electronic industry company ...,the international electronic industry company ...
3,positive,With the new production plant the company woul...,With the new production plant the company woul...,with the new production plant the company woul...
4,positive,According to the company 's updated strategy f...,According to the company s updated strategy fo...,according to the company s updated strategy fo...


In [None]:
# tokenization
#defining function for tokenization
import re
def tokenization(text):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    return text.split()
# Tokenise the lower-cased text, one token list per row.
data['msg_tokenied'] = data['msg_lower'].apply(tokenization)
data.head()

Unnamed: 0,Sentiment,Message,clean_msg,msg_lower,msg_tokenied
0,neutral,"According to Gran , the company has no plans t...",According to Gran the company has no plans to...,according to gran the company has no plans to...,"[according, to, gran, the, company, has, no, p..."
1,neutral,Technopolis plans to develop in stages an area...,Technopolis plans to develop in stages an area...,technopolis plans to develop in stages an area...,"[technopolis, plans, to, develop, in, stages, ..."
2,negative,The international electronic industry company ...,The international electronic industry company ...,the international electronic industry company ...,"[the, international, electronic, industry, com..."
3,positive,With the new production plant the company woul...,With the new production plant the company woul...,with the new production plant the company woul...,"[with, the, new, production, plant, the, compa..."
4,positive,According to the company 's updated strategy f...,According to the company s updated strategy fo...,according to the company s updated strategy fo...,"[according, to, the, company, s, updated, stra..."


In [None]:
from nltk.stem import WordNetLemmatizer
# Single shared lemmatizer instance, reused for every token by lemmatizer() below.
wordnet_lemmatizer = WordNetLemmatizer()

# Lemmatise a list of tokens.
def lemmatizer(text):
    """Return a new list holding the WordNet lemma of each token in ``text``.

    ``lemmatize`` is called without a part-of-speech argument; the notebook output
    shows the effect on verbs such as "has", which comes back as "ha".
    """
    lemmas = []
    for token in text:
        lemmas.append(wordnet_lemmatizer.lemmatize(token))
    return lemmas
  
# Lemmatise every token list.
data['msg_lemmatized'] = data['msg_tokenied'].apply(lemmatizer)

In [None]:
# Display the frame with every intermediate preprocessing column.
data

Unnamed: 0,Sentiment,Message,clean_msg,msg_lower,msg_tokenied,msg_lemmatized
0,neutral,"According to Gran , the company has no plans t...",According to Gran the company has no plans to...,according to gran the company has no plans to...,"[according, to, gran, the, company, has, no, p...","[according, to, gran, the, company, ha, no, pl..."
1,neutral,Technopolis plans to develop in stages an area...,Technopolis plans to develop in stages an area...,technopolis plans to develop in stages an area...,"[technopolis, plans, to, develop, in, stages, ...","[technopolis, plan, to, develop, in, stage, an..."
2,negative,The international electronic industry company ...,The international electronic industry company ...,the international electronic industry company ...,"[the, international, electronic, industry, com...","[the, international, electronic, industry, com..."
3,positive,With the new production plant the company woul...,With the new production plant the company woul...,with the new production plant the company woul...,"[with, the, new, production, plant, the, compa...","[with, the, new, production, plant, the, compa..."
4,positive,According to the company 's updated strategy f...,According to the company s updated strategy fo...,according to the company s updated strategy fo...,"[according, to, the, company, s, updated, stra...","[according, to, the, company, s, updated, stra..."
...,...,...,...,...,...,...
4841,negative,LONDON MarketWatch -- Share prices ended lower...,LONDON MarketWatch Share prices ended lower i...,london marketwatch share prices ended lower i...,"[london, marketwatch, share, prices, ended, lo...","[london, marketwatch, share, price, ended, low..."
4842,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...,Rinkuskiai s beer sales fell by 65 per cent to...,rinkuskiai s beer sales fell by 65 per cent to...,"[rinkuskiai, s, beer, sales, fell, by, 65, per...","[rinkuskiai, s, beer, sale, fell, by, 65, per,..."
4843,negative,Operating profit fell to EUR 35.4 mn from EUR ...,Operating profit fell to EUR 354 mn from EUR 6...,operating profit fell to eur 354 mn from eur 6...,"[operating, profit, fell, to, eur, 354, mn, fr...","[operating, profit, fell, to, eur, 354, mn, fr..."
4844,negative,Net sales of the Paper segment decreased to EU...,Net sales of the Paper segment decreased to EU...,net sales of the paper segment decreased to eu...,"[net, sales, of, the, paper, segment, decrease...","[net, sale, of, the, paper, segment, decreased..."


In [None]:
nltk.download('stopwords')
# English stop words shipped with NLTK. They are held in a set so that the
# per-token membership test done during stop-word removal is a hash lookup
# instead of a scan through a list.
stopwords = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Filter stop words out of an already tokenised text.
def remove_stopwords(text):
    """Return the tokens of ``text`` that are not in the module-level ``stopwords``."""
    kept = []
    for token in text:
        if token in stopwords:
            continue
        kept.append(token)
    return kept

# Remove stop words from the lemmatised tokens and preview the result.
data['no_stopwords'] = data['msg_lemmatized'].apply(remove_stopwords)
data.head()

Unnamed: 0,Sentiment,Message,clean_msg,msg_lower,msg_tokenied,msg_lemmatized,no_stopwords
0,neutral,"According to Gran , the company has no plans t...",According to Gran the company has no plans to...,according to gran the company has no plans to...,"[according, to, gran, the, company, has, no, p...","[according, to, gran, the, company, ha, no, pl...","[according, gran, company, ha, plan, move, pro..."
1,neutral,Technopolis plans to develop in stages an area...,Technopolis plans to develop in stages an area...,technopolis plans to develop in stages an area...,"[technopolis, plans, to, develop, in, stages, ...","[technopolis, plan, to, develop, in, stage, an...","[technopolis, plan, develop, stage, area, le, ..."
2,negative,The international electronic industry company ...,The international electronic industry company ...,the international electronic industry company ...,"[the, international, electronic, industry, com...","[the, international, electronic, industry, com...","[international, electronic, industry, company,..."
3,positive,With the new production plant the company woul...,With the new production plant the company woul...,with the new production plant the company woul...,"[with, the, new, production, plant, the, compa...","[with, the, new, production, plant, the, compa...","[new, production, plant, company, would, incre..."
4,positive,According to the company 's updated strategy f...,According to the company s updated strategy fo...,according to the company s updated strategy fo...,"[according, to, the, company, s, updated, stra...","[according, to, the, company, s, updated, stra...","[according, company, updated, strategy, year, ..."


In [None]:
# Drop every row whose token list is empty after cleaning. Filtering on the
# condition, rather than on a hard-coded index label, keeps this cell re-runnable
# (dropping a label that is already gone raises KeyError) and keeps it correct if
# the input data changes.
data = data[data['no_stopwords'].map(len) > 0]

In [None]:
# Sanity check over ALL rows: list the index labels whose token list is empty.
# The earlier loop stopped after the first row because its `break` sat at loop
# level, so it could never report an empty row further down.
empty_rows = data.index[data['no_stopwords'].map(len) == 0].tolist()
print("Rows with no tokens left after cleaning: %s" % empty_rows)

# Initializing GloVe word embeddings

In [None]:
# Load the 50-dimensional GloVe vectors into a {word: float32 vector} dict.
embeddings_index = {}
# The context manager closes the file even if a line fails to parse. The explicit
# encoding (the GloVe text files are UTF-8) keeps the read independent of the
# platform's default encoding.
with open('glove.6B.50d.txt', encoding='utf-8') as f:
    for line in f:
        # Each line is: the word followed by its space-separated coefficients.
        word, *coefs = line.split()
        embeddings_index[word] = np.asarray(coefs, dtype='float32')
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['no_stopwords'],
    data['Sentiment'],
    test_size=0.1,
    random_state=42,
)

# sentence to vector

In [None]:
def doc_to_vec(text, embeddings=None, dim=50):
    """Average the word vectors of the tokens in ``text`` into one document vector.

    Tokens that have no embedding are ignored. They used to be replaced by a fresh
    unseeded random vector on every call, which made the features differ between
    runs and gave the same unknown word a different vector each time it occurred.

    Args:
        text: iterable of token strings.
        embeddings: mapping from token to vector. Defaults to the notebook-level
            ``embeddings_index``.
        dim: length of the zero vector returned when no token of ``text`` has an
            embedding (this includes an empty ``text``). It should equal the size
            of the embedding vectors.

    Returns:
        A list of floats: the element-wise mean of the vectors that were found, or
        ``dim`` zeros when none were found.
    """
    if embeddings is None:
        embeddings = embeddings_index

    vectors = [embeddings[token] for token in text if token in embeddings]

    # np.mean over an empty array yields a scalar NaN, which cannot be turned into
    # a list; return a well-defined zero vector instead.
    if not vectors:
        return list(np.zeros(dim, dtype='float32'))

    return list(np.mean(np.array(vectors), axis=0))

In [None]:
def apply_on_all_data(data):
    """Vectorise every document in ``data`` with ``doc_to_vec``.

    Returns a NumPy array built from the per-document vectors, in input order.
    """
    return np.array([doc_to_vec(document) for document in data])


In [None]:
# Turn each split into an array with one averaged embedding per document.
X_train_vec, X_test_vec = (apply_on_all_data(split) for split in (X_train, X_test))

In [None]:
# Confirm the training matrix shape: one row per document, one column per embedding dimension.
X_train_vec.shape

(4360, 50)

In [None]:
# Same per-document vectors, kept as pandas Series that share the splits' index.
# NOTE(review): neither Series is referenced again in the visible cells.
vec_X_train = X_train.apply(doc_to_vec)
vec_X_test = X_test.apply(doc_to_vec)

# LogisticRegression

In [None]:
from sklearn.preprocessing import LabelEncoder
# Learn the label -> integer mapping from the training labels, then encode them.
lb = LabelEncoder().fit(y_train)
y_train_encode = lb.transform(y_train)

In [None]:
# Encode the test labels with the mapping already learned by `lb`. Calling
# fit_transform here would re-fit the encoder on the test labels, which can assign
# different integers to the same class when the label sets differ.
y_test_encode = lb.transform(y_test)

In [None]:
from sklearn.linear_model import LogisticRegression
# With the default iteration limit the solver stops early on this data
# ("TOTAL NO. of ITERATIONS REACHED LIMIT"), so give it room to converge.
clf = LogisticRegression(max_iter=1000)

In [None]:
# Inspect the integer-encoded training labels.
y_train_encode


array([1, 1, 1, ..., 1, 1, 2])

In [None]:
# Train the classifier on the averaged-embedding features.
clf.fit(X=X_train_vec, y=y_train_encode)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Predict the encoded sentiment class of every held-out document.
y_pred = clf.predict(X=X_test_vec)

In [None]:
from sklearn.metrics import classification_report
# Label the report rows with the sentiment names instead of the bare integer
# codes. `labels` lists the codes in encoder order so that each name lines up with
# the right row.
print(classification_report(
    y_test_encode,
    y_pred,
    labels=list(range(len(lb.classes_))),
    target_names=list(lb.classes_),
))

              precision    recall  f1-score   support

           0       0.62      0.43      0.51        60
           1       0.69      0.88      0.78       280
           2       0.55      0.34      0.42       145

    accuracy                           0.66       485
   macro avg       0.62      0.55      0.57       485
weighted avg       0.64      0.66      0.64       485

