**IMPORTING LIBRARIES**

In [None]:
import spacy as sp
from spacy.lang.en.stop_words import STOP_WORDS
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import pandas as pd

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
from nltk.tokenize import sent_tokenize

In [None]:
import re

In [None]:
# Fetch the VADER lexicon (presumably required by SentimentIntensityAnalyzer, imported above).
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [None]:
# Fetch the NLTK stop-word corpus, then keep the English list as a set so
# that membership checks during preprocessing are constant-time lookups.
nltk.download('stopwords')
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Fetch the Punkt tokenizer models (presumably what word_tokenize / sent_tokenize rely on).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Fetch the WordNet corpus (presumably backing WordNetLemmatizer, imported above).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

**DATASET INFORMATION**

In [None]:
# Load the training data from the working directory. latin1 decoding is
# requested explicitly -- presumably the file is not valid UTF-8 (TODO confirm).
df=pd.read_csv('train.csv', encoding='latin1')

In [None]:
# Keep only the label column and the raw text column.
df=df[['sentiment','text']]

In [None]:
# Drop every row where either the sentiment or the text is missing.
df=df.dropna()

In [None]:
df.shape

(27480, 2)

In [None]:
df

Unnamed: 0,sentiment,text
0,neutral,"I`d have responded, if I were going"
1,negative,Sooo SAD I will miss you here in San Diego!!!
2,negative,my boss is bullying me...
3,negative,what interview! leave me alone
4,negative,"Sons of ****, why couldn`t they put them on t..."
...,...,...
27476,negative,wish we could come see u on Denver husband l...
27477,negative,I`ve wondered about rake to. The client has ...
27478,positive,Yay good for both of you. Enjoy the break - y...
27479,positive,But it was worth it ****.


**PREPROCESSING OF TEXT**

In [None]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


def preprocess_text(text):
    """Normalise one raw text into a space-separated string of lemmas.

    The text is lower-cased and tokenised with ``word_tokenize``. Tokens that
    are not purely alphabetic, or that appear in ``stop_words``, are dropped;
    the remaining tokens are lemmatised with ``lemmatizer`` and joined with
    single spaces.

    Parameters
    ----------
    text : str
        Raw text of one row.

    Returns
    -------
    str
        The cleaned text; an empty string when no token survives filtering.
    """
    tokens = word_tokenize(text.lower())
    return " ".join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in stop_words
    )


# Assign the whole column in one step instead of iterating with iterrows()
# and writing cell-by-cell through df.at, which is far slower on ~27k rows.
df['processed__text'] = df['text'].apply(preprocess_text)


In [None]:
df

Unnamed: 0,sentiment,text,processed__text
0,neutral,"I`d have responded, if I were going",responded going
1,negative,Sooo SAD I will miss you here in San Diego!!!,sooo sad miss san diego
2,negative,my boss is bullying me...,bos bullying
3,negative,what interview! leave me alone,interview leave alone
4,negative,"Sons of ****, why couldn`t they put them on t...",son put release already bought
...,...,...,...
27476,negative,wish we could come see u on Denver husband l...,wish could come see u denver husband lost job ...
27477,negative,I`ve wondered about rake to. The client has ...,wondered rake client made clear force devs lea...
27478,positive,Yay good for both of you. Enjoy the break - y...,yay good enjoy break probably need hectic week...
27479,positive,But it was worth it ****.,worth


**EMOJI EXTRACTION**

In [None]:
def extract_emojis(text):
    """Return every run of consecutive emoji characters found in ``text``.

    Only code points inside the four Unicode ranges listed below are matched.
    Adjacent matching characters are returned together as one string, so the
    result is a list of emoji runs (empty when nothing matches).
    """
    emoji_ranges = (
        "\U0001F600-\U0001F64F"  # Emoticons
        "\U0001F300-\U0001F5FF"  # Symbols & Pictographs
        "\U0001F680-\U0001F6FF"  # Transport & Map Symbols
        "\U0001F1E0-\U0001F1FF"  # Flags (iOS)
    )
    matcher = re.compile("[" + emoji_ranges + "]+", flags=re.UNICODE)
    return matcher.findall(text)

In [None]:
# Collect the emoji runs of every raw text into a new list-valued column.
df['Emojis'] = df['text'].apply(extract_emojis)

In [None]:
df

Unnamed: 0,sentiment,text,processed__text,Emojis
0,neutral,"I`d have responded, if I were going",responded going,[]
1,negative,Sooo SAD I will miss you here in San Diego!!!,sooo sad miss san diego,[]
2,negative,my boss is bullying me...,bos bullying,[]
3,negative,what interview! leave me alone,interview leave alone,[]
4,negative,"Sons of ****, why couldn`t they put them on t...",son put release already bought,[]
...,...,...,...,...
27476,negative,wish we could come see u on Denver husband l...,wish could come see u denver husband lost job ...,[]
27477,negative,I`ve wondered about rake to. The client has ...,wondered rake client made clear force devs lea...,[]
27478,positive,Yay good for both of you. Enjoy the break - y...,yay good enjoy break probably need hectic week...,[]
27479,positive,But it was worth it ****.,worth,[]


**EXCLAMATION MARK EXTRACTION**

In [None]:
def extract_exclamation_marks(text):
    """Return a list with one '!' string per exclamation mark in ``text``."""
    return [hit.group() for hit in re.finditer(r'!', text)]


In [None]:
# Store the list of exclamation marks found in every raw text.
df['Exclamation Marks'] = df['text'].apply(extract_exclamation_marks)

In [None]:
df

Unnamed: 0,sentiment,text,processed__text,Emojis,Exclamation Marks
0,neutral,"I`d have responded, if I were going",responded going,[],[]
1,negative,Sooo SAD I will miss you here in San Diego!!!,sooo sad miss san diego,[],"[!, !, !]"
2,negative,my boss is bullying me...,bos bullying,[],[]
3,negative,what interview! leave me alone,interview leave alone,[],[!]
4,negative,"Sons of ****, why couldn`t they put them on t...",son put release already bought,[],[]
...,...,...,...,...,...
27476,negative,wish we could come see u on Denver husband l...,wish could come see u denver husband lost job ...,[],[]
27477,negative,I`ve wondered about rake to. The client has ...,wondered rake client made clear force devs lea...,[],[]
27478,positive,Yay good for both of you. Enjoy the break - y...,yay good enjoy break probably need hectic week...,[],[]
27479,positive,But it was worth it ****.,worth,[],[]


**SENTIMENT WORD EXTRACTION**

In [None]:
def extract_sentiment_words(processed__text, sid=None):
    """Count positive, negative and neutral words in a preprocessed text.

    The text is split into sentences and then into word tokens. Every token
    is scored on its own with VADER: a compound score >= 0.3 counts as
    positive, <= -0.3 as negative, and anything in between as neutral.

    Parameters
    ----------
    processed__text : str
        Preprocessed text whose tokens should be scored.
    sid : SentimentIntensityAnalyzer, optional
        Analyzer to reuse across calls. A new one is created when omitted,
        which keeps the original single-argument call working.

    Returns
    -------
    tuple of (int, int, int)
        ``(positive, negative, neutral)`` word counts. A tuple is always
        returned; empty input yields ``(0, 0, 0)``.
    """
    if sid is None:
        sid = SentimentIntensityAnalyzer()

    posword = 0
    negword = 0
    neuword = 0
    for sentence in nltk.sent_tokenize(processed__text):
        for word in nltk.word_tokenize(sentence):
            compound = sid.polarity_scores(word)['compound']
            if compound >= 0.3:
                posword += 1
            elif compound <= -0.3:
                negword += 1
            else:
                neuword += 1

    # The return sits after the sentence loop on purpose: returning inside it
    # stopped after the first sentence and returned None for empty text,
    # which surfaced as NaN (and float-typed) count columns.
    return posword, negword, neuword


# Build the analyzer once and share it across rows instead of constructing a
# new one for every text.
vader_analyzer = SentimentIntensityAnalyzer()
sentiment_counts = [
    extract_sentiment_words(text, sid=vader_analyzer)
    for text in df['processed__text']
]
df[['Positive Words', 'Negative Words', 'Neutral Words']] = pd.DataFrame(
    sentiment_counts,
    index=df.index,  # keep alignment with the non-contiguous index left by dropna()
    columns=['Positive Words', 'Negative Words', 'Neutral Words'],
)

In [None]:
df

Unnamed: 0,sentiment,text,processed__text,Emojis,Exclamation Marks,Positive Words,Negative Words,Neutral Words
0,neutral,"I`d have responded, if I were going",responded going,[],[],0.0,0.0,2.0
1,negative,Sooo SAD I will miss you here in San Diego!!!,sooo sad miss san diego,[],"[!, !, !]",0.0,1.0,4.0
2,negative,my boss is bullying me...,bos bullying,[],[],0.0,1.0,1.0
3,negative,what interview! leave me alone,interview leave alone,[],[!],0.0,0.0,3.0
4,negative,"Sons of ****, why couldn`t they put them on t...",son put release already bought,[],[],0.0,0.0,5.0
...,...,...,...,...,...,...,...,...
27476,negative,wish we could come see u on Denver husband l...,wish could come see u denver husband lost job ...,[],[],1.0,1.0,8.0
27477,negative,I`ve wondered about rake to. The client has ...,wondered rake client made clear force devs lea...,[],[],1.0,0.0,11.0
27478,positive,Yay good for both of you. Enjoy the break - y...,yay good enjoy break probably need hectic week...,[],[],4.0,0.0,8.0
27479,positive,But it was worth it ****.,worth,[],[],0.0,0.0,1.0




In [None]:
# Keep only the model-ready columns. The explicit .copy() makes the result an
# independent frame, so later column assignments on it do not raise
# SettingWithCopyWarning.
df = df[['processed__text', 'Positive Words', 'Negative Words', 'Neutral Words', 'sentiment']].copy()

In [None]:
df

Unnamed: 0,processed__text,Positive Words,Negative Words,Neutral Words,sentiment
0,responded going,0.0,0.0,2.0,neutral
1,sooo sad miss san diego,0.0,1.0,4.0,negative
2,bos bullying,0.0,1.0,1.0,negative
3,interview leave alone,0.0,0.0,3.0,negative
4,son put release already bought,0.0,0.0,5.0,negative
...,...,...,...,...,...
27476,wish could come see u denver husband lost job ...,1.0,1.0,8.0,negative
27477,wondered rake client made clear force devs lea...,1.0,0.0,11.0,negative
27478,yay good enjoy break probably need hectic week...,4.0,0.0,8.0,positive
27479,worth,0.0,0.0,1.0,positive


In [None]:
# NOTE(review): this import sits mid-notebook; the other imports are in the first cells.
from sklearn.preprocessing import LabelEncoder

# Encoder used to turn the string sentiment labels into integer class ids.
label_encoder = LabelEncoder()

# Replace the label column in place. Per the recorded output the mapping is
# negative -> 0, neutral -> 1, positive -> 2.
df['sentiment'] = label_encoder.fit_transform(df['sentiment'])

# Show the distinct encoded values as a quick sanity check.
print(df['sentiment'].unique())

[1 0 2]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentiment'] = label_encoder.fit_transform(df['sentiment'])


In [None]:
df

Unnamed: 0,processed__text,Positive Words,Negative Words,Neutral Words,sentiment
0,responded going,0.0,0.0,2.0,1
1,sooo sad miss san diego,0.0,1.0,4.0,0
2,bos bullying,0.0,1.0,1.0,0
3,interview leave alone,0.0,0.0,3.0,0
4,son put release already bought,0.0,0.0,5.0,0
...,...,...,...,...,...
27476,wish could come see u denver husband lost job ...,1.0,1.0,8.0,0
27477,wondered rake client made clear force devs lea...,1.0,0.0,11.0,0
27478,yay good enjoy break probably need hectic week...,4.0,0.0,8.0,2
27479,worth,0.0,0.0,1.0,2


In [None]:
# Persist the feature table to the working directory. index=False is not
# passed, so the DataFrame index is written as the first (unnamed) CSV column.
df.to_csv('extracted.csv')