- Direct representation for the entire document (sentence/paragraph) rather than each word.

### Sentimental Analysis - Emotion in Text

In [1]:
import pandas as pd
import nltk
nltk.download('stopwords')
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\veera\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
df = pd.read_csv('./data/train_data.csv')
df.head()

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


In [4]:
df['sentiment'].value_counts()

sentiment
worry         7433
neutral       6340
sadness       4828
happiness     2986
love          2068
surprise      1613
hate          1187
fun           1088
relief        1021
empty          659
enthusiasm     522
boredom        157
anger           98
Name: count, dtype: int64

In [6]:
sents = ['love','sadness','happiness']
df_sub = df[df['sentiment'].isin(sents)]
len(df_sub)

9882

### Text Preprocessing

In [7]:
tweeter = TweetTokenizer(strip_handles=True,preserve_case=False)
mystopwords = set(stopwords.words("english"))

In [8]:
def preprocess_corpus(texts):
    def remove_stops_digits(tokens):
        #Nested function that removes stopwords and digits from a list of tokens
        return [token for token in tokens if token not in mystopwords and not token.isdigit()]
    #This return statement below uses the above function to process twitter tokenizer output further. 
    return [remove_stops_digits(tweeter.tokenize(content)) for content in texts]

In [9]:
x = preprocess_corpus(df_sub['content'])
y = df_sub['sentiment']
print(len(x),len(y))

9882 9882


In [10]:
train_data, test_data, train_cats, test_cats = train_test_split(x,y,random_state=1234)

- TaggedDocument class. It’s used to represent a document as a list of tokens followed by a “tag,” which in its simplest
form can be just the filename or ID of the document.

In [11]:
train_data

[['love', '.'],
 ['thanks',
  ',',
  'mandy',
  '!',
  'good',
  'sister',
  '.',
  'may',
  'true',
  ',',
  'unfortunately',
  'road',
  'tires',
  "can't",
  'say',
  '.'],
 ['oops', ',', 'drunken', 'stupor', 'lol', '.', 'ima', 'check'],
 ['cuddling', '.', 'girl', 'cuddles', 'fun', 'soft'],
 ['understands', 'long', 'hour'],
 ['scramming', 'little', 'early', '.', 'nibs', 'sounds', 'pathetic', '.'],
 ['*',
  'sigh',
  '*',
  "i'm",
  'going',
  'bed',
  '...',
  'feel',
  'right',
  'anymore',
  '...'],
 ['ok', '...', '.'],
 ['http://twitpic.com/4vd89',
  '-',
  'good',
  'looking',
  'cpl',
  '!',
  '!',
  "can't",
  'wait',
  'see',
  'kind',
  'kids',
  "y'all",
  '!'],
 ["caan't",
  'iht',
  'earlier',
  '?',
  'icant',
  'wait',
  'long',
  '.',
  'ahar',
  '.',
  '(:'],
 ['boy',
  'leaving',
  'summer',
  ',',
  'going',
  'stay',
  'grandparents',
  '..',
  "i'm",
  'gonna',
  'miss',
  '!',
  '!'],
 ['thanks', 'ff', 'think', 'start'],
 ['yeah',
  'double',
  'edged',
  'sword'

In [12]:
train_doc2vec = [TaggedDocument((d), tags=[str(i)]) for i, d in enumerate(train_data)]

In [13]:
model = Doc2Vec(vector_size=50, alpha=0.025, min_count=5, dm =1, epochs=100)
model.build_vocab(train_doc2vec)
model.train(train_doc2vec, total_examples=model.corpus_count, epochs=model.epochs)

- Doc2vec’s infer_vector function can be used
to infer the vector representation for a given text using a pre-trained model.

In [15]:
train_vectors =  [model.infer_vector(list_of_tokens) for list_of_tokens in train_data]
test_vectors = [model.infer_vector(list_of_tokens) for list_of_tokens in test_data]

In [16]:
#Use any regular classifier like logistic regression
from sklearn.linear_model import LogisticRegression

myclass = LogisticRegression(class_weight="balanced") #because classes are not balanced. 
myclass.fit(train_vectors, train_cats)

preds = myclass.predict(test_vectors)
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(test_cats, preds))

              precision    recall  f1-score   support

   happiness       0.47      0.38      0.42       694
        love       0.35      0.54      0.43       534
     sadness       0.73      0.64      0.68      1243

    accuracy                           0.55      2471
   macro avg       0.52      0.52      0.51      2471
weighted avg       0.57      0.55      0.55      2471

