In [1]:
import pandas as pd
import numpy as np


In [2]:
# Load the comment dump from CSV into a DataFrame (despite the variable name,
# the source here is a CSV file, not JSON).
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks like
# a row index saved by an earlier export - confirm whether index_col=0 is wanted.
json_data = pd.read_csv('wsb.csv')

In [3]:
# Preview the first rows to check which columns were loaded.
json_data.head()

Unnamed: 0,body,score,date
0,"Lol. Yeah, Welp.",1,2014-10-09
1,Crypto miners is not a significant enough mark...,1,2014-02-19
2,Sold a covered call at 560. I almost wish it d...,1,2014-04-24
3,NFLX because earning reports releases. +10-12%...,1,2014-01-20
4,Agreed 100%. When are their earnings? I'd like...,1,2014-07-25


In [4]:
# Show the raw 'body' text column before any cleaning is applied.
json_data['body']

0                                           Lol. Yeah, Welp.
1          Crypto miners is not a significant enough mark...
2          Sold a covered call at 560. I almost wish it d...
3          NFLX because earning reports releases. +10-12%...
4          Agreed 100%. When are their earnings? I'd like...
                                 ...                        
2815058                  All I gotta say is... calm yo tits.
2815059                                     true tho in't it
2815060    Gold is a key element in electronics. The bot ...
2815061    Listen up kids as I tell you how I earned my f...
2815062    sometimes you should sell with a loss. Jnug ha...
Name: body, Length: 2815063, dtype: object

## Let's get the character length of each entry in the 'body' column.

In [5]:
def get_len(text):
    """Return the number of characters in ``text`` after coercing it to ``str``.

    Non-string values are stringified first, so a missing value such as
    ``float('nan')`` is measured as the three-character string ``'nan'``.
    """
    return len(str(text))

In [6]:
# Store the character count of every comment next to its text.
json_data['cont_len'] = json_data['body'].map(get_len)
json_data['cont_len']

0            16
1           434
2           159
3            72
4            71
           ... 
2815058      35
2815059      16
2815060      58
2815061    1662
2815062     232
Name: cont_len, Length: 2815063, dtype: int64

# Sentiment Analysis

Let's run sentiment analysis on the content of the 'body' column.


In [7]:
import nltk 
from nltk.tokenize import word_tokenize
from textblob import TextBlob,Word
import unicodedata
import string
import re

## 1. Data Cleaning
    
    We can see that the content of the 'body' column is not clean. It contains lots of:
    i. Punctuation
    ii. Numbers
    iii. Stopwords 
    These add no value to the text for sentiment analysis and only increase memory consumption. So, before running the sentiment analysis, let's clean the data.

### i. Remove Punctuations
     Let's remove all punctuation marks such as ?, !, # and so on from the text.

In [8]:
def remove_punctuations(text):
    """Return ``text`` (coerced to ``str``) with all ASCII punctuation deleted.

    The characters removed are exactly those listed in ``string.punctuation``;
    letters, digits and whitespace are left untouched.
    """
    # A translation table that maps a character to None deletes that character.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return str(text).translate(deletion_table)

In [9]:
# Strip punctuation from every comment, overwriting the 'body' column in place.
json_data['body'] = json_data['body'].map(remove_punctuations)
json_data['body']

0                                              Lol Yeah Welp
1          Crypto miners is not a significant enough mark...
2          Sold a covered call at 560 I almost wish it dr...
3          NFLX because earning reports releases 1012 for...
4          Agreed 100 When are their earnings Id like to ...
                                 ...                        
2815058                      All I gotta say is calm yo tits
2815059                                      true tho int it
2815060    Gold is a key element in electronics The bot h...
2815061    Listen up kids as I tell you how I earned my f...
2815062    sometimes you should sell with a loss Jnug has...
Name: body, Length: 2815063, dtype: object

### ii. Remove Numbers

    Let's remove all the numbers from the text, since they are not going to add any value for sentiment analysis.

In [10]:
def remove_numbers(text):
    """Return ``text`` with every ASCII digit (0-9) deleted.

    Only the digit characters themselves are removed; surrounding whitespace
    is kept, so a standalone number leaves a double space behind. ``text``
    must be a string, otherwise the regex substitution raises ``TypeError``.
    """
    digit_pattern = re.compile(r'[0-9]')
    return digit_pattern.sub('', text)

In [11]:
# Run the cleaner on the 'body' column directly. Series.apply hands each string
# straight to remove_numbers, whereas DataFrame.apply(axis=1) builds a full row
# object for every row just to read a single field from it.
json_data['body'] = json_data['body'].apply(remove_numbers)
json_data['body']

0                                              Lol Yeah Welp
1          Crypto miners is not a significant enough mark...
2          Sold a covered call at  I almost wish it drops...
3          NFLX because earning reports releases  for the...
4          Agreed  When are their earnings Id like to jum...
                                 ...                        
2815058                      All I gotta say is calm yo tits
2815059                                      true tho int it
2815060    Gold is a key element in electronics The bot h...
2815061    Listen up kids as I tell you how I earned my f...
2815062    sometimes you should sell with a loss Jnug has...
Name: body, Length: 2815063, dtype: object

## 2. Remove Special and Accented Characters

    Accented characters such as à, é or ñ will not add any value to the text. So, let's reduce them to their plain ASCII base letters and drop any other non-ASCII characters.

   

In [12]:
def remove_accented_chars(text):
    """Return an ASCII-only version of ``text``.

    The string is NFKD-normalised so that accented letters split into a base
    letter plus combining marks; encoding to ASCII with ``'ignore'`` then
    drops those marks together with any other non-ASCII character.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

In [13]:
# Run the cleaner on the 'body' column directly. Series.apply hands each string
# straight to remove_accented_chars, whereas DataFrame.apply(axis=1) builds a
# full row object for every row just to read a single field from it.
json_data['body'] = json_data['body'].apply(remove_accented_chars)
json_data['body']

0                                              Lol Yeah Welp
1          Crypto miners is not a significant enough mark...
2          Sold a covered call at  I almost wish it drops...
3          NFLX because earning reports releases  for the...
4          Agreed  When are their earnings Id like to jum...
                                 ...                        
2815058                      All I gotta say is calm yo tits
2815059                                      true tho int it
2815060    Gold is a key element in electronics The bot h...
2815061    Listen up kids as I tell you how I earned my f...
2815062    sometimes you should sell with a loss Jnug has...
Name: body, Length: 2815063, dtype: object

# 2. Tokenization

    Tokens help us get at the meaning of the text. So, let's break the text into smaller units; here we'll use word tokenization.

In [14]:
def get_tokens(text):
    """Split ``text`` (coerced to ``str``) into a list of word tokens.

    Tokenisation is delegated to ``nltk.word_tokenize``.
    """
    return nltk.word_tokenize(str(text))

In [15]:
# Replace each cleaned comment string with its list of word tokens.
json_data['body'] = json_data['body'].map(get_tokens)
json_data['body']

0                                          [Lol, Yeah, Welp]
1          [Crypto, miners, is, not, a, significant, enou...
2          [Sold, a, covered, call, at, I, almost, wish, ...
3          [NFLX, because, earning, reports, releases, fo...
4          [Agreed, When, are, their, earnings, Id, like,...
                                 ...                        
2815058           [All, I, got, ta, say, is, calm, yo, tits]
2815059                                 [true, tho, int, it]
2815060    [Gold, is, a, key, element, in, electronics, T...
2815061    [Listen, up, kids, as, I, tell, you, how, I, e...
2815062    [sometimes, you, should, sell, with, a, loss, ...
Name: body, Length: 2815063, dtype: object

# 3. Remove Stop words

    Stopwords like "the", "and", "myself", "this", ... do not add much value to a sentence. So, it's better to get rid of them.

In [16]:
from nltk.corpus import stopwords
# Fetch the NLTK 'stopwords' corpus that stopwords.words('english') reads from.
# As the log output below shows, this only reports "already up-to-date" when
# the package is present locally.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/bidhya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
# Cache for the stop-word set so the NLTK corpus is read once instead of once
# per row when the function is applied to the whole column.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the English stop-word set, loading it from NLTK on first use only."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def remove_stopwords(text):
    """Return the tokens of ``text`` that are not English stop words.

    Parameters
    ----------
    text : iterable of str
        Word tokens, e.g. the list produced by ``get_tokens``.

    Returns
    -------
    list of str
        The tokens in their original order and casing, with every token whose
        lower-cased form is an NLTK English stop word left out.
    """
    stop_words = _english_stop_words()
    # Compare case-insensitively so "The" and "the" are both filtered.
    return [word for word in text if word.lower() not in stop_words]

In [18]:
# Run the filter on the 'body' column directly. Series.apply hands each token
# list straight to remove_stopwords, whereas DataFrame.apply(axis=1) builds a
# full row object for every row just to read a single field from it.
json_data['body'] = json_data['body'].apply(remove_stopwords)
json_data['body']

0                                          [Lol, Yeah, Welp]
1          [Crypto, miners, significant, enough, market, ...
2          [Sold, covered, call, almost, wish, drops, bac...
3            [NFLX, earning, reports, releases, week, guess]
4             [Agreed, earnings, Id, like, jump, puts, well]
                                 ...                        
2815058                       [got, ta, say, calm, yo, tits]
2815059                                     [true, tho, int]
2815060        [Gold, key, element, electronics, bot, hates]
2815061    [Listen, kids, tell, earned, flair, January, l...
2815062    [sometimes, sell, loss, Jnug, hasnt, since, Tr...
Name: body, Length: 2815063, dtype: object

# 4. Lemmatization

    Lemmatization reduces a word to its dictionary form, so let's use lemmatization rather than stemming.

In [19]:
# import spacy

# nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [20]:
# def get_lem(text):
#     text = nlp(text)
#     text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
#     return text

def get_lem(text):
    """Return the verb lemma of every token in ``text`` as a new list.

    Each token is wrapped in a TextBlob ``Word`` and lemmatized with the
    part-of-speech tag ``"v"``; order and length of the list are preserved.
    """
    lemmas = []
    for token in text:
        lemmas.append(Word(token).lemmatize("v"))
    return lemmas

In [21]:
# Run the lemmatizer on the 'body' column directly. Series.apply hands each
# token list straight to get_lem, whereas DataFrame.apply(axis=1) builds a
# full row object for every row just to read a single field from it.
json_data['body'] = json_data['body'].apply(get_lem)
json_data['body']

0                                          [Lol, Yeah, Welp]
1          [Crypto, miners, significant, enough, market, ...
2          [Sold, cover, call, almost, wish, drop, back, ...
3                 [NFLX, earn, report, release, week, guess]
4                  [Agreed, earn, Id, like, jump, put, well]
                                 ...                        
2815058                       [get, ta, say, calm, yo, tits]
2815059                                     [true, tho, int]
2815060          [Gold, key, element, electronics, bot, hat]
2815061    [Listen, kid, tell, earn, flair, January, last...
2815062    [sometimes, sell, loss, Jnug, hasnt, since, Tr...
Name: body, Length: 2815063, dtype: object

## Convert the list to string

In [22]:
def list_to_string(text):
    """Join the elements of ``text`` into one space-separated string.

    Every element is converted with ``str`` first, so non-string items are
    accepted; an empty input yields an empty string.
    """
    return ' '.join(map(str, text))

In [23]:
# Join the tokens of the 'body' column directly. Series.apply hands each token
# list straight to list_to_string, whereas DataFrame.apply(axis=1) builds a
# full row object for every row just to read a single field from it.
json_data['body'] = json_data['body'].apply(list_to_string)
json_data['body']

0                                              Lol Yeah Welp
1          Crypto miners significant enough market yet AM...
2          Sold cover call almost wish drop back bite thi...
3                        NFLX earn report release week guess
4                          Agreed earn Id like jump put well
                                 ...                        
2815058                              get ta say calm yo tits
2815059                                         true tho int
2815060                 Gold key element electronics bot hat
2815061    Listen kid tell earn flair January last year m...
2815062    sometimes sell loss Jnug hasnt since Trump ele...
Name: body, Length: 2815063, dtype: object

# 5. Get Polarity

    We care more about the sentiment expressed in a comment than about how much personal opinion it contains. So, let's go for polarity.
    
    i. Let's get the polarity as a number first

In [24]:
def get_polarity(text):
    """Return the TextBlob sentiment polarity of ``text``.

    Parameters
    ----------
    text : str
        The cleaned comment text; other values are coerced with ``str``.

    Returns
    -------
    float
        ``TextBlob(...).sentiment.polarity`` for the text (TextBlob documents
        this as a value between -1.0 and 1.0).
    """
    # Pass the text itself to TextBlob. In Python 3, str(text.encode('utf-8'))
    # yields the bytes *repr* ("b'...'"), which glues a "b'" prefix onto the
    # first word, a quote onto the last one and turns special characters into
    # backslash escapes before the text is scored.
    return TextBlob(str(text)).sentiment.polarity

In [25]:
# Score the 'body' column directly. Series.apply hands each string straight to
# get_polarity, whereas DataFrame.apply(axis=1) builds a full row object for
# every row just to read a single field from it.
json_data['cont_pol'] = json_data['body'].apply(get_polarity)
json_data['cont_pol']

0          0.800000
1         -0.037771
2          0.000000
3          0.000000
4          0.000000
             ...   
2815058    0.300000
2815059    0.350000
2815060    0.000000
2815061    0.057047
2815062    0.000000
Name: cont_pol, Length: 2815063, dtype: float64

ii. Let's analyze whether the polarity is 'Positive', 'Negative' or 'Neutral'.

In [26]:
 def analysis(score):
     """Map a polarity score to a sentiment label.

     Returns 'Positive' for scores above zero, 'Negative' for scores below
     zero and 'Neutral' otherwise (including exactly zero and NaN, for which
     both comparisons are false).
     """
     if score > 0:
         return 'Positive'
     return 'Negative' if score < 0 else 'Neutral'

In [27]:
# Label the 'cont_pol' column directly. Series.apply hands each score straight
# to analysis, whereas DataFrame.apply(axis=1) builds a full row object for
# every row just to read a single field from it.
json_data['cont_sent'] = json_data['cont_pol'].apply(analysis)
json_data['cont_sent']

In [28]:
# Inspect the first rows now that cont_len, cont_pol and cont_sent have been added.
json_data.head()

## Let's drop the 'body' column.

In [29]:
# Drop the processed text column. errors='ignore' keeps this cell re-runnable:
# without it, executing the cell a second time raises KeyError because 'body'
# has already been removed from json_data.
json_data = json_data.drop(columns='body', errors='ignore')
json_data.head()

In [30]:
# Show the 'date' column of the frame that is about to be saved.
json_data['date']

# 6. Save the dataframe

In [None]:
# Write the analysed frame to disk.
# NOTE(review): to_csv also writes the row index unless index=False is passed,
# which would add another unnamed column next to the existing 'Unnamed: 0' -
# confirm which layout downstream readers expect before changing it.
json_data.to_csv('analyzedJSON_data.csv') 