# Sentiment Analysis

In [1]:
# import packages
import pandas as pd
import multiprocessing as mp
import tqdm
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as sid

In [2]:
# load the filtered comments; the first CSV column is used as the row index
data = pd.read_csv(
    'filtered_data_odd.csv',
    index_col = 0,
)
data.head()

Unnamed: 0,author,subreddit,created_utc,score,controversiality,body
0,CryptoHODLer101,AMA,1513292107,0,0,No BTC is not fiat. I get paid in Bitcoin. As ...
1,nappiestapparatus,AMA,1395194709,0,0,I think you're overestimating your hashing pow...
2,Skating2Death,AMA,1390016247,0,0,"I've heard of it, but with the high volatility..."
3,OmarJunkman,AMA,1528300155,0,0,Do you have any bitcoin?
4,evanc1411,ASU,1525380807,0,1,"BTC -? Damn, when you buy a Bitcoin you're ..."


## method 1
- #### single thread

```Python
# initialize sentiment classifier
# (`sid` is the analyzer class imported at the top; `SID` is the instance we call)
SID = sid()

# get sentiment scores from data (one dict of scores per comment)
sentiment = data['body'].apply(SID.polarity_scores)

# convert sentiment series into dataframe (each sentiment value gets its own column)
# reuse data's index so the index-based merge below lines rows up even when
# data is not indexed 0..n-1
sentiment = pd.DataFrame(sentiment.tolist(), index = data.index)

# merge data and sentiment df into one
data = data.merge(sentiment, how = 'left', left_index = True, right_index = True)

# delete sentiment as its info is in data
del sentiment
```

## method 2
- #### method 1 but as a function

```python
def get_sentiment(df, series = str, analyzer = None):
    """Return `df` with one extra column per sentiment score of a text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the text to score. It is not modified.
    series : column label
        Name of the text column in `df`. Callers are expected to pass it;
        the default (the builtin `str` type) is only a placeholder.
    analyzer : object with a `polarity_scores(text)` method, optional
        Scorer to use. When omitted, a new instance of `sid` (the analyzer
        class imported at the top of the notebook) is created.

    Returns
    -------
    pandas.DataFrame
        A new frame: `df` left-merged on its index with one column per key of
        the dicts returned by `polarity_scores`.
    """
    # initialize sentiment classifier
    if analyzer is None:
        analyzer = sid()

    # get sentiment: one dict of scores per row
    scores = df[series].apply(analyzer.polarity_scores)

    # create sentiment df, keyed by df's own index so the index-based merge
    # below stays aligned even when df is not indexed 0..n-1
    scores = pd.DataFrame(scores.tolist(), index = df.index)

    # merge sentiment with your df
    return df.merge(scores, how = 'left', left_index = True, right_index = True)

# run function
data = get_sentiment(df = data, series = 'body')
```

## method 3
- #### parallel processing

In [3]:
# initialize sentiment classifier: one instance of the class imported as `sid`;
# the cells below call SID.polarity_scores on each comment
SID = sid()

In [4]:
# number of worker processes for the pool created further down
# mp.cpu_count() reports how many CPUs this machine has
num_processes = mp.cpu_count()
# display the value
num_processes

8

The parallel processing package (`multiprocessing`) doesn't have a version of pandas `.apply`; instead, it has<br>
a version of `map` (which is a list's version of apply). This means we need to transform<br>
our data into a list and then use the map function. Below are some examples:
​
```Python
# list comprehension over map = this is the call that is going to be parallel processed
test_list = [i for i in map(SID.polarity_scores, data['body'])]

# same thing as the list comprehension above, written as an explicit loop
test_list = []
for i in data['body']:
    results = SID.polarity_scores(i)
    test_list.append(results)
```

In [5]:
# parallel process with progress bar

def main():
    """Score every comment in data['body'] using a pool of worker processes.

    Relies on the notebook globals `num_processes`, `SID` and `data` defined
    in earlier cells.

    Returns
    -------
    list
        One `SID.polarity_scores` result per comment, in the same order as
        data['body'].
    """
    comments = data['body']
    with mp.Pool(num_processes) as pool:
        # imap yields results in input order as they become available, so
        # wrapping it (rather than the input) in tqdm makes the bar track
        # finished work instead of inputs handed to the pool
        results = pool.imap(SID.polarity_scores, comments, chunksize = 10)
        # consume everything inside the `with` block, before the pool is torn down
        return list(tqdm.tqdm(results, total = len(comments)))
        

if __name__ == '__main__':
    # main() returns one score dict per row of data, in row order; give the
    # score frame data's own index so the index-based merge lines rows up even
    # when data is not indexed 0..n-1
    sentiment_scores = pd.DataFrame(main(), index = data.index)
    data = data.merge(sentiment_scores, how = 'left', left_index = True, right_index = True)

100%|██████████| 1607782/1607782 [20:42<00:00, 1294.23it/s]


In [6]:
# preview the first rows of data after the sentiment scores were merged in
data.head()

Unnamed: 0,author,subreddit,created_utc,score,controversiality,body,compound,neg,neu,pos
0,CryptoHODLer101,AMA,1513292107,0,0,No BTC is not fiat. I get paid in Bitcoin. As ...,0.3818,0.085,0.731,0.185
1,nappiestapparatus,AMA,1395194709,0,0,I think you're overestimating your hashing pow...,-0.2263,0.181,0.724,0.095
2,Skating2Death,AMA,1390016247,0,0,"I've heard of it, but with the high volatility...",-0.7684,0.142,0.837,0.021
3,OmarJunkman,AMA,1528300155,0,0,Do you have any bitcoin?,0.0,0.0,1.0,0.0
4,evanc1411,ASU,1525380807,0,1,"BTC -? Damn, when you buy a Bitcoin you're ...",-0.1779,0.213,0.63,0.157


## categorize sentiment data into sentiment categories

In [7]:
# create function to categorize compound sentiment score
# 0.05 threshold recommended on VADER documentation
# you should read the comments, read their compound scores, and determine your own cutoffs
def categorize_sentiment(x, threshold = 0.05):
    """Map a compound sentiment score to a category label.

    Parameters
    ----------
    x : float
        Compound sentiment score.
    threshold : float, optional
        Cutoff between neutral and positive/negative. Scores at or above
        `threshold` are positive, scores at or below `-threshold` are
        negative, and everything strictly in between is neutral.
        Defaults to 0.05.

    Returns
    -------
    str or None
        'positive_comment', 'neutral_comment' or 'negative_comment'.
        None when `x` is NaN, because NaN fails every comparison.
    """
    if x >= threshold:
        return 'positive_comment'
    if x <= -threshold:
        return 'negative_comment'
    if -threshold < x < threshold:
        return 'neutral_comment'
    # only reached when every comparison above is False (x is NaN)
    return None

In [8]:
# label each comment by running its compound score through categorize_sentiment
data['sentiment'] = data['compound'].map(categorize_sentiment)

data.head()

Unnamed: 0,author,subreddit,created_utc,score,controversiality,body,compound,neg,neu,pos,sentiment
0,CryptoHODLer101,AMA,1513292107,0,0,No BTC is not fiat. I get paid in Bitcoin. As ...,0.3818,0.085,0.731,0.185,positive_comment
1,nappiestapparatus,AMA,1395194709,0,0,I think you're overestimating your hashing pow...,-0.2263,0.181,0.724,0.095,negative_comment
2,Skating2Death,AMA,1390016247,0,0,"I've heard of it, but with the high volatility...",-0.7684,0.142,0.837,0.021,negative_comment
3,OmarJunkman,AMA,1528300155,0,0,Do you have any bitcoin?,0.0,0.0,1.0,0.0,neutral_comment
4,evanc1411,ASU,1525380807,0,1,"BTC -? Damn, when you buy a Bitcoin you're ...",-0.1779,0.213,0.63,0.157,negative_comment


In [9]:
# store the ['sentiment'] labels as a pandas categorical
data['sentiment'] = data['sentiment'].astype('category')

# the listing should now report `category` for sentiment
data.dtypes

author                object
subreddit             object
created_utc            int64
score                  int64
controversiality       int64
body                  object
compound             float64
neg                  float64
neu                  float64
pos                  float64
sentiment           category
dtype: object

In [10]:
# convert ['sentiment'] categories to binary variables in new df:
# one 0/1 indicator column per sentiment label, one row per comment
binary_sentiment = data['sentiment'].str.get_dummies()
binary_sentiment.head()

Unnamed: 0,negative_comment,neutral_comment,positive_comment
0,0,0,1
1,1,0,0
2,1,0,0
3,0,1,0
4,1,0,0


In [11]:
# count of comments per category: each column holds 0/1 flags,
# so its sum is the number of comments given that label
binary_sentiment.sum()

negative_comment    518819
neutral_comment     177084
positive_comment    911879
dtype: int64

In [12]:
# attach the indicator columns to data, matching rows on the index
data = data.merge(
    binary_sentiment,
    left_index = True,
    right_index = True,
    how = 'left',
)
data.head()

Unnamed: 0,author,subreddit,created_utc,score,controversiality,body,compound,neg,neu,pos,sentiment,negative_comment,neutral_comment,positive_comment
0,CryptoHODLer101,AMA,1513292107,0,0,No BTC is not fiat. I get paid in Bitcoin. As ...,0.3818,0.085,0.731,0.185,positive_comment,0,0,1
1,nappiestapparatus,AMA,1395194709,0,0,I think you're overestimating your hashing pow...,-0.2263,0.181,0.724,0.095,negative_comment,1,0,0
2,Skating2Death,AMA,1390016247,0,0,"I've heard of it, but with the high volatility...",-0.7684,0.142,0.837,0.021,negative_comment,1,0,0
3,OmarJunkman,AMA,1528300155,0,0,Do you have any bitcoin?,0.0,0.0,1.0,0.0,neutral_comment,0,1,0
4,evanc1411,ASU,1525380807,0,1,"BTC -? Damn, when you buy a Bitcoin you're ...",-0.1779,0.213,0.63,0.157,negative_comment,1,0,0


In [13]:
# drop the columns that are not needed in the export, in place, one at a time
for column in ('pos', 'neg', 'neu', 'compound', 'controversiality', 'body', 'sentiment'):
    del data[column]

# the indicator frame has already been merged into data
del binary_sentiment

In [None]:
# export data
# (to_csv also writes the DataFrame index as the first column by default)
data.to_csv('sentiment_data_odd.csv')