# 1. File I/O and Sentiment Data Aggregation
## TaeYoung Kang (minvv23@gmail.com)

In [7]:
import sys
import glob
import os
import re
import pickle
import numpy as np
import pandas as pd
from collections import Counter

In [2]:
# ISKCON Data
# Read the 'duplicates_removed' sheet and rewrite every column name as
# lower-case words joined by underscores (e.g. 'Review Type' -> 'review_type').
iskcondata = pd.read_excel('ISKCON_data.xlsx', sheet_name='duplicates_removed')
iskcondata.columns = ['_'.join(name.lower().split()) for name in iskcondata.columns]
print('Original label names are', set(iskcondata['review_type']))

# Map the textual review types onto numeric labels; MIXED and NEUTRAL share 0.
review_type_to_label = {'POSITIVE': 1, 'NEGATIVE': -1, 'MIXED': 0, 'NEUTRAL': 0}
iskcondata['label'] = iskcondata['review_type'].replace(review_type_to_label)
print('Edited label names are', set(iskcondata['label']), '\n')

print('The data size is', len(iskcondata))
print('The Final ISKON data format would be...')
# Keep only the two columns shared by every corpus in this notebook.
iskcondata = iskcondata.loc[:, ['text', 'label']]
iskcondata.head(5)

Original label names are {'NEUTRAL', 'NEGATIVE', 'MIXED', 'POSITIVE'}
Edited label names are {0, 1, -1} 

The data size is 4641
The Final ISKON data format would be...


Unnamed: 0,text,label
0,This reativly new temple was a big hindu versi...,-1
1,Me and my friends enjoyed a lot in ISKCON temp...,1
2,Otherworldly vibrations throuout the sanctuary...,1
3,"ISKCON temple is very good, located in west of...",1
4,This is a very good place to be for all the de...,1


In [3]:
# Twitter Sentiment Data


def _read_first_line_tweets(path):
    """Return the whitespace-normalised fields on the first line of ``path``.

    The tweets are taken from the first CSV line, one tweet per field
    (presumably the file holds all tweets on that line -- confirm against the
    dataset). That line is read as *data* (``header=None``) rather than as a
    header, because pandas post-processes header names: repeated names are
    renamed ``name.1``, ``name.2``, ... and empty names become ``Unnamed: N``,
    which would silently corrupt duplicate or empty tweets.

    Parameters
    ----------
    path : str
        Location of the CSV file.

    Returns
    -------
    numpy.ndarray
        Object array of strings, one per field, with runs of whitespace
        collapsed to single spaces. Empty fields are returned as ``''``.
    """
    first_line = pd.read_csv(
        path,
        header=None,            # treat the first line as values, not column names
        nrows=1,                # only the first line carries the tweets used here
        dtype=str,              # keep numeric-looking tweets as text
        keep_default_na=False,  # keep 'NA', 'null', '' ... as literal strings
    ).iloc[0]
    return first_line.str.split().str.join(' ').to_numpy()


tweet_negative = _read_first_line_tweets('./other_sentiment_dataset/twittertweets_processedNegative.csv')
tweet_neutral = _read_first_line_tweets('./other_sentiment_dataset/twittertweets_processedNeutral.csv')
tweet_positive = _read_first_line_tweets('./other_sentiment_dataset/twittertweets_processedPositive.csv')

# Stack the three classes in negative / neutral / positive order and attach
# the matching numeric label (-1 / 0 / 1) to every row.
twitterdata = pd.DataFrame({
    'text': np.concatenate((tweet_negative, tweet_neutral, tweet_positive)),
    'label': np.repeat([-1, 0, 1],
                       [len(tweet_negative), len(tweet_neutral), len(tweet_positive)]),
})
print('The data size is', len(twitterdata))
twitterdata.head(5)

The data size is 3873


Unnamed: 0,text,label
0,How unhappy some dogs like it though,-1
1,talking to my over driver about where I'm goin...,-1
2,Does anybody know if the Rand's likely to fall...,-1
3,I miss going to gigs in Liverpool unhappy,-1
4,There isnt a new Riverdale tonight ? unhappy,-1


In [4]:
# Airline Review Sentiment Data

# Matches an '@' followed by one or more word characters (a user mention).
_mention_pattern = re.compile(r'[@]\w+')


def _strip_mentions(tweet):
    """Remove @mentions from ``tweet`` and collapse whitespace to single spaces."""
    without_mentions = _mention_pattern.sub('', tweet)
    return ' '.join(without_mentions.split())


airlinedata = pd.read_csv('./other_sentiment_dataset/twitter_airline_sentiment_tweets.csv')
airlinedata['text'] = np.vectorize(_strip_mentions)(airlinedata['text'])

# Translate the textual sentiment into the shared -1 / 0 / 1 label scale.
sentiment_to_label = {'neutral': 0, 'negative': -1, 'positive': 1}
airlinedata['label'] = airlinedata['airline_sentiment'].replace(sentiment_to_label)
airlinedata = airlinedata.loc[:, ['text', 'label']]

print('The data size is', len(airlinedata))
airlinedata.head(5)

The data size is 14640


Unnamed: 0,text,label
0,What said.,0
1,plus you've added commercials to the experienc...,1
2,I didn't today... Must mean I need to take ano...,0
3,"it's really aggressive to blast obnoxious ""ent...",-1
4,and it's a really big bad thing about it,-1


In [5]:
# Stack the three corpora row-wise under a fresh 0..n-1 index.
frames_by_source = {'iskcon': iskcondata, 'twitter': twitterdata, 'airline': airlinedata}
stacked = pd.concat(list(frames_by_source.values()), axis=0, ignore_index=True)

# One provenance tag per row, repeated in the same order the frames were stacked.
provenance = pd.Series(
    np.repeat(list(frames_by_source), [len(frame) for frame in frames_by_source.values()]),
    name='datafrom',
)

# Attach the tag column next to 'text' and 'label'.
finaldata = pd.concat([stacked, provenance], axis=1)
finaldata.head(5)

Unnamed: 0,text,label,datafrom
0,This reativly new temple was a big hindu versi...,-1,iskcon
1,Me and my friends enjoyed a lot in ISKCON temp...,1,iskcon
2,Otherworldly vibrations throuout the sanctuary...,1,iskcon
3,"ISKCON temple is very good, located in west of...",1,iskcon
4,This is a very good place to be for all the de...,1,iskcon


In [11]:
# Tally how many rows carry each label value across the combined data.
Counter(finaldata['label'].tolist())

Counter({-1: 10565, 1: 7551, 0: 5038})

In [6]:
# Serialise the combined frame to 'finaldata.p' in the working directory.
# The context manager closes the file even if pickling fails.
with open('finaldata.p', 'wb') as pickle_file:
    pickle.dump(finaldata, pickle_file)