In [326]:
import numpy as np
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
import nltk
# A library which cleans URL, Mentions, Hashtags for preprocessing tweets
import preprocessor as p
import re
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [327]:
# Reading DataSet
cols = ["id", "tweet", "emotion", "intensity"]
joy_data = pd.read_csv("joy-ratings-0to1.train.txt", sep='\t', names=cols, header=None)
joy_data_test = pd.read_csv("joy-ratings-0to1.test.gold.txt", sep='\t', names=cols, header=None)
merged=pd.concat([joy_data,joy_data_test],ignore_index=True)
# merged=anger_data.append(anger_data_test)
# merged
print(len(joy_data))
print(len(joy_data_test))
print(len(merged))

823
714
1537


In [328]:
#For the purpose of training and features extraction train set contains both test and train tweets
tweets_train=merged['tweet']
tweets_train2=merged['tweet']
tweets_test=joy_data_test['tweet']
tweets_test2=joy_data_test['tweet']
# tweets_train2

In [329]:
# Some preprocessing here
# Lower case words
tweets_train=tweets_train.apply(lambda s:s.lower())
tweets_train2=tweets_train2.apply(lambda s:s.lower())
tweets_test=tweets_test.apply(lambda s:s.lower())
tweets_test2=tweets_test2.apply(lambda s:s.lower())
for i in range(len(tweets_train)):
    #Removing Urls, mentions, hashtags
    res=p.clean(tweets_train[i])
    # removing punctuations
    res = re.sub(r'[^\w\s]', '',res)
    tweets_train[i]=res
for i in range(len(tweets_test)):
    res=p.clean(tweets_test[i])
    # removing punctuations
    res = re.sub(r'[^\w\s]', '',res)
    tweets_test[i]=res
# tweets_test2

In [330]:
# VADER feature - give it tweets as it is without preprocessed
obj= SentimentIntensityAnalyzer()
Pos_vader=np.zeros(len(tweets_train2))
Neg_vader=np.zeros(len(tweets_train2))
Neu_vader=np.zeros(len(tweets_train2))
Comp_vader=np.zeros(len(tweets_train2))
for i in range(len(tweets_train2)):
    sentiment_dict = obj.polarity_scores(tweets_train2[i])
    Pos_vader[i]=sentiment_dict['pos']
    Neg_vader[i]=sentiment_dict['neg']
    Neu_vader[i]=sentiment_dict['neu']
    Comp_vader[i]=sentiment_dict['compound']
# Adding features into dataframe
merged['Pos_vader']=Pos_vader
merged['Neg_vader']=Neg_vader
merged['Neu_vader']=Neu_vader
merged['Comp_vader']=Comp_vader
cols=cols+['Pos_vader','Neg_vader','Neu_vader','Comp_vader']
# merged

In [331]:
# Polar word count feature- using MPQA and Bing liu
mpqa_pos=set()
mpqa_neg=set()
bing_pos=set()
bing_neg=set()
bing_data = pd.read_csv("./lexicons/1. BingLiu.csv", sep='\t', names=["words","sentiment"], header=None)
for i in range(len(bing_data['words'])):
    if bing_data['sentiment'][i]=='positive':
        bing_pos.add(bing_data['words'][i])
    elif bing_data['sentiment'][i]=='negative':
        bing_neg.add(bing_data['words'][i])
mpqa_data = pd.read_csv("./lexicons/2. mpqa.txt", sep='\t', names=["words","sentiment"], header=None)
for i in range(len(mpqa_data['words'])):
    if mpqa_data['sentiment'][i]=='positive':
        mpqa_pos.add(mpqa_data['words'][i])
    elif mpqa_data['sentiment'][i]=='negative':
        mpqa_neg.add(mpqa_data['words'][i])
polar_pos=mpqa_pos.union(bing_pos)
polar_neg=mpqa_neg.union(bing_neg)
polarcount_pos=np.zeros(len(tweets_train))
polarcount_neg=np.zeros(len(tweets_train))
for i in range(len(tweets_train)):
    tweet=tweets_train[i]
    for word in word_tokenize(tweet):
        if word in polar_pos:
            polarcount_pos[i]+=1
        elif word in polar_neg:
            polarcount_neg[i]+=1
# Vectors polarcount_pos, polarcount_neg contains final counts
# Adding features into dataframe
merged['polarcount_pos']=polarcount_pos
merged['polarcount_neg']=polarcount_neg
cols=cols+['polarcount_pos','polarcount_neg']
# merged

In [332]:
# Aggregate polarity scores:from lexicons: Sentiment140 (d1), Sentiwordnet(d2)
# For D1=Sentiment140
d1 = pd.read_csv("./lexicons/3. Sentiment140-Lexicon-v0.1/unigrams-pmilexicon.txt", sep='\t', names=['term','score','numPos','numNeg'], header=None)
dic_d1={}
for i in range(len(d1['term'])):
    dic_d1[d1['term'][i]]=d1['score'][i]
aggScore1=np.zeros(len(tweets_train))
for i in range(len(tweets_train)):
    tweet=tweets_train[i]
    for word in word_tokenize(tweet):
        if word in dic_d1:
            aggScore1[i]+=dic_d1[word]
# Vector aggScore1 contains Aggregate polarity scores for Sentiment140 data set
# Adding features into dataframe
merged['aggScore1']=aggScore1
cols=cols+['aggScore1']
# merged

In [333]:
# Important libraries and functions for SentiwordNet 
# Here words are lemmatized before fetching sentiment from corpus
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
def penn_to_wn(tag):
    if tag.startswith('J'):
        return wn.ADJ
    elif tag.startswith('N'):
        return wn.NOUN
    elif tag.startswith('R'):
        return wn.ADV
    elif tag.startswith('V'):
        return wn.VERB
    return None
def get_sentiment(word,tag):
    wn_tag = penn_to_wn(tag)
    if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
        return []
    lemma = lemmatizer.lemmatize(word, pos=wn_tag)
    if not lemma:
        return []
    synsets = wn.synsets(word, pos=wn_tag)
    if not synsets:
        return []
    synset = synsets[0]
    swn_synset = swn.senti_synset(synset.name())
    return swn_synset.pos_score()-swn_synset.neg_score()

In [334]:
# For D2=Sentiwordnet
# Long code
aggScore2=np.zeros(len(tweets_train))
for i in range(len(tweets_train)):
    tweet=word_tokenize(tweets_train[i])
    pos_val = nltk.pos_tag(tweet)
    senti_val = [get_sentiment(x,y) for (x,y) in pos_val]
    for j in senti_val:
        if type(j) is not list:
            aggScore2[i]+=j
# Vector aggScore2 contains Aggregate polarity scores for Sentiment140 data set
# Adding features into dataframe
merged['aggScore2']=aggScore2
cols=cols+['aggScore2']
# merged

In [335]:
# Aggregate polarity scores:from lexicons: AFFIN
from afinn import Afinn
afinn = Afinn(emoticons=True)
aggScore3=np.zeros(len(tweets_train2))
for i in range(len(tweets_train2)):
    tweet=tweets_train2[i]
    aggScore3[i]=afinn.score(tweet)
# Vector aggScore3 contains Aggregate polarity scores for AFFIN data set
# Adding features into dataframe
merged['aggScore3']=aggScore3
cols=cols+['aggScore3']
# merged

In [336]:
# Aggregate polarity scores (Hashtags):NRC Hashtag Sentiment
d1 = pd.read_csv("./lexicons/7. NRC-Hashtag-Sentiment-Lexicon-v0.1/unigrams-pmilexicon.txt", sep='\t', names=['term','score','numPos','numNeg'], header=None)
dic_d1={}
for i in range(len(d1['term'])):
    dic_d1[d1['term'][i]]=d1['score'][i]
# print(dic_d1)
aggScoreHashtags=np.zeros(len(tweets_train))
for i in range(len(tweets_train2)):
    tweet=tweets_train2[i]
    hashtags = [i  for i in tweet.split() if i.startswith("#") ]
    for word in hashtags:
        if word in dic_d1:
            aggScoreHashtags[i]+=dic_d1[word]
        elif word[1:] in dic_d1:
            aggScoreHashtags[i]+=dic_d1[word[1:]]
# print(aggScoreHashtags)
# Vector aggScoreHashtag contains (Hashtags):NRC Hashtag Sentiment
# Adding features into dataframe
merged['aggScoreHashtags']=aggScoreHashtags
cols=cols+['aggScoreHashtags']
# merged

In [337]:
#Aggregate emotion score: NRC-10 Expanded lexicon
emotion="joy"
d_emotion=pd.read_csv("./lexicons/6. NRC-10-expanded.csv", sep='\t')
dic_dEmo={}
for i in range(len(d_emotion['word'])):
    dic_dEmo[d_emotion['word'][i]]=d_emotion[emotion][i]
aggScoreEmo=np.zeros(len(tweets_train))
for i in range(len(tweets_train)):
    tweet=tweets_train[i]
    for word in word_tokenize(tweet):
        if word in dic_dEmo:
            aggScoreEmo[i]+=dic_dEmo[word]
# print(aggScoreEmo)
# Vector aggScoreEmo contains Aggregate emotion score
# Adding features into dataframe
merged['aggScoreEmo']=aggScoreEmo
cols=cols+['aggScoreEmo']
# merged

In [338]:
#Aggregate emotion score (Hashtags):NRC Hashtag Emotion Association Lexicon
emotion="joy"
d_emotion=pd.read_csv("./lexicons/5. NRC-Hashtag-Emotion-Lexicon-v0.2.txt", sep='\t', names=['emotion','word','score'], header=None)
d_emotion
dic_dEmo={}
for i in range(len(d_emotion['word'])):
    if d_emotion['emotion'][i]==emotion:
        dic_dEmo[d_emotion['word'][i]]=d_emotion['score'][i]
aggEmoHashtags=np.zeros(len(tweets_train))
for i in range(len(tweets_train2)):
    tweet=tweets_train2[i]
    hashtags = [j  for j in tweet.split() if j.startswith("#") ]
    for word in hashtags:
        if word in dic_dEmo:
            aggEmoHashtags[i]+=dic_dEmo[word]
        elif word[1:] in dic_dEmo:
            aggEmoHashtags[i]+=dic_dEmo[word[1:]]
# print(aggEmoHashtags)
# Vector aggEmoHashtags conatains Aggregate emotion score (Hashtags)
# Adding features into dataframe
merged['aggEmoHashtags']=aggEmoHashtags
cols=cols+['aggEmoHashtags']
# merged

In [339]:
#Emoticons score: Positive and negative AFINN project (Nielsen, 2011)

emoticons=pd.read_csv("./lexicons/9. AFINN-emoticon-8.txt", sep='\t', names=['emoticon','score'], header=None)
dic_emoji={}
for i in (range(len(emoticons['emoticon']))):
    dic_emoji[emoticons['emoticon'][i]]=emoticons['score'][i]
# print(dic_emoji)
emoji = re.compile('[\\u203C-\\u3299\\U0001F000-\\U0001F644]')
emojiScore=np.zeros(len(tweets_train2))
for i in (range(len(tweets_train2))):
    tweet=(tweets_train2[i].split(" "))
    for j in range(len(tweet)):
        if(tweet[j] in dic_emoji):
#             print(i,dic_emoji[i])
            emojiScore[i]+=dic_emoji[tweet[j]]
len(emojiScore)
# for i in emojiScore:
#     print(i,end=' ')
# Vector emojiScore contains the required feature
# Adding features into dataframe
merged['emojiScore']=emojiScore
cols=cols+['emojiScore']
# merged

In [340]:
# Count of Negating words
neg_count=np.zeros(len(tweets_train))
for i in range(len(tweets_train)):
    tweet=tweets_train[i]
    res = re.findall(r'(?:^|\W)?(never|no|nothing|nowhere|noone|none|not|havent|hasnt|hadnt|cant|couldnt|shouldnt|wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint)(?:$|\W)+',tweet)
    neg_count[i]=len(res)
# print(neg_count)
# Vector neg_count contains the count of negating words
# Adding features into dataframe
merged['neg_count']=neg_count
cols=cols+['neg_count']
# merged

In [341]:
#Emotion word count: Count of the number of words matching each emotion from NRC Word-Emotion Association Lexicon
emotion="joy"
emotion_count=np.zeros(len(tweets_train))
emotions_df=pd.read_csv("./lexicons/8. NRC-word-emotion-lexicon.txt", sep='\t', names=['word','emotion','score'], header=None)
# This set contains words with Emotion X=1 in above dataframe
set_emo=set()
for i in range(len(emotions_df['word'])):
    if emotions_df['emotion'][i]==emotion and emotions_df['score'][i]==1:
        set_emo.add(emotions_df['word'][i])
for i in range(len(tweets_train)):
    tweet=word_tokenize(tweets_train[i])
    for word in tweet:
        if word in set_emo:
            emotion_count[i]+=1
# Vector emotion_count contains the required feature vector
# Adding features into dataframe
merged['emotion_count']=emotion_count
cols=cols+['emotion_count']
# merged

In [342]:
# N grams features ectraction
from sklearn.feature_extraction.text import CountVectorizer
# ngram_range parameter (1,2) means that unigram and bigram will be taken
# Count Vecotrizer automatically preprocess the tweets
vectorizer = CountVectorizer(ngram_range=(1,2),min_df =5)
TempVector=vectorizer.fit(tweets_train)

In [343]:
# N grams continued
Vocab_ngrams=TempVector.get_feature_names()
print(len(Vocab_ngrams))
vectorizer2 = CountVectorizer(ngram_range=(1,2), vocabulary=Vocab_ngrams)
ngrams=vectorizer2.fit_transform(tweets_train)
x=np.transpose(ngrams.toarray())
x.shape

820


(820, 1537)

In [344]:
# Adding N grams to dataframe
vocab=vectorizer2.get_feature_names()
for i in range(len(vocab)):
    merged['gram-'+vocab[i]]=x[i]

In [345]:
# Bonus features
# identifying sentiment in tweets expressed by slang words using Slang Sentiment Dictionary (SlangSD)
slangs= pd.read_csv("./lexicons/SlangSD/SlangSD.txt", sep='\t', names=['word','score'], header=None)
slangScore=np.zeros(len(tweets_train2))
dic_slangs={}
for i in (range(len(slangs['word']))):
    dic_slangs[slangs['word'][i]]=slangs['score'][i]
# print(dic_slangs)
for i in (range(len(tweets_train2))):
    tweet=word_tokenize(tweets_train2[i])
    for j in range(len(tweet)):
        if(tweet[j] in dic_slangs):
# #             print(i,dic_emoji[i])
            slangScore[i]+=dic_slangs[tweet[j]]
# for i in slangScore:
#     print(i,end='  ')
# print(tweets_train2)

# Count of capitilized words
capCount=np.zeros(len(tweets_train2))
original_tweets=merged['tweet']
for i in (range(len(original_tweets))):
    tweet=word_tokenize(original_tweets[i])
    for j in range(len(tweet)):
        if(tweet[j].isupper()):
            capCount[i]+=1
# for i in capCount:
#     print(i,end='  ')

# Adding features into dataframe
merged['slangScore']=slangScore
# merged['capCount']=capCount
cols=cols+['slangScore','capCount']
# merged


In [346]:
#Spltting features and y columns
y=merged['intensity']
x=merged.iloc[:,4:]
df_out=merged[["id", "tweet", "emotion", "intensity"]]
# x=merged[x.columns]

In [347]:
#Separating train and test set before training
y_train=y.iloc[:823]
y_test=y.iloc[823:]
x_train=x.iloc[:823]
x_test=x.iloc[823:]
df_out=df_out[823:]
# print(len(y_train))
# print(len(y_test))

In [348]:
# Running Decison Tree regressor
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor   
# create a regressor object 
regressor = DecisionTreeRegressor(random_state = 0)  
# fit the regressor with X and Y data 
regressor.fit(x_train, y_train) 
y_pred=regressor.predict(x_test)
print("MSE for DT=",mean_squared_error(y_test, y_pred))
print("MAE for DT=",mean_absolute_error(y_test, y_pred))

MSE for DT= 0.05419594072082166
MAE for DT= 0.1839493557422969


In [349]:
# Creating output file for model results
df_out['intensity']=y_pred
df_out.to_csv('dt_joy.txt',sep='\t',header=False,index=False)

In [350]:
# Runing SVM
from sklearn.svm import SVR
SVRregr=SVR()
SVRregr.fit(x_train,y_train)
y_pred=SVRregr.predict(x_test)
print("MSE for SVM=",mean_squared_error(y_test, y_pred))
print("MAE for SVM=",mean_absolute_error(y_test, y_pred))

MSE for SVM= 0.028213143180404944
MAE for SVM= 0.13397292644048686


In [351]:
# Creating output file for model results
df_out['intensity']=y_pred
df_out.to_csv('SVM_joy.txt',sep='\t',header=False,index=False)

In [352]:
#Running MLP
from sklearn.neural_network import MLPRegressor
MLP = MLPRegressor()
MLP.fit(x_train, y_train)
y_pred=MLP.predict(x_test)
print("MSE for MLP=",mean_squared_error(y_test, y_pred))
print("MAE for MLP=",mean_absolute_error(y_test, y_pred))

MSE for MLP= 0.0406824680358146
MAE for MLP= 0.16176728440578847


In [353]:
# Creating output file for model results
df_out['intensity']=y_pred
df_out.to_csv('MLP_joy.txt',sep='\t',header=False,index=False)

In [354]:
# Creating output file for all the features
merged.to_csv('Features_joy.txt',header=True,index=False)