### VADER package

In [1]:
import re

import pandas as pd
import nltk

In [2]:
# Load the raw comments table (path is relative to the notebook's working directory)
comments_path = './data/raw/Comments.csv'
comments = pd.read_csv(comments_path)

In [3]:
# nltk.word_tokenize (used by the cleaning cells below) needs the Punkt tokenizer
# data, which this notebook otherwise never downloads; fetch it here so the
# notebook also runs against an empty nltk_data directory.
# 'punkt' is the resource name used by older NLTK releases, 'punkt_tab' by newer
# ones; nltk.download returns False (it does not raise) for a name it cannot fetch.
nltk.download('punkt')
nltk.download('punkt_tab')
# Lexicon required by SentimentIntensityAnalyzer; kept last so the cell still
# displays the result of this download.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/michaelchung/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [4]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer, VaderConstants

# VADER analyzer instance; reused to score every reply in the cells below
sid = SentimentIntensityAnalyzer()

In [5]:
# Restrict the VADER analysis to comments whose Key is below 101.
# dropna: the cleaning cell that follows applies re.sub to every Reply, which
#         raises on a missing (NaN) value.
# copy(): a boolean-mask selection may be a view of `comments`; assigning columns
#         on it triggers pandas' SettingWithCopyWarning and the write may not
#         behave as intended. An explicit copy makes `df` independent.
df = comments[comments.Key < 101].dropna(subset=['Reply']).copy()
df.shape

(70142, 4)

In [6]:
# 1) drop URLs (an optional opening parenthesis followed by "http..." up to the next whitespace)
# 2) tokenise each reply and re-join the tokens with single spaces
df['Reply'] = df['Reply'].map(lambda reply: re.sub(r'\(?http\S+', '', reply))
df['Reply'] = df['Reply'].map(lambda reply: ' '.join(nltk.word_tokenize(reply)))


In [7]:
# Score every cleaned reply with VADER and attach the four scores as columns.
# The score dicts are collected into one DataFrame in a single construction
# instead of creating a pd.Series per row (very slow on tens of thousands of
# rows). Passing `columns=` selects the scores by name, so the result does not
# depend on the key order of the dict returned by polarity_scores, and
# `index=df.index` keeps the scores aligned with their rows.
score_cols = ['neg', 'neu', 'pos', 'compound']
vader_scores = pd.DataFrame(
    df.Reply.map(sid.polarity_scores).tolist(),
    index=df.index,
    columns=score_cols,
)
df[score_cols] = vader_scores


In [8]:
# Preview the first three rows, now including the neg/neu/pos/compound columns
df.head(3)

Unnamed: 0,Reply,Upvote,Time,Key,neg,neu,pos,compound
0,All we need now is humanless bodies,964,2018-01-30 11:19:01,1,0.0,1.0,0.0,0.0
1,For a moment I read careless driver . Would ha...,3982,2018-01-30 10:35:31,1,0.195,0.547,0.258,0.2023
2,Elon 's Twitter feed has been hilarious over t...,1765,2018-01-30 11:21:32,1,0.083,0.833,0.083,0.0


### TextBlob package

In [9]:
from textblob.sentiments import NaiveBayesAnalyzer, PatternAnalyzer
from textblob import Blobber
# Blob factory configured with the Naive Bayes sentiment analyzer; called once per reply below
tba = Blobber(analyzer=NaiveBayesAnalyzer())

In [10]:
# Start again from the raw comments (Key < 101) for the TextBlob analysis.
# dropna: the cleaning cell that follows applies re.sub to every Reply, which
#         raises on a missing (NaN) value.
# copy(): makes `df` independent of `comments`, so the column assignments below
#         do not trigger pandas' SettingWithCopyWarning.
df = comments[comments.Key < 101].dropna(subset=['Reply']).copy()


In [11]:
# Same cleaning as for the VADER run: remove URLs first, then normalise the
# spacing by tokenising and re-joining with single spaces.
df['Reply'] = df['Reply'].map(lambda raw_reply: re.sub(r'\(?http\S+', '', raw_reply))
df['Reply'] = df['Reply'].map(lambda no_url_reply: ' '.join(nltk.word_tokenize(no_url_reply)))


In [12]:
# Classify every reply with the Naive Bayes analyzer.
# Each call to tba(...) builds a new blob, so reading `.sentiment` from three
# separate tba(x) calls ran the classifier three times per reply. The sentiment
# is now computed once per reply and unpacked positionally, exactly as before:
# element 0 -> Classification, element 1 -> positive, element 2 -> negative.
nb_cols = ['Classification', 'positive', 'negative']
nb_sentiments = df.Reply.map(lambda reply: tuple(tba(reply).sentiment))
df[nb_cols] = pd.DataFrame(
    nb_sentiments.tolist(),
    index=df.index,  # keep each result aligned with its source row
    columns=nb_cols,
)


In [13]:
# Preview the first three rows, now including Classification/positive/negative
df.head(3)

Unnamed: 0,Reply,Upvote,Time,Key,Classification,positive,negative
0,All we need now is humanless bodies,964,2018-01-30 11:19:01,1,pos,0.514741,0.485259
1,For a moment I read careless driver . Would ha...,3982,2018-01-30 10:35:31,1,pos,0.521948,0.478052
2,Elon 's Twitter feed has been hilarious over t...,1765,2018-01-30 11:21:32,1,pos,0.7385,0.2615


In [14]:
# Drop rows with a missing Reply (in place); the cleaning cells below call re.sub on every value
comments.dropna(subset=['Reply'], inplace=True)

In [15]:
# Delete every character that is neither a word character nor whitespace
comments['Reply'] = comments['Reply'].map(lambda reply: re.sub(r'[^\w\s]', '', reply))

In [16]:
# Fetch the NLTK stop-word corpus read by the stop-word filter in the next cell
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/michaelchung/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
# Remove English stop words from every reply (comparison is case-insensitive).
from nltk.corpus import stopwords
# The word list gets its own name so `stopwords` stays bound to the corpus
# reader rather than being overwritten with a list, and it is stored as a set
# because membership is tested once per token of every reply.
english_stopwords = set(stopwords.words('english'))
comments.Reply = comments.Reply.apply(
    lambda x: ' '.join(
        word for word in nltk.word_tokenize(x) if word.lower() not in english_stopwords
    )
)

In [18]:
# Strip anything that starts with "http" (optionally preceded by "(") up to the next whitespace
comments['Reply'] = comments['Reply'].map(lambda reply: re.sub(r'\(?http\S+', '', reply))

In [19]:
# Porter stemmer instance; in the cells visible here it is only used for the
# small demonstration in the next cell (no lemmatizer is created at this point)
from nltk.stem import PorterStemmer
porter_stemmer = PorterStemmer()

In [20]:
# Demo: run the Porter stemmer over a handful of sample words and show the result
text = nltk.word_tokenize('women run running runs ran')
stemmed = list(map(porter_stemmer.stem, text))
stemmed

['women', 'run', 'run', 'run', 'ran']

### Training our own classifier

In [21]:
# Labelled training set; the preview below shows textID, text and sentiment columns
data = pd.read_csv('./sentiment-analysis-msa-phase-2/train.csv')

In [22]:
# Peek at one row to check the column layout
data.head(1)

Unnamed: 0,textID,text,sentiment
0,cb774db0d1,"I`d have responded, if I were going",neutral


In [23]:
# Drop rows with missing text (in place); the cleaning cells below call re.sub on every value
data.dropna(subset=['text'], inplace=True)


In [24]:
# Delete every character that is neither a word character nor whitespace
data['text'] = data['text'].map(lambda tweet: re.sub(r'[^\w\s]', '', tweet))

In [25]:
# Remove English stop words from the training text (comparison is case-insensitive).
from nltk.corpus import stopwords
# `stopwords` is intentionally rebound to the word collection: the test-set
# cleaning cell further down reads this same name. It is stored as a set rather
# than a list so each per-token membership test is a hash lookup instead of a
# scan of the whole list.
stopwords = set(stopwords.words('english'))
data.text = data.text.apply(
    lambda x: ' '.join(
        word for word in nltk.word_tokenize(x) if word.lower() not in stopwords
    )
)

In [26]:
# Strip anything that starts with "http" (optionally preceded by "(") up to the next whitespace
data['text'] = data['text'].map(lambda tweet: re.sub(r'\(?http\S+', '', tweet))


In [27]:
# Stemmer and lemmatizer instances. Only `lemmatizer` is applied to the
# train/test text in the cells below; `porter_stemmer` is re-created here
# but not used on the data in the cells visible in this notebook.
from nltk.stem import PorterStemmer
porter_stemmer = PorterStemmer()

lemmatizer = nltk.WordNetLemmatizer()


In [28]:
# WordNet corpus used by the WordNetLemmatizer created above
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/michaelchung/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [29]:
# Lemmatize the training text token by token and re-join with single spaces
data['text'] = data['text'].map(
    lambda tweet: ' '.join(lemmatizer.lemmatize(token) for token in nltk.word_tokenize(tweet))
)

**Training a Naive Bayes sentiment classifier**

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn import metrics

In [31]:
# Token-count vectorizer with scikit-learn's default settings; fitted on the
# training text and reused unchanged to transform the test text
vectorizer = CountVectorizer()

In [32]:
# Defensive re-drop of missing text, then learn the vocabulary and build the
# training document-term matrix in a single step
data.dropna(subset=['text'], inplace=True)
X_train = vectorizer.fit_transform(data['text'])

In [33]:
# Multinomial Naive Bayes classifier, fitted on the count features with the
# `sentiment` column as the target
nb = MultinomialNB()

nb.fit(X=X_train, y=data['sentiment'])

MultinomialNB()

**Transform test data**

In [34]:
# Test set to predict on; it goes through the same cleaning steps as the training text below
test = pd.read_csv('./sentiment-analysis-msa-phase-2/test.csv')

In [35]:
# Drop rows with missing text (in place). Note that this leaves the original
# row labels in place, so the index of `test` is not renumbered afterwards.
test.dropna(subset=['text'], inplace=True)

In [36]:
# Delete every character that is neither a word character nor whitespace
test['text'] = test['text'].map(lambda tweet: re.sub(r'[^\w\s]', '', tweet))

In [37]:
# Filter out stop words (case-insensitive), reusing the `stopwords` collection
# built in the training-data cell above
test['text'] = test['text'].map(
    lambda tweet: ' '.join(
        token for token in nltk.word_tokenize(tweet) if token.lower() not in stopwords
    )
)


In [38]:
# Strip anything that starts with "http" (optionally preceded by "(") up to the next whitespace
test['text'] = test['text'].map(lambda tweet: re.sub(r'\(?http\S+', '', tweet))


In [39]:
# Lemmatize the test text token by token, mirroring the training preprocessing
test['text'] = test['text'].map(
    lambda tweet: ' '.join(lemmatizer.lemmatize(token) for token in nltk.word_tokenize(tweet))
)

In [40]:
# Defensive re-drop of missing text, then encode the test text with the
# vocabulary learned from the training data (transform only, no refit)
test.dropna(subset=['text'], inplace=True)
X_test = vectorizer.transform(test['text'])

In [41]:
# (number of test documents, size of the vocabulary learned from the training text)
X_test.shape

(3534, 26934)

In [42]:
# Predicted sentiment label for every row of the test matrix
pred_y = nb.predict(X_test)

In [43]:
# Sanity check: one prediction per test document
pred_y.shape

(3534,)

In [44]:
# Wrap the predicted labels in a single-column DataFrame (column label 0)
flat_predictions = pred_y.flatten()
pred_df = pd.DataFrame(flat_predictions)
pred_df

Unnamed: 0,0
0,positive
1,positive
2,negative
3,positive
4,neutral
...,...
3529,negative
3530,neutral
3531,negative
3532,positive


In [45]:
# Pair every test textID with its prediction *by position*.
# `test` has had rows removed with dropna, so its index can contain gaps, while
# pred_df carries a fresh 0..n-1 index. pd.concat(axis=1) aligns on index labels,
# which would shift predictions onto the wrong textID and introduce NaN rows as
# soon as a single test row had been dropped. Both inputs have one entry per
# remaining test row, so combining the underlying arrays is always aligned.
preds = pd.DataFrame({
    'textID': test['textID'].to_numpy(),
    'sentiment': pred_df[0].to_numpy(),
})
preds

Unnamed: 0,textID,sentiment
0,f87dea47db,positive
1,96d74cb729,positive
2,eee518ae67,negative
3,01082688c6,positive
4,33987a8ee5,neutral
...,...,...
3529,e5f0e6ef4b,negative
3530,416863ce47,neutral
3531,6332da480c,negative
3532,df1baec676,positive


In [46]:
# Write the textID/sentiment predictions to CSV without the index column.
# NOTE(review): this saves model output into data/raw, next to the input data — confirm that is the intended location.
preds.to_csv('./data/raw/predictions.csv', index=False)