In [310]:
import psycopg2
import sys
import numpy as np
import pandas as pd
from nltk import pos_tag
from nltk import pos_tag_sents
from nltk.tokenize import TweetTokenizer
from textblob import TextBlob
from __future__ import division
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, auc
from sklearn.externals import joblib
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt

In [311]:
from nltk.corpus import stopwords
stopset = set(stopwords.words('english'))
def stopword_filtered_word_feats(words):
    return [word for word in words if word not in stopset]

def extract_hash_tags(s):
    return set(part[1:] for part in s.split() if part.startswith('#'))

def extract_http(s):
    return set(part for part in s.split() if part.startswith('http'))

In [334]:
tknzr = TweetTokenizer(strip_handles=True)
analyzer = SentimentIntensityAnalyzer()

def get_features(tweet_str):
    twt_tkns = tknzr.tokenize(tweet_str)
    senti = analyzer.polarity_scores(' '.join(stopword_filtered_word_feats(twt_tkns)))

    VB_count = 0
    PRP_count = 0
    JJ_count = 0
    for word, tag in pos_tag(twt_tkns):
        if tag == 'VB':
            VB_count += 1
        if tag == 'PRP':
            PRP_count += 1
        if tag == 'JJ':
            JJ_count += 1
    return (senti['pos'], senti['neg'], senti['neu'], senti['compound'],
            len(twt_tkns), len(extract_hash_tags(tweet_str)), len(extract_http(tweet_str)),
            VB_count, PRP_count, JJ_count)
            
    

In [360]:
con = psycopg2.connect("dbname='tweetdat' user='andric'")
sql_query = """
            SELECT rt_tweet, rt_count/user_followers_count::float as rt_influence \
            FROM la_tweets \
            WHERE rt_status = 1 AND user_followers_count > 0 \
            ORDER BY rt_influence DESC LIMIT 13;
            """
query_results = pd.read_sql_query(sql_query, con)
if con:
    con.close()

In [361]:
rts_nodups = query_results['rt_tweet'].drop_duplicates()

In [362]:
popular_tweets = rts_nodups.apply(get_features).apply(pd.Series)

In [363]:
popular_tweets.iloc[:10, :]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.0,0.0,1.0,0.0,14,0,0,0,0,0
1,0.0,0.0,1.0,0.0,1,1,0,0,0,0
2,0.0,0.0,1.0,0.0,4,0,0,0,0,0
3,0.0,0.0,1.0,0.0,12,0,1,0,1,0
4,0.213,0.0,0.787,0.4019,17,0,0,1,1,1
6,0.0,0.0,1.0,0.0,6,0,1,0,0,1
7,0.404,0.0,0.596,0.7772,27,0,2,2,3,0
8,0.216,0.196,0.588,-0.2023,17,1,0,0,0,1
9,0.463,0.0,0.537,0.784,12,0,1,0,0,0
12,0.0,0.0,1.0,0.0,26,0,1,1,5,2


In [299]:
print query_results.iloc[66, 0].decode('utf-8')
print TextBlob(query_results.iloc[66, 0].decode('utf-8')).polarity
print analyzer.polarity_scores(query_results.iloc[66, 0].decode('utf-8'))
print analyzer.polarity_scores(query_results.iloc[66, 0])

When I grow up, I don't want to be as big as you, I want to be bigger -👌🏽
0.0
{'neg': 0.149, 'neu': 0.851, 'pos': 0.0, 'compound': -0.1139}
{'neg': 0.149, 'neu': 0.851, 'pos': 0.0, 'compound': -0.1139}


In [235]:
from nltk import pos_tag
from nltk import pos_tag_sents
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer(strip_handles=True)

In [300]:
#tknzr.tokenize(query_results.iloc[1, 0].decode('utf-8'))
tkns = tknzr.tokenize(query_results.iloc[66, 0].decode('utf-8'))
#pos_tag(tkns)
#pos_tag(tknzr.tokenize('@remy: This is waaaaayyyy too much for you!!!!!!'))

In [291]:
from collections import Counter

print len(tkns)
len(tkns) / sum(Counter(tkns).values())

21


1.0

In [304]:
print ' '.join(stopword_filtered_word_feats(tkns))
analyzer.polarity_scores(' '.join(stopword_filtered_word_feats(tkns)))

When I grow , I don't want big , I want bigger - � � � �


{'compound': -0.1139, 'neg': 0.214, 'neu': 0.786, 'pos': 0.0}

In [233]:
tkns

[u'My', u'first', u'cover', u'for', u'with', u'https://t.co/KJkBH1RNIK']

In [100]:
pos_tag(tkns)[1][1]

'DT'

In [60]:
query_results['rt_tweet'][4] == query_results['rt_tweet'][5]

True

In [9]:
analyzer = SentimentIntensityAnalyzer()

In [10]:
for sentence in sentences:
    vs = analyzer.polarity_scores(sentence)
    print("{:-<65} {}".format(sentence, str(vs)))

The plot was good, but the characters are uncompelling and the dialog is not great. {'neg': 0.327, 'neu': 0.579, 'pos': 0.094, 'compound': -0.7042}
A really bad, horrible book.------------------------------------- {'neg': 0.791, 'neu': 0.209, 'pos': 0.0, 'compound': -0.8211}
At least it isn't a horrible book.------------------------------- {'neg': 0.0, 'neu': 0.637, 'pos': 0.363, 'compound': 0.431}


In [172]:
vs['compound']

0.431

In [22]:
analyzer.polarity_scores("@LJFamous i don't really be announcing it sorry lo lmao")

{'compound': 0.5574, 'neg': 0.107, 'neu': 0.574, 'pos': 0.32}

In [26]:
help(analyzer.score_valence)

Help on method score_valence in module vaderSentiment.vaderSentiment:

score_valence(self, sentiments, text) method of vaderSentiment.vaderSentiment.SentimentIntensityAnalyzer instance

