# Data fetching and pre processing

In [80]:
import psycopg2

In [81]:
con = psycopg2.connect(database="twitter", host="localhost", port="5432", user="postgres")

In [82]:
print(con)

<connection object at 0x12e488180; dsn: 'dbname=twitter user=postgres host=localhost port=5432', closed: 0>


In [83]:
cursor = con.cursor()

In [90]:
get_data = "SELECT * from public.bangalore_tweets limit 1000"

In [91]:
cursor.execute(get_data)

In [92]:
resultset = cursor.fetchall()

In [93]:
tweet_list = []

In [94]:
import json

In [95]:
for row in resultset:
    tweet = {}
    json_tweet = json.loads(json.dumps(row[2][0]))    
    if json_tweet.has_key("contributors"):
        tweet['text'] = json_tweet["text"]
    elif json_tweet.has_key("retweeted_status"):
        tweet['text'] = json_tweet["retweeted_status"]["text"]
    else:
        continue
    tweet['text'] = row[2][0]['text']
    tweet['trend'] = row[3]
    tweet['date'] = row[4]
    tweet['hashtags'] = [ht['text'] for ht in row[2][0]['entities']['hashtags']]
    
    tweet_list.append(tweet)

In [96]:
len(tweet_list)

1000

In [97]:
print tweet_list[0],tweet_list[1]

{'trend': 'NaMo', 'text': u'#elections2014 yash for #NaMo http://t.co/v7W39Gzy9x', 'hashtags': [u'elections2014', u'NaMo'], 'date': datetime.date(2014, 4, 17)} {'trend': 'Karnataka', 'text': u'#Karnataka Jayapal is new Chief Executive of CEMILAC: P. Jayapal has taken charge as the new Chief Executive of... http://t.co/OR4aa9mrhK', 'hashtags': [u'Karnataka'], 'date': datetime.date(2014, 4, 17)}


In [98]:
import nltk
import re
from nltk.corpus import stopwords

In [99]:
def cleaner(sentence):
    words = sentence.split()
    word_list = [re.sub('http.*', '', word) for word in words if not word in stopwords.words("english")]
    return ' '.join(word_list)

In [100]:
def join_lines(sentence):
    sentence = re.sub('\n','.',sentence)
    return sentence.encode('ascii','ignore')
    

In [101]:
for tweet in tweet_list:
    try:
        tweet['text'] = join_lines(tweet['text'])
        tweet['clean_text'] = cleaner(tweet['text'])
    except Exception as e:
        continue

In [102]:
tweet_list[0]

{'clean_text': '#elections2014 yash #NaMo ',
 'date': datetime.date(2014, 4, 17),
 'hashtags': [u'elections2014', u'NaMo'],
 'text': '#elections2014 yash for #NaMo http://t.co/v7W39Gzy9x',
 'trend': 'NaMo'}

In [103]:
import pandas as pd

In [104]:
df = pd.DataFrame(tweet_list)

In [105]:
df.shape

(1000, 5)

In [106]:
import csv

In [107]:
df.to_csv("tweets_mini.csv",encoding='utf-8',quoting=csv.QUOTE_MINIMAL,doublequote=False,escapechar="\\",sep="\t")

In [108]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import csv

In [109]:
tweets_df = pd.read_csv("tweets_mini.csv",sep="\t",encoding='utf-8')

# Calculating TFIDF 

In [110]:
vectorizer = TfidfVectorizer(decode_error='ignore', norm='l2')
X = vectorizer.fit_transform(df['clean_text'])

In [111]:
column_labels = vectorizer.get_feature_names()

In [112]:
from scipy.sparse import csc_matrix

In [113]:
column_matrix = csc_matrix(X)

In [114]:
tfidf_matrix = pd.DataFrame(column_matrix.toarray(),columns=column_labels)

In [118]:
yuvraj = tfidf_matrix['yuvrajsingh'].copy()
yuv = pd.Series(yuvraj, copy=True)
x = yuv.sort(ascending=False)

In [119]:
column_max_list = []

In [120]:
for i,column in tfidf_matrix.iteritems():
    column_max_list.append(column.max())
    

In [121]:
column_max_df = pd.DataFrame([column_max_list,column_labels])

In [122]:
column_max_df = column_max_df.transpose()

In [123]:
tfidf_sorted = column_max_df.sort(columns=[0],ascending=False)

In [124]:
tfidf_sorted[0][tfidf_sorted[0] == 1] = 0.99


In [126]:
top_tfidf = tfidf_sorted[:50]

# Insert term and tweets into DB

In [127]:
for term in top_tfidf[1]:
    related_tweet_list = []
    for tweet in tweet_list:
        if tweet['clean_text'].find(term) != -1: 
            related_tweet_list.append(tweet['text'])
            
    query = "INSERT INTO TERM_TWEETS VALUES(%s,%s)"
    cursor.execute(query,(term,related_tweet_list))
    con.commit()
            

# Building Sentiment Analyser model

In [128]:
import os

In [129]:
NEG_DIRECTORY = os.path.join('twitter_neg')
POS_DIRECTORY = os.path.join('twitter_pos')

In [130]:
pos_reviews = []
neg_reviews = []

for pos_file in os.listdir(POS_DIRECTORY):
    fileName = os.path.join(POS_DIRECTORY,pos_file)
    with open(fileName, 'r') as posFile:
        pos_reviews.extend(posFile.readlines())
        
for neg_file in os.listdir(NEG_DIRECTORY):
    fileName = os.path.join(NEG_DIRECTORY,neg_file)
    with open(fileName, 'r') as negFile:
        neg_reviews.extend(negFile.readlines())
        

        


In [131]:
import numpy as np

In [132]:
#building a numpy array by setting postive as 1 and negative as 0
concatenated_array = np.concatenate((np.ones(len(pos_reviews)),np.zeros(len(neg_reviews))))

In [133]:
pos_reviews_cleaned = []
neg_reviews_cleaned = []

#Do some very minor text preprocessing
def cleanText(corpus):
    punctuation = """.,?!:;(){}[]"""
    corpus = [z.lower().replace('\n','') for z in corpus]
    corpus = [z.replace('<br />', ' ') for z in corpus]

    #treat punctuation as individual words
    for c in punctuation:
        corpus = [z.replace(c, ' %s '%c) for z in corpus]
    corpus = [z.split() for z in corpus]
    return corpus

pos_reviews_cleaned = cleanText(pos_reviews)
neg_reviews_cleaned = cleanText(neg_reviews)

 
    

In [134]:
from sklearn.cross_validation import train_test_split
x_train,x_test,pos_neg_train,pos_neg_test = train_test_split(np.concatenate((pos_reviews_cleaned,neg_reviews_cleaned)),concatenated_array,test_size=0.2)

In [135]:
import gensim
LabeledSentence = gensim.models.doc2vec.LabeledSentence

In [136]:
def labelizeReviews(reviews,label_type):
    labelized = []
    for i,v in enumerate(reviews):
        if len(v) ==0:
            continue
        label = '%s_%s'%(label_type,i)
        labelized.append(LabeledSentence(v,[label]))
    return labelized

    

In [139]:
x_train_label = labelizeReviews(x_train,'TRAIN')
x_test_label = labelizeReviews(x_test,'TEST')
unsup_reviews = []
for tweet in tweet_list:
    unsup_reviews.append(tweet['clean_text'])

unsup_lable_reviews = labelizeReviews(unsup_reviews,'UNSUP')

# We have labelized reviews, now running DBOW/DM to build the model"

In [140]:
import random

In [141]:
size = 300

model_dm = gensim.models.Doc2Vec(min_count=1,window=10,size=size,sample=1e-3,negative=5,workers=4)
model_dbow = gensim.models.Doc2Vec(min_count=1,window=10,size=size,sample=1e-3,negative=5,workers=4,dm=0)

In [142]:
model_dm.build_vocab(np.concatenate((x_train_label,x_test_label,unsup_lable_reviews)))
model_dbow.build_vocab(np.concatenate((x_train_label,x_test_label,unsup_lable_reviews)))

In [143]:
all_train_reviews = np.concatenate((x_train_label,unsup_lable_reviews))

In [144]:
all_train_reviews.shape[0]

9530

In [145]:
for epoch in range(10):
    perm = np.random.permutation(all_train_reviews.shape[0])
    model_dm.train(all_train_reviews[perm])
    model_dbow.train(all_train_reviews[perm])

In [146]:
def getVecs(model,corpus,size):
    vecs = [np.array(model[z.labels[0]]).reshape((1,size)) for z in corpus]
    return np.concatenate(vecs)

In [148]:
train_vecs_dm = getVecs(model_dm,x_train_label,size)
train_vecs_dbow = getVecs(model_dbow,x_train_label,size)

In [149]:
train_vecs = np.hstack((train_vecs_dm,train_vecs_dbow))

In [150]:
train_vecs.shape

(8530, 600)

In [151]:
x_test_label = np.array(x_test_label)

In [152]:
for epoch in range(10):
    perm = np.random.permutation(x_test_label.shape[0])
    model_dm.train(x_test_label[perm])
    model_dbow.train(x_test_label[perm])
    

In [153]:
test_vecs_dm = getVecs(model_dm,x_test_label,size)
test_vecs_dbow = getVecs(model_dbow,x_test_label,size)

In [154]:
test_vecs = np.hstack((test_vecs_dm,test_vecs_dbow))

In [155]:
pos_neg_test.shape

(2133,)

# We have all the vectors now, we have to train the classifer

In [156]:
from sklearn.linear_model import SGDClassifier

In [157]:
lrl1 = SGDClassifier(loss='log',penalty='l1')
lrl1.fit(train_vecs,pos_neg_train)


SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0,
       fit_intercept=True, l1_ratio=0.15, learning_rate='optimal',
       loss='log', n_iter=5, n_jobs=1, penalty='l1', power_t=0.5,
       random_state=None, shuffle=False, verbose=0, warm_start=False)

In [158]:
print 'Test Accuracy : %.2f' %lrl1.score(test_vecs,pos_neg_test)

Test Accuracy : 0.62


In [159]:
lrl2 = SGDClassifier(loss='log',penalty='l2')
lrl2.fit(train_vecs,pos_neg_train)
print 'Test Accuracy : %.2f' %lrl2.score(test_vecs,pos_neg_test)

Test Accuracy : 0.61


In [160]:
from sklearn.ensemble import RandomForestClassifier

In [161]:
rfc = RandomForestClassifier(n_estimators=20)
rfc.fit(train_vecs,pos_neg_train)
print 'Test Accuracy: %.2f' %rfc.score(test_vecs,pos_neg_test)

Test Accuracy: 0.52


# Vectorizing Data and Predict

In [162]:
unsup_dm = getVecs(model_dm,unsup_lable_reviews,size)
unsup_dbow = getVecs(model_dbow, unsup_lable_reviews, size)

In [163]:
unsup_vecs = np.hstack((unsup_dm, unsup_dbow))

In [164]:
unsup_label_predict = lrl2.predict(unsup_vecs)

In [165]:
tweets = []
clear_tweets = []
for tweet in tweet_list:
    tweets.append(tweet['text'])
    clear_tweets.append(tweet['clean_text'])

In [166]:
predict_df = pd.DataFrame({'tweet':tweets,'prediction':unsup_label_predict})

In [167]:
def buildLabelVectorAndPredict(tweets):
    test_vector_lable_reviews = labelizeReviews(tweets[0],'UNSUP')
    test_vector_dm = getVecs(model_dm,test_vector_lable_reviews,size)
    test_vector_dbow = getVecs(model_dbow, test_vector_lable_reviews, size)
    test_vector_set = np.hstack((test_vector_dm,test_vector_dbow))
    return lrl2.predict(test_vector_set)
    

In [168]:
trend_sentiment = {}
for trend in top_tfidf[1]:
    getQuery = "SELECT tweets from term_tweets where term=%s"
    cursor.execute(getQuery,(trend,))
    related_tweets = cursor.fetchone()
    if len(related_tweets[0]) < 5:
        continue
    predicted_score = buildLabelVectorAndPredict(related_tweets)
    df_predicted_result = pd.DataFrame(predicted_score, columns=['predict'])
    x = df_predicted_result.predict.value_counts()
    trend_sentiment[trend] = float(1*x[1])/ float(x[0]+x[1])



In [169]:
final_df = pd.DataFrame.from_dict(trend_sentiment,orient="index")

In [170]:
final_df.sort(columns=[0],ascending=False)

Unnamed: 0,0
he,0.979827
lore,0.972028
go,0.954545
ls,0.945205
now,0.931034
elect,0.931034
voted,0.928571
film,0.925926
kannakeepcalm,0.923077
rm,0.918367
