In [33]:
import numpy as np
import pandas as pd

# Imports the Google Cloud client library
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types

In [34]:
# Instantiates a single Google Cloud Natural Language client at module level;
# analyize_sentiment() below reuses it for every request.
# NOTE(review): presumably relies on credentials already configured in the
# environment -- confirm before running on a fresh machine.
client = language.LanguageServiceClient()

In [99]:
# Files
# Training files for the three task variants (A, B, C). The cells below read
# them as tab-separated text with header=None, so the files carry no header row.
FILENAME_A = 'data/semeval_train_A.txt'
FILENAME_B = 'data/semeval_train_B.txt'
FILENAME_C = 'data/semeval_train_C.txt'

# Column names handed to pd.read_csv(names=...) for each file, in column order.
INDEX_A = ['SentimentText', 'Sentiment']
INDEX_B = ['SentimentText', 'topic', 'Sentiment']
# File C uses 'ttext'/'point' where A and B use 'SentimentText'/'Sentiment';
# analyize_sentiment2() reads exactly these names.
INDEX_C = ['ttext', 'topic', 'point']

In [109]:
def ingest(filename, index_names):
    """Load a headerless tab-separated file and print a quick look at it.

    Args:
        filename: Path of the tab-separated file to read.
        index_names: Column names to assign, in column order.

    Returns:
        The loaded DataFrame. As a side effect, the first five rows and the
        ``describe()`` summary are printed between banner lines.
    """
    frame = pd.read_csv(filename, sep="\t", header=None, names=index_names)

    # Each section is rendered lazily so that output appears in the same order
    # as the work is done: banner, then the table beneath it.
    sections = (
        ("============================== Preview ==============================",
         lambda: frame.head(5)),
        ("============================== Summary ==============================",
         frame.describe),
    )
    for banner, render in sections:
        print(banner)
        print(render())
    print("=====================================================================")
    return frame

# Build the request document for the Natural Language API
def analyize_text(text, type_text):
    """Wrap ``text`` in a ``types.Document`` of the given document type.

    Args:
        text: Content to place in the document.
        type_text: Document type value passed through as ``type``.

    Returns:
        The constructed ``types.Document``.
    """
    return types.Document(type=type_text, content=text)

def analyize_sentiment(document):
    """Send ``document`` to the module-level client and return its sentiment.

    Args:
        document: Document object accepted by ``client.analyze_sentiment``.

    Returns:
        The ``document_sentiment`` attribute of the API response.
    """
    response = client.analyze_sentiment(document=document)
    return response.document_sentiment

def score_to_sentiment(sentiment):
    """Bucket a sentiment's ``score`` into a three-way label.

    Args:
        sentiment: Object exposing a numeric ``score`` attribute.

    Returns:
        ``"negative"`` for scores below -0.25, ``"neutral"`` for scores in
        [-0.25, 0.25), and ``"positive"`` otherwise.
    """
    value = sentiment.score
    if value < -0.25:
        return "negative"
    # Anything reaching this point is not below -0.25, so only the upper
    # bound of the neutral band needs checking.
    if value < 0.25:
        return "neutral"
    return "positive"

def rounding_score(sentiment):
    """Double a sentiment's ``score`` and round it to an integer.

    Args:
        sentiment: Object exposing a numeric ``score`` attribute.

    Returns:
        ``int(round(2 * score))``, using the built-in ``round``.
    """
    doubled = sentiment.score * 2
    return int(round(doubled))

def analyize_sentiment1(data):
    """Label every ``SentimentText`` via the sentiment API and print the accuracy.

    Each text is sent as a plain-text document, the returned sentiment is
    bucketed by ``score_to_sentiment`` and stored in a ``predic_sentiment``
    column. The fraction of rows whose prediction equals the ``Sentiment``
    column is printed as ``accuracy``.

    Args:
        data: DataFrame with ``SentimentText`` and ``Sentiment`` columns.

    Returns:
        A new DataFrame holding the input columns plus ``predic_sentiment``.
        The frame passed in is left unmodified.
    """
    def make_sentiment(text):
        document = analyize_text(text, enums.Document.Type.PLAIN_TEXT)
        return score_to_sentiment(analyize_sentiment(document))

    # Work on a copy: writing columns into the caller's frame left a stray
    # 'is_correct' column behind on it and triggers pandas chained-assignment
    # warnings when the caller passes a slice such as df.head(100).
    scored = data.copy()

    # Series.map still makes one API call per row, but unlike a row-wise
    # DataFrame.apply it always yields a Series, including for an empty frame.
    scored['predic_sentiment'] = scored['SentimentText'].map(make_sentiment)

    # Vectorised comparison; the mean of the boolean mask is the accuracy.
    is_correct = scored['Sentiment'] == scored['predic_sentiment']
    print("accuracy", is_correct.mean())
    return scored

def analyize_sentiment2(data):
    """Predict a numeric point for every ``ttext`` and print the mean squared error.

    Each text is sent to the sentiment API as a plain-text document and the
    returned score is doubled to form ``predic_point``. The squared difference
    from the ``point`` column is stored in ``SE`` and its mean printed as ``MSE``.

    Args:
        data: DataFrame with ``ttext`` and numeric ``point`` columns.

    Returns:
        A new DataFrame holding the input columns plus ``predic_point`` and
        ``SE``. The frame passed in is left unmodified.
    """
    def make_sentiment(text):
        document = analyize_text(text, enums.Document.Type.PLAIN_TEXT)
        # NOTE(review): the factor 2 presumably rescales the API score onto
        # the range used by 'point' -- confirm against the task definition.
        return 2 * analyize_sentiment(document).score

    # Work on a copy so the caller's frame is not modified, and so that
    # passing a slice such as df.head(100) does not trigger pandas
    # chained-assignment warnings.
    scored = data.copy()

    # Series.map still makes one API call per row, but unlike a row-wise
    # DataFrame.apply it always yields a Series, including for an empty frame.
    scored['predic_point'] = scored['ttext'].map(make_sentiment)

    # Vectorised squared error instead of a Python-level row loop.
    scored['SE'] = (scored['point'] - scored['predic_point']) ** 2
    print("MSE", scored['SE'].mean())
    return scored

In [97]:
# Score the first 100 rows of file A; `df` ends up bound to the frame
# returned by analyize_sentiment1, which also prints the accuracy.
df = analyize_sentiment1(
    pd.read_csv(FILENAME_A, sep="\t", header=None, names=INDEX_A).head(100)
)

accuracy 0.57


In [100]:
# Score the first 100 rows of file B (same text/label columns as A, plus a
# topic column); `df2` ends up bound to the frame analyize_sentiment1 returns.
df2 = analyize_sentiment1(
    pd.read_csv(FILENAME_B, sep="\t", header=None, names=INDEX_B).head(100)
)

accuracy 0.55


In [112]:
# Score the first 100 rows of file C against its numeric `point` column;
# `df3` ends up bound to the frame analyize_sentiment2 returns, and the MSE
# is printed.
df3 = analyize_sentiment2(
    pd.read_csv(FILENAME_C, sep="\t", header=None, names=INDEX_C).head(100)
)

MSE 0.833600001478


In [113]:
# Show a bounded preview rather than dumping the whole (up to 100-row) frame.
df3.head(10)

Unnamed: 0,ttext,topic,point,predic_point,SE
0,05 Beat it - Michael Jackson - Thriller (25th ...,michael jackson,0,1.4,1.96
1,Jay Z joins Instagram with nostalgic tribute t...,michael jackson,1,1.2,0.04
2,Michael Jackson: Bad 25th Anniversary Edition ...,michael jackson,0,1.2,1.44
3,18th anniv of Princess Diana's death. I still ...,michael jackson,1,0.0,1.00
4,@oridaganjazz The 1st time I heard Michael Jac...,michael jackson,2,0.4,2.56
5,Are you old enough to remember Michael Jackson...,michael jackson,2,0.4,2.56
6,The Weeknd is the closest thing we may get to ...,michael jackson,0,1.0,1.00
7,All of you people who're saying The Weekend is...,michael jackson,0,0.4,0.16
8,@MariahCarey may he R.I.P. Happy Birthday Mich...,michael jackson,1,0.6,0.16
9,I just may be actually done with this award sh...,michael jackson,1,0.0,1.00
