In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy.spatial import distance

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score


In [None]:
# Category -> words
# Labelled vocabulary: column names are supplied explicitly, so the first
# line of the file is read as data rather than as a header.
df = pd.read_csv(
    'finalDataSetYork.csv',
    sep=',',
    encoding='utf-8',
    names=["word", "sentiment"],
)
df.head(100)  # evaluated but not rendered: only a cell's last expression is displayed
df.shape




In [None]:
# Load the whole embedding matrix

# Maps each token in the embedding file to its vector (1-D float64 array).
embeddings_index = {}
# `with` guarantees the file is closed even if a line fails to parse. The
# encoding is pinned so the result does not depend on the platform default
# (assumes the GloVe text file is UTF-8 -- TODO confirm for other sources).
with open('glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Tolerate blank lines instead of failing on values[0].
            continue
        word = values[0]
        # Parse the numeric fields directly. np.float64 is what the removed
        # `np.float` alias resolved to, so the stored dtype is unchanged.
        embeddings_index[word] = np.asarray(values[1:], dtype=np.float64)
print('Loaded %s word vectors.' % len(embeddings_index))


In [None]:
# Embeddings for available words


import numpy
# Embeddings for available words
# data_embeddings: lowercase word -> vector, for labelled words that have an embedding.
# unknown_words:   labelled words (original spelling) with no embedding.
# count:           number of rows whose word had an embedding (duplicates included).
data_embeddings = {}
unknown_words = []
count = 0
for word in df["word"].tolist():
    lowered = word.lower()
    if lowered not in embeddings_index:
        unknown_words.append(word)
        continue
    count += 1
    if lowered in data_embeddings:
        # Already stored once: report the duplicate instead of overwriting.
        print(lowered)
    else:
        data_embeddings[lowered] = embeddings_index[lowered]

print('Key Categories %s word vectors.' % len(df["word"]))
print('Final Embedding %s word vectors.' % len(data_embeddings))


In [None]:
import pickle

# Cache both lookup tables on disk so later sessions can skip re-parsing
# the embedding text file.
for pickle_path, payload in (
    ('data_embedding.pickle', data_embeddings),
    ('embeddings_index.pickle', embeddings_index),
):
    with open(pickle_path, 'wb') as handle:
        pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [None]:
import pickle
# Restore the cached lookup tables.
# NOTE(review): pickle.load can execute arbitrary code -- only load files
# that this notebook itself produced.
with open('data_embedding.pickle', 'rb') as data_handle:
    data_embeddings = pickle.load(data_handle)

with open('embeddings_index.pickle', 'rb') as index_handle:
    embeddings_index = pickle.load(index_handle)


In [None]:
import operator
from scipy.spatial import distance
# Processing the query
unknownWords = []
def _sentiment_lookup():
    """Map each labelled word in `df` to its sentiment (first occurrence wins).

    Exact spellings are registered first; lowercase aliases are then added
    only where no exact entry exists, because `data_embeddings` is keyed by
    lowercased words while `df["word"]` keeps the original casing.
    """
    pairs = list(zip(df["word"], df["sentiment"]))
    lookup = {}
    for word, sentiment in pairs:
        lookup.setdefault(word, sentiment)
    for word, sentiment in pairs:
        if isinstance(word, str):
            lookup.setdefault(word.lower(), sentiment)
    return lookup


def predictions(query):
    """Predict the sentiment category of a single word.

    The query's embedding is compared (Euclidean distance) with the
    embedding of every labelled word in `data_embeddings`; distances are
    averaged per sentiment category and the category with the smallest
    mean distance is returned. The 'unknown' category is never returned
    as a nearest match.

    Parameters
    ----------
    query : str
        Word to classify. Looked up in `embeddings_index` as given, then
        lowercased as a fallback (the labelled words are stored lowercased).

    Returns
    -------
    The winning sentiment label, or the string "unknown" when the query has
    no embedding or there is no candidate category to compare against.
    """
    try:
        query_embed = embeddings_index[query]
    except (KeyError, TypeError):
        # TypeError covers unhashable queries, which previously also ended
        # up as "unknown" via the bare except.
        query_embed = None
        if isinstance(query, str):
            query_embed = embeddings_index.get(query.lower())
    if query_embed is None:
        print(query, " - unknown")
        return "unknown"

    # Built once per call instead of scanning the DataFrame for every word.
    sentiment_of = _sentiment_lookup()

    distance_totals = {}
    distance_counts = {}
    for word, embed in data_embeddings.items():
        if word not in sentiment_of:
            # No label for this embedding (e.g. stale cache): nothing to score.
            continue
        category = sentiment_of[word]
        dist = distance.euclidean(query_embed, embed)
        distance_totals[category] = distance_totals.get(category, 0) + dist
        distance_counts[category] = distance_counts.get(category, 0) + 1

    scores = {
        category: total / distance_counts[category]
        for category, total in distance_totals.items()
    }
    # 'unknown' is a label in the data, not a meaningful nearest category.
    scores.pop('unknown', None)
    if not scores:
        return "unknown"
    return min(scores.items(), key=operator.itemgetter(1))[0]

In [None]:
# Testing

# Spot-check the classifier on a handful of words; order (and the repeated
# 'high') is kept so the printed output matches line for line.
sample_queries = [
    'ecstatic', 'gloomy', 'somber', 'dull', 'tedious', 'sick', 'cry',
    'victory', 'kick', 'angry', 'high', 'climb', 'crime', 'high',
    'grounded', 'flabbergasted',
]
for sample in sample_queries:
    print(predictions(sample))



In [None]:
# One-off check of a single word.
print(predictions(query='insulted'))

In [None]:
from sklearn.preprocessing import LabelEncoder
# Integer-encode the sentiment labels on the full frame ...
labelencoder = LabelEncoder()
df['category_encoded'] = labelencoder.fit_transform(df['sentiment'])

# ... and build a small sentiment -> code table by re-fitting the same
# encoder on the distinct labels.
category_df = pd.DataFrame({'sentiment': df['sentiment'].unique()}).assign(
    emotion_category=lambda frame: labelencoder.fit_transform(frame['sentiment'])
)
category_df

In [None]:
# Plain Python lists of the evaluation inputs and their encoded true labels.
words, categories = (df[column].tolist() for column in ('word', 'category_encoded'))



In [None]:
# `predictions` is the classifier function at this point, but the cells that
# follow expect the same name to hold the list of predicted label codes.
# Keep a separate handle on the function so the loop never calls a list;
# the guard lets this cell be re-run after the name has been rebound.
if callable(predictions):
    predict_category = predictions

# sentiment label -> integer code, built once instead of filtering
# category_df for every word.
label_codes = dict(zip(category_df['sentiment'], category_df['emotion_category']))

predicted_codes = []
for word in words:
    category = predict_category(word)
    if category == "unknown":
        print(word)
    # Raises KeyError if the predicted label has no code in category_df.
    predicted_codes.append(label_codes[category])

# Rebind only after the loop has finished (consumed by the cells below).
predictions = predicted_codes
 

In [None]:
# (word, true code, predicted code) triples, one per evaluated word.
zip_List = [
    (word, actual, predicted)
    for word, actual, predicted in zip(words, categories, predictions)
]
# Each triple paired once more with its predicted code.
zip_List2 = [(triple, predicted) for triple, predicted in zip(zip_List, predictions)]


In [None]:
# Tabulate the per-word results and export them for manual inspection.
result_columns = ['word', 'sentiment', 'prediction']
df_new = pd.DataFrame(data=zip_List, columns=result_columns)
df_new  # not rendered: to_excel below is the cell's last expression
df_new.to_excel(excel_writer="output_CheckIn_withEucledianDist_glove_840B_300d_finalDataSetYork1.xlsx")

In [None]:
from sklearn import metrics
cm = metrics.confusion_matrix(categories, predictions)

# Row/column captions for the matrix, listed once and reused for both axes.
class_names = ['Angry', 'Anxiety/Stress', 'Fearful', 'Happy', 'Mixed/Unsure', 'Neutral',
               'Other', 'Peaceful', 'Physical', 'Sad', 'Sleep', 'unknown']
pd.DataFrame(
    cm,
    index=[['actual'] * len(class_names), class_names],
    columns=[['predicted'] * len(class_names), class_names],
)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
def get_metrics(y_test, y_predicted):
    """Return (accuracy, precision, recall, f1) for the given labels.

    Precision, recall and F1 are computed with average='weighted';
    accuracy is the plain fraction of matching labels.
    """
    weighted = {'pos_label': None, 'average': 'weighted'}
    # precision: TP / (TP + FP); recall: TP / (TP + FN);
    # f1: harmonic mean of precision and recall.
    precision, recall, f1 = (
        scorer(y_test, y_predicted, **weighted)
        for scorer in (precision_score, recall_score, f1_score)
    )
    # accuracy: (TP + TN) / total
    accuracy = accuracy_score(y_test, y_predicted)
    return accuracy, precision, recall, f1

In [None]:
# Score the encoded predictions against the encoded true labels and print a
# one-line summary.
metric_values = get_metrics(categories, predictions)
accuracy, precision, recall, f1 = metric_values
summary_format = "accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f"
print(summary_format % (accuracy, precision, recall, f1))