In [2]:
import csv
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pickle

In [3]:
def read_data(filename='text_emotion.csv'):
    X = []
    y = []
    with open(filename, newline='') as f:
        reader = csv.reader(f)
        next(reader, None)
        for i,data in enumerate(reader):
            y.append(data[1])
            X.append(data[3])
    
    #Process the texts a bit.   
    #remove punctuation and make lower.
    #X = [x.lower() for x in X]
    
    return X, y

In [4]:
def combine_data(X, y):
  new_X, new_y = [], []
  for data in zip(X, y):
    if (data[1] == 'enthusiasm' or data[1] == 'surprise' or data[1] == 'love' or data[1] == 'fun' 
        or data[1] == 'happiness' or data[1] == 'relief'):
      new_X.append(data[0])
      new_y.append('happy')
    elif data[1] == 'boredom' or data[1] == 'neutral' or data[1] == 'empty':
      new_X.append(data[0])
      new_y.append('neutral')
    elif data[1] == 'sadness' or data[1] == 'worry':
      new_X.append(data[0])
      new_y.append('sad')
    else:
      new_X.append(data[0])
      new_y.append(data[1])
  return new_X, new_y
    

In [11]:
X, y = read_data()
# Combine some of the labels together
X_5, y_5 = combine_data(X, y)
# Split the data into test and train
# X_train, X_test, y_train, y_test = train_test_split(X_5, y_5, test_size=0.33, random_state=42)
# X13_train, X13_test, y13_train, y13_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [12]:
def train_classifier(X, y):
    cls = LogisticRegression(multi_class='multinomial', solver='newton-cg')
    cls.fit(X, y)
    return cls
def evaluate(X, yt, cls):
    yp = cls.predict(X)
    acc = metrics.accuracy_score(yt, yp)
    return acc

In [13]:
tfidf = TfidfVectorizer(ngram_range=(1, 2))
tfidf.fit(X)
X_train = tfidf.transform(X)
# X_test = tfidf.transform(X_test)
X_train5 = tfidf.transform(X_5)

In [14]:
len(X_5)

40000

In [15]:
cls_5 = train_classifier(X_train5, y_5)

In [31]:
dummy_sent = "I am so happy. I love life. Unicorn and rainbows"

# cls_5
# tfidf

# Get the vector encoding
sent_vec = tfidf.transform([dummy_sent])

pred = cls_5.predict(sent_vec)[0]

weights = {}

for index,rep in zip(sent_vec.indices,
        tfidf.inverse_transform(sent_vec)[0]):
    class_weights = []
    num_classes = len(cls_5.classes_)
    for i in range(num_classes):
        class_weights.append(cls_5.coef_[i][index])
    weights[str(rep)] = class_weights
                                                               
print(sent_vec)
print(pred)
print(weights)

  (0, 250178)	0.42424824163067487
  (0, 211783)	0.2997900508254158
  (0, 211477)	0.13345411573051646
  (0, 191172)	0.42424824163067487
  (0, 139769)	0.37550408840311744
  (0, 139567)	0.1650898023631214
  (0, 134154)	0.2202954380216786
  (0, 99237)	0.40847187063705415
  (0, 99125)	0.1681461297497927
  (0, 14469)	0.10846722875622156
  (0, 12417)	0.2792840497897836
  (0, 12069)	0.17512175727111215
happy
{'unicorn': [-0.0008137249193864325, -0.0663366584526803, -0.00961673839648458, 0.12500319972352894, -0.0482360779549776], 'so happy': [-0.015375264264068832, 1.1845504875791746, 0.012114244537194408, -0.6444122960783951, -0.5368771717739048], 'so': [0.030820358223296415, 0.4271256907788574, 1.2808979936528215, -1.9092149862854606, 0.17037094363047459], 'rainbows': [-0.0007751144821914532, -0.04426838170316526, -0.009399509469283522, 0.10813095828361487, -0.053687952628974654], 'love life': [0.22031931228345703, -0.05020169081102888, -0.019896624815323827, -0.11875087462787952, -0.03147012

In [16]:
acc = evaluate(X_train5, y_5, cls_5)
acc

0.85445

In [17]:
cats = set()
for label in y_train:
  cats.add(label)
cats

NameError: name 'y_train' is not defined

In [18]:
len(cats)

0

In [19]:
cls_13 = train_classifier(X_train, y)

In [20]:
acc = evaluate(X_train, y, cls_13)
print(acc)

0.656025


In [21]:
cats = set()
for label in y13_train:
  cats.add(label)
print(cats)
print(len(cats))

NameError: name 'y13_train' is not defined

In [None]:
filename = 'finalized_model_13.pkl'
pickle.dump(cls_13, open(filename, 'wb'))
filename = 'finalized_model_5.pkl'
pickle.dump(cls_5, open(filename, 'wb'))
filename = 'sentiment_weights_model2.pkl'
pickle.dump(tfidf, open(filename, 'wb'))

In [None]:
# visualize the data
from collections import Counter
import matplotlib.pyplot as plt

In [None]:
c = dict(Counter(y_5))
x = c.keys()
vals = c.values()
x_range = [i+1 for i in range(len(x))]

In [None]:
plt.bar(x_range, vals, color='red')
plt.xlabel('category')
plt.ylabel('counts')
plt.title('Counts of the categories')
plt.xticks(x_range, x, rotation='vertical')
plt.show()