In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
import keras as ks

First we load our dataset

In [2]:
CUR_DIR = Path().absolute()
DATA_DIR = CUR_DIR / "sentiment labelled sentences"

df_list = []

for data_file in DATA_DIR.glob("*.txt"):
    if data_file.name.lower() == "readme.txt": continue
    
    df = pd.read_csv(data_file, names=['sentence', 'label'], sep='\t')
    df['source'] = data_file.name.split("_")[0]  # Add another column filled with the source name
    df_list.append(df)

df = pd.concat(df_list)

Just to verify, we show the data

In [3]:
df

Unnamed: 0,sentence,label,source
0,So there is no way for me to plug it in here i...,0,amazon
1,"Good case, Excellent value.",1,amazon
2,Great for the jawbone.,1,amazon
3,Tied to charger for conversations lasting more...,0,amazon
4,The mic is great.,1,amazon
...,...,...,...
743,I just got bored watching Jessice Lange take h...,0,imdb
744,"Unfortunately, any virtue in this film's produ...",0,imdb
745,"In a word, it is embarrassing.",0,imdb
746,Exceptionally bad!,0,imdb


We now try to use the "Bag of Words" way to extract features - we generate a way to represent presence of works as a vector.

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

feature_extractor = CountVectorizer(min_df=0.0)

feature_extractor.fit(df["sentence"])

feature_extractor.vocabulary_  # Shows us global frequency of words

{'so': 4161,
 'there': 4545,
 'is': 2427,
 'no': 3043,
 'way': 4987,
 'for': 1829,
 'me': 2809,
 'to': 4609,
 'plug': 3387,
 'it': 2432,
 'in': 2314,
 'here': 2158,
 'the': 4531,
 'us': 4835,
 'unless': 4793,
 'go': 1982,
 'by': 647,
 'converter': 1013,
 'good': 1993,
 'case': 713,
 'excellent': 1603,
 'value': 4861,
 'great': 2023,
 'jawbone': 2449,
 'tied': 4589,
 'charger': 768,
 'conversations': 1012,
 'lasting': 2567,
 'more': 2931,
 'than': 4526,
 '45': 45,
 'minutes': 2890,
 'major': 2753,
 'problems': 3482,
 'mic': 2868,
 'have': 2113,
 'jiggle': 2464,
 'get': 1954,
 'line': 2648,
 'up': 4821,
 'right': 3777,
 'decent': 1171,
 'volume': 4928,
 'if': 2282,
 'you': 5138,
 'several': 4004,
 'dozen': 1373,
 'or': 3144,
 'hundred': 2259,
 'contacts': 987,
 'then': 4542,
 'imagine': 2294,
 'fun': 1901,
 'of': 3097,
 'sending': 3968,
 'each': 1440,
 'them': 4538,
 'one': 3124,
 'are': 273,
 'razr': 3611,
 'owner': 3200,
 'must': 2980,
 'this': 4558,
 'needless': 3010,
 'say': 3889,
 '

So each sentence now has a 0-1 valued vector representing which of the words are used in the review.

Now, we try to see which words predict a positive review. First, we need to split the data into training and testing sets

In [5]:
from sklearn.model_selection import train_test_split

sentences_train, sentences_test, y_train, y_test = train_test_split(
    df["sentence"], df["label"], test_size=0.25, random_state=42069,
)

f"{len(sentences_train) = }, {len(sentences_test) = }"

'len(sentences_train) = 2061, len(sentences_test) = 687'

Now we try to train a simple logistic regression model

In [6]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
x_train = feature_extractor.transform(sentences_train)

model.fit(x_train, y_train)

We show the scores for train and test sets

In [7]:
f"{model.score(x_train, y_train) = }"

'model.score(x_train, y_train) = 0.9805919456574479'

In [8]:
x_test = feature_extractor.transform(sentences_test)

f"{model.score(x_test, y_test) = }"

'model.score(x_test, y_test) = 0.8195050946142649'

In [9]:
i = np.eye(N=len(feature_extractor.vocabulary_))

probs = model.predict_proba(i)
_, prob_positive = zip(*probs)


classes = feature_extractor.inverse_transform(i)
classes = [str(c[0]) for c in classes]

prob_mappings = list(zip(classes, prob_positive))
prob_mappings

[('00', 0.43124711365982715),
 ('10', 0.6879812737532196),
 ('100', 0.4920992460384356),
 ('11', 0.40305887545310837),
 ('12', 0.39364227352152803),
 ('13', 0.3683906413842716),
 ('15', 0.6401632115862588),
 ('15g', 0.43124711365982715),
 ('15pm', 0.43124711365982715),
 ('17', 0.4020633263743111),
 ('18', 0.5012925903301431),
 ('18th', 0.4533950106932699),
 ('1928', 0.43124711365982715),
 ('1947', 0.4781027873996329),
 ('1948', 0.43124711365982715),
 ('1949', 0.4583565697588693),
 ('1971', 0.42420319907094617),
 ('1973', 0.46104392629704133),
 ('1979', 0.43124711365982715),
 ('1980', 0.4439429090669932),
 ('1986', 0.3928540790273433),
 ('1995', 0.43124711365982715),
 ('1998', 0.39275368208011907),
 ('20', 0.41185133280321756),
 ('2000', 0.43124711365982715),
 ('2005', 0.5258470700487837),
 ('2006', 0.43124711365982715),
 ('2007', 0.4472607970306439),
 ('20th', 0.4722879566356427),
 ('2160', 0.4013046780562655),
 ('23', 0.491012071091429),
 ('24', 0.45619059725140854),
 ('25', 0.4274175

In [10]:
coefficient_scores = sorted(prob_mappings, key=lambda x: x[1])

# Best 10 words indicating a good review
coefficient_scores[-10:]

[('works', 0.7958313153725932),
 ('delicious', 0.8026238688250893),
 ('best', 0.8030907111321334),
 ('fantastic', 0.8044556210045531),
 ('good', 0.8161418569674062),
 ('awesome', 0.8161835912257326),
 ('nice', 0.829244508998832),
 ('love', 0.8552618882576929),
 ('excellent', 0.8580505383729834),
 ('great', 0.9333362886643681)]

In [11]:
coefficient_scores[:10]

[('not', 0.06694448940774994),
 ('bad', 0.06757972712958861),
 ('worst', 0.09672428409304125),
 ('poor', 0.10004511026331316),
 ('terrible', 0.1374984907188718),
 ('awful', 0.13884099403773553),
 ('then', 0.1501592109160489),
 ('wasn', 0.16104780470019392),
 ('too', 0.1622993730619834),
 ('slow', 0.16698284468473112)]

Trying K means, just for fun

In [23]:
from sklearn.cluster import KMeans

lskmeans = KMeans(n_clusters=5)

predicted_clusters = lskmeans.fit_predict(x_train)


  super()._check_params_vs_input(X, default_n_init=10)


In [25]:
k_means_pred = pd.DataFrame(zip(sentences_train, predicted_clusters), columns=("sentence", "cluster"))
k_means_pred[k_means_pred["cluster"] == 0]

Unnamed: 0,sentence,cluster
0,Also the music by Mark Snow is possibly the be...,0
9,"Anyways, The food was definitely not filling a...",0
11,How this piece of trash was ever released is b...,0
15,This does not fit the Palm Tungsten E2 and it ...,0
34,We made the drive all the way from North Scott...,0
...,...,...
2046,"It was that year, however, that reminded us th...",0
2048,"Both films are terrible, but to the credit of ...",0
2050,The worst was the salmon sashimi.,0
2056,The lighting is just dark enough to set the mood.,0


In [30]:
k_means_pred[k_means_pred["cluster"] == 4]

Unnamed: 0,sentence,cluster
3,The seafood was fresh and generous in portion.,4
4,I came back today since they relocated and sti...,4
5,") some great music, and terrific scenery.",4
8,"The Wife hated her meal (coconut shrimp), and ...",4
16,"This is a bad film, with bad writing, and good...",4
...,...,...
2047,its extremely slow and takes forever to do any...,4
2049,If you want to wait for mediocre food and down...,4
2052,We started with the tuna sashimi which was bro...,4
2059,Used and dirty.,4


We do not know anything, since the data is a sparse vector

In [77]:
from sklearn.neural_network import MLPClassifier

AI = MLPClassifier(hidden_layer_sizes=(3,))

AI.fit(x_train, y_train)
AI.score(x_train, y_train)



0.9985443959243085

In [78]:
AI.score(x_test, y_test)

0.8238719068413392

In [79]:
AI.coefs_

[array([[ 2.19744092e-50, -4.83731659e-50,  1.08158246e-47],
        [ 3.07071617e-01, -2.36103994e-01,  3.61637491e-01],
        [ 1.38842951e-01, -7.71550277e-02,  1.60716899e-01],
        ...,
        [-1.11452372e-48, -7.24957798e-47,  7.28591438e-50],
        [ 2.28990576e-47, -8.22544302e-54, -1.96282303e-49],
        [-1.69884650e-01,  1.45366963e-01, -1.85200989e-01]]),
 array([[ 2.36011533],
        [-1.69531718],
        [ 2.13886606]])]

In [85]:
neuron_coeffs = list(zip(*AI.coefs_[0]))
neuron_coeffs

[(2.1974409202542506e-50,
  0.3070716165478829,
  0.13884295072072408,
  -0.19223227523852654,
  -0.058165976879231376,
  -0.31821214631406003,
  0.7335615171410247,
  -1.8505973181204747e-47,
  -7.379235661734083e-48,
  -0.1050927904581691,
  0.2965726757954303,
  0.11302657428232242,
  -9.000264905088897e-47,
  0.2096679830891569,
  2.0727137798703734e-49,
  0.14995412167304867,
  -0.07125555650610241,
  0.12292134754712888,
  -6.486538831283699e-50,
  0.07194678525179592,
  -0.187838904327053,
  6.946170542691502e-50,
  -0.11965344258386003,
  -0.12838066537943696,
  -2.334830289235548e-50,
  0.3690267669745461,
  -2.6415148002036304e-47,
  0.13282559981925254,
  0.12859948687233286,
  -0.16966640675569103,
  0.22680021996940639,
  0.1773452225451837,
  0.025948605620973755,
  0.04554422859139902,
  -0.15618778219686294,
  -0.1980175387710731,
  0.08449239709869946,
  -0.22323688271952358,
  0.09560011731069114,
  -0.08016829611308321,
  -0.22385565344973987,
  0.1122967311501042,
 

In [90]:
sorted(zip(neuron_coeffs[0], classes))[:15]

[(-0.7630394872147194, 'wasn'),
 (-0.66275010517027, 'stupid'),
 (-0.6538136065196322, 'worst'),
 (-0.6422330125890985, 'not'),
 (-0.604507477666332, 'buying'),
 (-0.5901205619067208, 'return'),
 (-0.5885044463170324, 'average'),
 (-0.5874862326832408, 'poor'),
 (-0.5872603764703184, 'aren'),
 (-0.580792907420269, 'walked'),
 (-0.5785129127684874, 'terrible'),
 (-0.5732961250313777, 'dirty'),
 (-0.5699047228844576, 'unacceptable'),
 (-0.5673632135620347, 'appalling'),
 (-0.5659946048606436, 'shame')]

In [91]:
sorted(zip(neuron_coeffs[0], classes))[-15:]

[(0.6270782098699762, 'loved'),
 (0.6278590585113677, 'perfectly'),
 (0.6536399501908183, 'works'),
 (0.6542170986038114, 'beautiful'),
 (0.6619692874236018, 'nice'),
 (0.6711275719130444, 'happier'),
 (0.6737701984460487, 'perfect'),
 (0.6967352438799213, 'delicious'),
 (0.7026732979912862, 'awesome'),
 (0.7101717071043427, 'love'),
 (0.718937539128906, 'happy'),
 (0.7267619239642661, 'great'),
 (0.7304087290005354, 'fantastic'),
 (0.7335615171410247, '15'),
 (0.7792215661206249, 'fun')]

In [92]:
sorted(zip(neuron_coeffs[1], classes))[:15]

[(-0.9283218407816755, 'excellent'),
 (-0.8954682748572484, 'soundtrack'),
 (-0.7813679903142657, 'fun'),
 (-0.7717332907666229, 'fantastic'),
 (-0.7576919145227834, 'awesome'),
 (-0.7407247446962217, 'great'),
 (-0.7375872174716571, 'love'),
 (-0.725385643530019, 'happier'),
 (-0.7083775329884927, 'loved'),
 (-0.7050949382739558, 'happy'),
 (-0.6956002191476627, 'delicious'),
 (-0.6750923063762575, 'shot'),
 (-0.6583442825324721, 'beautiful'),
 (-0.6545067321687899, 'incredible'),
 (-0.6407549150747353, 'works')]

In [93]:
sorted(zip(neuron_coeffs[1], classes))[-15:]

[(0.6778294106769666, 'sucks'),
 (0.6811804443839085, 'stupid'),
 (0.684365876191325, 'sorry'),
 (0.7061005175790286, 'return'),
 (0.7196937724648249, 'dont'),
 (0.7240942725850775, 'bad'),
 (0.7305131137288031, 'disappointment'),
 (0.7342346971890951, 'junk'),
 (0.7383020140978965, 'waste'),
 (0.7440014142867309, 'not'),
 (0.7553925480429265, 'horrible'),
 (0.7570181843861779, 'disappointing'),
 (0.7573041141001385, 'worst'),
 (0.836770217173447, 'awful'),
 (0.8513160476013303, 'poor')]

In [94]:
sorted(zip(neuron_coeffs[2], classes))[:15]

[(-0.7791199621609849, 'wasn'),
 (-0.7132609758298067, 'not'),
 (-0.6939538169546453, 'stupid'),
 (-0.6523780422610894, 'average'),
 (-0.6434889430060745, 'terrible'),
 (-0.610620826022754, 'worst'),
 (-0.6053038415843243, 'unacceptable'),
 (-0.6040168627847691, 'shame'),
 (-0.6032820067147971, 'dirty'),
 (-0.6026693938329858, 'return'),
 (-0.6016884626546397, 'appalling'),
 (-0.601585987898355, 'difficult'),
 (-0.6009222137591077, 'buying'),
 (-0.5973578602431071, 'barking'),
 (-0.5920898686730958, 'boring')]

In [95]:
sorted(zip(neuron_coeffs[2], classes))[-15:]

[(0.677164422272534, 'perfectly'),
 (0.6814547431676861, '15'),
 (0.7053724454331985, 'perfect'),
 (0.7150998384609235, 'beautiful'),
 (0.7213270155167661, 'nice'),
 (0.7230344181235405, 'works'),
 (0.7339858285281635, 'delicious'),
 (0.7500294420499851, 'loved'),
 (0.7668629156288207, 'happy'),
 (0.7701231901702097, 'fantastic'),
 (0.782340433205249, 'love'),
 (0.7896734219922849, 'fun'),
 (0.7974092092604798, 'awesome'),
 (0.8020018282403096, 'great'),
 (0.8048940907703612, 'happier')]