In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
import keras as ks

First we load our dataset

In [2]:
CUR_DIR = Path().absolute()
DATA_DIR = CUR_DIR / "sentiment labelled sentences"

df_list = []

for data_file in DATA_DIR.glob("*.txt"):
    if data_file.name.lower() == "readme.txt": continue
    
    df = pd.read_csv(data_file, names=['sentence', 'label'], sep='\t')
    df['source'] = data_file.name.split("_")[0]  # Add another column filled with the source name
    df_list.append(df)

df = pd.concat(df_list)

Just to verify, we show the data

In [3]:
df

Unnamed: 0,sentence,label,source
0,So there is no way for me to plug it in here i...,0,amazon
1,"Good case, Excellent value.",1,amazon
2,Great for the jawbone.,1,amazon
3,Tied to charger for conversations lasting more...,0,amazon
4,The mic is great.,1,amazon
...,...,...,...
743,I just got bored watching Jessice Lange take h...,0,imdb
744,"Unfortunately, any virtue in this film's produ...",0,imdb
745,"In a word, it is embarrassing.",0,imdb
746,Exceptionally bad!,0,imdb


We now try to use the "Bag of Words" way to extract features - we generate a way to represent presence of works as a vector.

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

feature_extractor = CountVectorizer(min_df=0.0)

feature_extractor.fit(df["sentence"])

feature_extractor.vocabulary_  # Shows us global frequency of words

{'so': 4161,
 'there': 4545,
 'is': 2427,
 'no': 3043,
 'way': 4987,
 'for': 1829,
 'me': 2809,
 'to': 4609,
 'plug': 3387,
 'it': 2432,
 'in': 2314,
 'here': 2158,
 'the': 4531,
 'us': 4835,
 'unless': 4793,
 'go': 1982,
 'by': 647,
 'converter': 1013,
 'good': 1993,
 'case': 713,
 'excellent': 1603,
 'value': 4861,
 'great': 2023,
 'jawbone': 2449,
 'tied': 4589,
 'charger': 768,
 'conversations': 1012,
 'lasting': 2567,
 'more': 2931,
 'than': 4526,
 '45': 45,
 'minutes': 2890,
 'major': 2753,
 'problems': 3482,
 'mic': 2868,
 'have': 2113,
 'jiggle': 2464,
 'get': 1954,
 'line': 2648,
 'up': 4821,
 'right': 3777,
 'decent': 1171,
 'volume': 4928,
 'if': 2282,
 'you': 5138,
 'several': 4004,
 'dozen': 1373,
 'or': 3144,
 'hundred': 2259,
 'contacts': 987,
 'then': 4542,
 'imagine': 2294,
 'fun': 1901,
 'of': 3097,
 'sending': 3968,
 'each': 1440,
 'them': 4538,
 'one': 3124,
 'are': 273,
 'razr': 3611,
 'owner': 3200,
 'must': 2980,
 'this': 4558,
 'needless': 3010,
 'say': 3889,
 '

So each sentence now has a 0-1 valued vector representing which of the words are used in the review.

Now, we try to see which words predict a positive review. First, we need to split the data into training and testing sets

In [5]:
from sklearn.model_selection import train_test_split

sentences_train, sentences_test, y_train, y_test = train_test_split(
    df["sentence"], df["label"], test_size=0.25, random_state=42069,
)

f"{len(sentences_train) = }, {len(sentences_test) = }"

'len(sentences_train) = 2061, len(sentences_test) = 687'

Now we try to train a simple logistic regression model

In [6]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
x_train = feature_extractor.transform(sentences_train)

model.fit(x_train, y_train)

We show the scores for train and test sets

In [7]:
f"{model.score(x_train, y_train) = }"

'model.score(x_train, y_train) = 0.9805919456574479'

In [8]:
x_test = feature_extractor.transform(sentences_test)

f"{model.score(x_test, y_test) = }"

'model.score(x_test, y_test) = 0.8195050946142649'

In [49]:
i = np.eye(N=len(feature_extractor.vocabulary_))

probs = model.predict_proba(i)
_, prob_positive = zip(*probs)


classes = feature_extractor.inverse_transform(i)
classes = [str(c[0]) for c in classes]

prob_mappings = list(zip(classes, prob_positive))
prob_mappings

[('00', 0.43124711365982715),
 ('10', 0.6879812737532196),
 ('100', 0.4920992460384356),
 ('11', 0.40305887545310837),
 ('12', 0.39364227352152803),
 ('13', 0.3683906413842716),
 ('15', 0.6401632115862588),
 ('15g', 0.43124711365982715),
 ('15pm', 0.43124711365982715),
 ('17', 0.4020633263743111),
 ('18', 0.5012925903301431),
 ('18th', 0.4533950106932699),
 ('1928', 0.43124711365982715),
 ('1947', 0.4781027873996329),
 ('1948', 0.43124711365982715),
 ('1949', 0.4583565697588693),
 ('1971', 0.42420319907094617),
 ('1973', 0.46104392629704133),
 ('1979', 0.43124711365982715),
 ('1980', 0.4439429090669932),
 ('1986', 0.3928540790273433),
 ('1995', 0.43124711365982715),
 ('1998', 0.39275368208011907),
 ('20', 0.41185133280321756),
 ('2000', 0.43124711365982715),
 ('2005', 0.5258470700487837),
 ('2006', 0.43124711365982715),
 ('2007', 0.4472607970306439),
 ('20th', 0.4722879566356427),
 ('2160', 0.4013046780562655),
 ('23', 0.491012071091429),
 ('24', 0.45619059725140854),
 ('25', 0.4274175

In [50]:
coefficient_scores = sorted(prob_mappings, key=lambda x: x[1])

# Best 10 words indicating a good review
coefficient_scores[-10:]

[('works', 0.7958313153725932),
 ('delicious', 0.8026238688250893),
 ('best', 0.8030907111321334),
 ('fantastic', 0.8044556210045531),
 ('good', 0.8161418569674062),
 ('awesome', 0.8161835912257326),
 ('nice', 0.829244508998832),
 ('love', 0.8552618882576929),
 ('excellent', 0.8580505383729834),
 ('great', 0.9333362886643681)]

In [51]:
coefficient_scores[:10]

[('not', 0.06694448940774994),
 ('bad', 0.06757972712958861),
 ('worst', 0.09672428409304125),
 ('poor', 0.10004511026331316),
 ('terrible', 0.1374984907188718),
 ('awful', 0.13884099403773553),
 ('then', 0.1501592109160489),
 ('wasn', 0.16104780470019392),
 ('too', 0.1622993730619834),
 ('slow', 0.16698284468473112)]